Compare commits


3 Commits

Author         SHA1         Message                          Date
Michael Yang   34ae8077d1   wip: write tensors in parallel   2025-04-25 13:39:12 -07:00
Michael Yang   b0f28d178a   default max term height          2025-04-25 12:54:07 -07:00
Michael Yang   588a97dbef   create blobs in parallel         2025-04-25 12:54:07 -07:00
52 changed files with 367 additions and 1838 deletions

View File

@@ -21,16 +21,14 @@
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
"CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
"CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120"
}
},
{

View File

@@ -285,7 +285,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [TagSpaces](https://www.tagspaces.org) (A platform for file based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
@@ -325,14 +325,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support and multiple large language models.)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
@@ -341,16 +341,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for linux and macos made with GTK4 and Adwaita)
- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
@@ -368,7 +368,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
@@ -386,7 +386,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
- [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models)
@@ -440,7 +440,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
- [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis.
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal.
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
### Apple Vision Pro
@@ -515,7 +515,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
@@ -524,11 +524,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Mobile
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad)
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS and iPad)
- [Enchanted](https://github.com/AugustDev/enchanted)
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
@@ -552,7 +552,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -562,8 +562,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)

View File

@@ -22,6 +22,7 @@ import (
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
@@ -31,6 +32,7 @@ import (
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
"golang.org/x/crypto/ssh"
"golang.org/x/sync/errgroup"
"golang.org/x/term"
"github.com/ollama/ollama/api"
@@ -106,7 +108,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
}
spinner.Stop()
req.Name = args[0]
req.Model = args[0]
quantize, _ := cmd.Flags().GetString("quantize")
if quantize != "" {
req.Quantize = quantize
@@ -117,26 +119,43 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return err
}
if len(req.Files) > 0 {
fileMap := map[string]string{}
for f, digest := range req.Files {
var mu sync.Mutex
var g errgroup.Group
g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1))
// copy files since we'll be modifying the map
temp := req.Files
req.Files = make(map[string]string, len(temp))
for f, digest := range temp {
g.Go(func() error {
if _, err := createBlob(cmd, client, f, digest, p); err != nil {
return err
}
fileMap[filepath.Base(f)] = digest
}
req.Files = fileMap
mu.Lock()
req.Files[filepath.Base(f)] = digest
mu.Unlock()
return nil
})
}
if len(req.Adapters) > 0 {
fileMap := map[string]string{}
for f, digest := range req.Adapters {
// copy files since we'll be modifying the map
temp = req.Adapters
req.Adapters = make(map[string]string, len(temp))
for f, digest := range temp {
g.Go(func() error {
if _, err := createBlob(cmd, client, f, digest, p); err != nil {
return err
}
fileMap[filepath.Base(f)] = digest
}
req.Adapters = fileMap
mu.Lock()
req.Adapters[filepath.Base(f)] = digest
mu.Unlock()
return nil
})
}
if err := g.Wait(); err != nil {
return err
}
bars := make(map[string]*progress.Bar)
@@ -213,7 +232,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string, digest stri
}
}()
if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
if err := client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
return "", err
}
return digest, nil
@@ -1407,6 +1426,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],
envVars["OLLAMA_CONTEXT_LENGTH"],
})
default:
appendEnvDocs(cmd, envs)
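
The two hunks above are the heart of the "create blobs in parallel" commit: the sequential createBlob loops over req.Files and req.Adapters are fanned out on a shared errgroup, capped just below GOMAXPROCS so the progress UI keeps a core, with a mutex guarding the rebuilt maps. A minimal self-contained sketch of the same pattern — uploadBlob is a stand-in for the real client call, and it assumes Go 1.22 per-iteration loop variables, which the hunk itself relies on:

```go
package main

import (
	"fmt"
	"path/filepath"
	"runtime"
	"sync"

	"golang.org/x/sync/errgroup"
)

// uploadBlob is a stand-in for the real createBlob call.
func uploadBlob(path, digest string) error {
	fmt.Println("uploading", path, digest)
	return nil
}

// createBlobsParallel mirrors the CreateHandler pattern: iterate a
// snapshot of the map, upload each blob concurrently, and rebuild the
// map keyed by base filename under a mutex.
func createBlobsParallel(files map[string]string) (map[string]string, error) {
	var mu sync.Mutex
	var g errgroup.Group
	// leave one CPU for the progress UI, but never go below 1
	g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1))

	out := make(map[string]string, len(files))
	for f, digest := range files {
		g.Go(func() error {
			if err := uploadBlob(f, digest); err != nil {
				return err
			}
			mu.Lock()
			out[filepath.Base(f)] = digest
			mu.Unlock()
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		return nil, err
	}
	return out, nil
}

func main() {
	m, err := createBlobsParallel(map[string]string{
		"/tmp/model.safetensors": "sha256:abc",
	})
	fmt.Println(m, err)
}
```

Iterating a snapshot (temp) while rewriting the original map is the same move as the hunk's "copy files since we'll be modifying the map" comment.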

View File

@@ -690,7 +690,7 @@ func TestCreateHandler(t *testing.T) {
return
}
if req.Name != "test-model" {
if req.Model != "test-model" {
t.Errorf("expected model name 'test-model', got %s", req.Name)
}

View File

@@ -4,10 +4,9 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"slices"
"os"
"strings"
"github.com/ollama/ollama/fs/ggml"
@@ -85,6 +84,14 @@ func (ModelParameters) specialTokenTypes() []string {
}
}
func (ModelParameters) writeFile(f *os.File, kv ggml.KV, ts []ggml.Tensor) error {
return ggml.WriteGGUF(f, kv, ts)
}
func (AdapterParameters) writeFile(f *os.File, kv ggml.KV, ts []ggml.Tensor) error {
return ggml.WriteGGUF(f, kv, ts)
}
type ModelConverter interface {
// KV maps parameters to LLM key-values
KV(*Tokenizer) ggml.KV
@@ -96,6 +103,8 @@ type ModelConverter interface {
// specialTokenTypes returns any special token types the model uses
specialTokenTypes() []string
// writeFile writes the model to the provided io.WriteSeeker
writeFile(*os.File, ggml.KV, []ggml.Tensor) error
}
type moreParser interface {
@@ -110,9 +119,11 @@ type AdapterConverter interface {
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string
writeFile(*os.File, ggml.KV, []ggml.Tensor) error
}
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
bts, err := fs.ReadFile(fsys, "adapter_config.json")
if err != nil {
return err
@@ -147,14 +158,14 @@ func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
return err
}
return writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
return conv.writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
}
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
func ConvertModel(fsys fs.FS, f *os.File) error {
bts, err := fs.ReadFile(fsys, "config.json")
if err != nil {
return err
@@ -173,8 +184,6 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
switch p.Architectures[0] {
case "LlamaForCausalLM":
conv = &llamaModel{}
case "Llama4ForConditionalGeneration":
conv = &llama4Model{}
case "Mistral3ForConditionalGeneration":
conv = &mistral3Model{}
case "MixtralForCausalLM":
@@ -239,13 +248,5 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
return err
}
return writeFile(ws, conv.KV(t), conv.Tensors(ts))
}
func writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
for i := range ts {
ts[i].Shape = slices.Clone(ts[i].Shape)
slices.Reverse(ts[i].Shape)
}
return ggml.WriteGGUF(ws, kv, ts)
return conv.writeFile(f, conv.KV(t), conv.Tensors(ts))
}
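
This hunk replaces the package-level writeFile helper, which reversed each tensor's Shape before calling ggml.WriteGGUF, with a writeFile method on each converter, and narrows the destination from io.WriteSeeker to *os.File (the parallel GGUF writer further down needs WriteAt). The shape reversal does not disappear: GGUF records dimensions innermost-first, so the writer now reverses at emit time, as the ggufWriteTensorInfo hunk below shows. A minimal sketch of that write-time reversal:

```go
package ggml

import (
	"encoding/binary"
	"io"
)

// writeShapeReversed emits a tensor's dimensions in reverse order, the
// way ggufWriteTensorInfo does after this change; converters keep their
// shapes outermost-first and no longer pre-reverse them.
func writeShapeReversed(w io.Writer, shape []uint64) error {
	for i := range len(shape) {
		if err := binary.Write(w, binary.LittleEndian, shape[len(shape)-i-1]); err != nil {
			return err
		}
	}
	return nil
}
```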

View File

@@ -42,8 +42,6 @@ type llamaModel struct {
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
NormEpsilon float32 `json:"norm_epsilon"`
HeadDim uint32 `json:"head_dim"`
skipRepack bool
}
var _ ModelConverter = (*llamaModel)(nil)
@@ -72,10 +70,6 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
kv["llama.rope.dimension_count"] = p.HiddenSize / headCount
}
if p.HeadDim > 0 {
kv["llama.attention.head_dim"] = p.HeadDim
}
if p.RopeTheta > 0 {
kv["llama.rope.freq_base"] = p.RopeTheta
}
@@ -139,10 +133,9 @@ func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
}
for _, t := range ts {
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
if !p.skipRepack {
t.SetRepacker(p.repack)
}
if strings.HasSuffix(t.Name(), "attn_q.weight") ||
strings.HasSuffix(t.Name(), "attn_k.weight") {
t.SetRepacker(p.repack)
}
out = append(out, ggml.Tensor{

View File

@@ -1,169 +0,0 @@
package convert
import (
"slices"
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/fs/ggml"
)
type llama4Model struct {
ModelParameters
TextModel struct {
llamaModel
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
NumLocalExperts uint32 `json:"num_local_experts"`
InterleaveMOELayerStep uint32 `json:"interleave_moe_layer_step"`
UseQKNorm bool `json:"use_qk_norm"`
IntermediateSizeMLP uint32 `json:"intermediate_size_mlp"`
AttentionChunkSize uint32 `json:"attention_chunk_size"`
} `json:"text_config"`
VisionModel struct {
NumHiddenLayers uint32 `json:"num_hidden_layers"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
ImageSize uint32 `json:"image_size"`
PatchSize uint32 `json:"patch_size"`
RopeTheta float32 `json:"rope_theta"`
NormEpsilon float32 `json:"norm_eps"`
PixelShuffleRatio float32 `json:"pixel_shuffle_ratio"`
} `json:"vision_config"`
}
// KV implements ModelConverter.
func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama4"
for k, v := range p.TextModel.KV(t) {
if strings.HasPrefix(k, "llama.") {
kv[strings.ReplaceAll(k, "llama.", "llama4.")] = v
}
}
kv["llama4.feed_forward_length"] = p.TextModel.IntermediateSizeMLP
kv["llama4.expert_feed_forward_length"] = p.TextModel.IntermediateSize
kv["llama4.expert_count"] = p.TextModel.NumLocalExperts
kv["llama4.expert_used_count"] = p.TextModel.NumExpertsPerToken
kv["llama4.interleave_moe_layer_step"] = p.TextModel.InterleaveMOELayerStep
kv["llama4.use_qk_norm"] = p.TextModel.UseQKNorm
kv["llama4.attention.chunk_size"] = p.TextModel.AttentionChunkSize
kv["llama4.vision.block_count"] = p.VisionModel.NumHiddenLayers
kv["llama4.vision.embedding_length"] = p.VisionModel.HiddenSize
kv["llama4.vision.feed_forward_length"] = p.VisionModel.IntermediateSize
kv["llama4.vision.attention.head_count"] = p.VisionModel.NumAttentionHeads
kv["llama4.vision.image_size"] = p.VisionModel.ImageSize
kv["llama4.vision.patch_size"] = p.VisionModel.PatchSize
kv["llama4.vision.rope.freq_base"] = p.VisionModel.RopeTheta
kv["llama4.vision.layer_norm_epsilon"] = p.VisionModel.NormEpsilon
kv["llama4.vision.pixel_shuffle_ratio"] = p.VisionModel.PixelShuffleRatio
return kv
}
// Replacements implements ModelConverter.
func (p *llama4Model) Replacements() []string {
return append(
p.TextModel.Replacements(),
"language_model.", "",
"vision_model", "v",
"multi_modal_projector", "mm",
"feed_forward.down_proj", "ffn_down",
"feed_forward.up_proj", "ffn_up",
"feed_forward.gate_proj", "ffn_gate",
"feed_forward.", "ffn_",
"shared_expert.down_proj", "down_shexp",
"shared_expert.gate_proj", "gate_shexp",
"shared_expert.up_proj", "up_shexp",
"experts.down_proj", "down_exps.weight",
"experts.gate_up_proj", "gate_up_exps.weight",
"router", "gate_inp",
"patch_embedding.linear", "patch_embedding",
)
}
// Tensors implements ModelConverter.
func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
var textTensors []Tensor
for _, t := range ts {
if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
} else if strings.Contains(t.Name(), "ffn_gate_up_exps") {
// gate and up projectors are fused
// dims[1], dims[2] must be swapped
// [experts, hidden_size, intermediate_size * 2] --> [experts, intermediate_size, hidden_size]
halfDim := int(t.Shape()[2]) / 2
newShape := slices.Clone(t.Shape())
newShape[1], newShape[2] = newShape[2]/2, newShape[1]
for i, name := range []string{"ffn_gate_exps", "ffn_up_exps"} {
// clone tensor since we need separate repackers
tt := t.Clone()
tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim)))
out = append(out, ggml.Tensor{
Name: strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name),
Kind: tt.Kind(),
Shape: newShape,
WriterTo: tt,
})
}
} else if strings.Contains(t.Name(), "ffn_down_exps") {
// dims[1], dims[2] must be swapped
// [experts, intermediate_size, hidden_size] --> [experts, hidden_size, intermediate_size]
t.SetRepacker(p.repack())
newShape := slices.Clone(t.Shape())
newShape[1], newShape[2] = newShape[2], newShape[1]
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: newShape,
WriterTo: t,
})
} else {
textTensors = append(textTensors, t)
}
}
p.TextModel.skipRepack = true
out = append(out, p.TextModel.Tensors(textTensors)...)
return out
}
func (p *llama4Model) repack(slice ...tensor.Slice) Repacker {
return func(name string, data []float32, shape []uint64) ([]float32, error) {
dims := make([]int, len(shape))
for i, dim := range shape {
dims[i] = int(dim)
}
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
t, err := t.Slice(slice...)
if err != nil {
return nil, err
}
if err := t.T(0, 2, 1); err != nil {
return nil, err
}
t = tensor.Materialize(t)
// flatten tensor so it can be return as a vector
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
return nil, err
}
return native.VectorF32(t.(*tensor.Dense))
}
}
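
The trickiest part of this deleted converter is the fused ffn_gate_up_exps tensor: the gate and up projections share one [experts, hidden, 2*inter] tensor, which is split into two [experts, inter, hidden] outputs by slicing the last dimension in half and transposing dims 1 and 2 (the repack method does this with github.com/pdevine/tensor slices). A toy sketch of the same reshuffle on plain slices, with made-up dimensions:

```go
package main

import "fmt"

// splitFusedGateUp mimics the deleted llama4 repacker: a fused tensor of
// shape [experts, hidden, 2*inter] is split into gate ([..., :inter])
// and up ([..., inter:]) halves, and dims 1 and 2 are swapped so each
// output has shape [experts, inter, hidden].
func splitFusedGateUp(data []float32, experts, hidden, inter int) (gate, up []float32) {
	gate = make([]float32, experts*inter*hidden)
	up = make([]float32, experts*inter*hidden)
	for e := 0; e < experts; e++ {
		for h := 0; h < hidden; h++ {
			for i := 0; i < inter; i++ {
				src := e*hidden*2*inter + h*2*inter
				dst := e*inter*hidden + i*hidden + h // transposed layout
				gate[dst] = data[src+i]
				up[dst] = data[src+inter+i]
			}
		}
	}
	return gate, up
}

func main() {
	// 1 expert, hidden=2, inter=2: each hidden row is [g0 g1 u0 u1]
	data := []float32{1, 2, 3, 4, 5, 6, 7, 8}
	gate, up := splitFusedGateUp(data, 1, 2, 2)
	fmt.Println(gate) // [1 5 2 6]
	fmt.Println(up)   // [3 7 4 8]
}
```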

View File

@@ -11,6 +11,7 @@ import (
"io"
"io/fs"
"log/slog"
"math"
"os"
"path/filepath"
"slices"
@@ -47,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
}
t.Cleanup(func() { r.Close() })
m, _, err := ggml.Decode(r, -1)
m, _, err := ggml.Decode(r, math.MaxInt)
if err != nil {
t.Fatal(err)
}
@@ -331,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
}
defer r.Close()
m, _, err := ggml.Decode(r, -1)
m, _, err := ggml.Decode(r, math.MaxInt)
if err != nil {
t.Fatal(err)
}

View File

@@ -11,15 +11,14 @@ type Tensor interface {
Name() string
Shape() []uint64
Kind() uint32
SetRepacker(Repacker)
SetRepacker(repacker)
WriteTo(io.Writer) (int64, error)
Clone() Tensor
}
type tensorBase struct {
name string
shape []uint64
repacker Repacker
name string
shape []uint64
repacker
}
func (t tensorBase) Name() string {
@@ -37,8 +36,7 @@ const (
func (t tensorBase) Kind() uint32 {
if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
t.name == "token_types.weight" ||
t.name == "v.positional_embedding_vlm" {
t.name == "token_types.weight" {
// these tensors are always F32
return 0
}
@@ -53,11 +51,11 @@ func (t tensorBase) Kind() uint32 {
}
}
func (t *tensorBase) SetRepacker(fn Repacker) {
func (t *tensorBase) SetRepacker(fn repacker) {
t.repacker = fn
}
type Repacker func(string, []float32, []uint64) ([]float32, error)
type repacker func(string, []float32, []uint64) ([]float32, error)
func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
patterns := []struct {

View File

@@ -94,21 +94,6 @@ type safetensor struct {
*tensorBase
}
func (st safetensor) Clone() Tensor {
return &safetensor{
fs: st.fs,
path: st.path,
dtype: st.dtype,
offset: st.offset,
size: st.size,
tensorBase: &tensorBase{
name: st.name,
repacker: st.repacker,
shape: slices.Clone(st.shape),
},
}
}
func (st safetensor) WriteTo(w io.Writer) (int64, error) {
f, err := st.fs.Open(st.path)
if err != nil {

View File

@@ -43,17 +43,6 @@ type torch struct {
*tensorBase
}
func (t torch) Clone() Tensor {
return torch{
storage: t.storage,
tensorBase: &tensorBase{
name: t.name,
shape: t.shape,
repacker: t.repacker,
},
}
}
func (pt torch) WriteTo(w io.Writer) (int64, error) {
return 0, nil
}

View File

@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
By default, Ollama uses a context window size of 4096 tokens.
By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
To change this when using `ollama run`, use `/set parameter`:
```shell
/set parameter num_ctx 4096
/set parameter num_ctx 8192
```
When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
"num_ctx": 8192
}
}'
```

View File

@@ -169,7 +169,7 @@ var (
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
)
func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
}
}
func Int64(key string, defaultValue int64) func() int64 {
return func() int64 {
if s := Var(key); s != "" {
if n, err := strconv.ParseInt(s, 10, 64); err != nil {
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
} else {
return n
}
}
return defaultValue
}
}
// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
// Informational
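
OLLAMA_CONTEXT_LENGTH changes from a uint defaulting to 4096 to an int64 defaulting to -1, so "unset" becomes distinguishable and can be resolved late — per the FAQ hunk above: 4096 normally, 2048 on a single GPU with <= 4 GB of VRAM. A hedged sketch of that resolution; lowVRAM is a stand-in condition, not the server's actual detection logic:

```go
package main

import "fmt"

// defaultContextLength resolves the -1 sentinel from
// envconfig.ContextLength(). The thresholds mirror the FAQ text; the
// lowVRAM flag stands in for however the server detects a single GPU
// with <= 4 GiB of VRAM.
func defaultContextLength(envValue int64, lowVRAM bool) uint64 {
	if envValue >= 0 {
		return uint64(envValue)
	}
	if lowVRAM {
		return 2048
	}
	return 4096
}

func main() {
	fmt.Println(defaultContextLength(-1, false))  // 4096
	fmt.Println(defaultContextLength(-1, true))   // 2048
	fmt.Println(defaultContextLength(8192, true)) // 8192
}
```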

View File

@@ -278,9 +278,9 @@ func TestVar(t *testing.T) {
}
func TestContextLength(t *testing.T) {
cases := map[string]uint{
"": 4096,
"2048": 2048,
cases := map[string]int64{
"": -1,
"4096": 4096,
}
for k, v := range cases {

View File

@@ -8,6 +8,6 @@ type Config interface {
Bool(string, ...bool) bool
Strings(string, ...[]string) []string
Ints(string, ...[]int32) []int32
Uints(string, ...[]uint32) []uint32
Floats(string, ...[]float32) []float32
}

View File

@@ -33,7 +33,7 @@ func (kv KV) Kind() string {
}
func (kv KV) ParameterCount() uint64 {
return keyValue(kv, "general.parameter_count", uint64(0))
return keyValue[uint64](kv, "general.parameter_count")
}
func (kv KV) FileType() fileType {
@@ -105,42 +105,42 @@ func (kv KV) Bool(key string, defaultValue ...bool) bool {
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
}
r := keyValue(kv, key, &array{})
s := make([]string, r.size)
for i := range r.size {
s[i] = r.values[i].(string)
}
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
return s
}
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
r := keyValue(kv, key, &array{})
s := make([]uint32, r.size)
for i := range r.size {
s[i] = uint32(r.values[i].(int32))
}
return s
}
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
r := keyValue(kv, key, &array{})
s := make([]float32, r.size)
for i := range r.size {
s[i] = float32(r.values[i].(float32))
}
return s
}
func (kv KV) OllamaEngineRequired() bool {
return slices.Contains([]string{
"gemma3",
"mistral3",
"llama4",
}, kv.Architecture())
}
type valueTypes interface {
uint8 | int8 | uint16 | int16 |
uint32 | int32 | uint64 | int64 |
string | float32 | float64 | bool
}
type arrayValueTypes interface {
*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
*array[string] | *array[float32] | *array[float64] | *array[bool]
}
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
func keyValue[T string | uint32 | uint64 | float32 | *array | bool](kv KV, key string, defaultValue ...T) T {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
key = kv.Architecture() + "." + key
}
@@ -375,8 +375,13 @@ func DetectContentType(b []byte) string {
// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32
@@ -415,7 +420,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCount()
headsKV := f.KV().HeadCountKV()
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
embeddingHeads := f.KV().EmbeddingHeadCount()
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
@@ -430,7 +435,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
}
switch f.KV().Architecture() {
case "llama", "llama4":
case "llama":
fullOffload = max(
4*batch*(1+4*embedding+context*(1+heads)),
4*batch*(embedding+vocab),
@@ -444,7 +449,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
// mixtral 8x22b
ff := uint64(f.KV().Uint("feed_forward_length"))
ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max(
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
@@ -461,9 +466,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
case "mllama":
var visionTokens, tiles uint64 = 1601, 4
crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
crossAttentionLayers := f.KV().Uints("attention.cross_attention_layers")
for i := range kv {
if slices.Contains(crossAttentionLayers, int32(i)) {
if slices.Contains(crossAttentionLayers, uint32(i)) {
kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
4 * // sizeof(float32)
visionTokens *
@@ -640,9 +645,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
graphSize = 4 * (imageSize*imageSize*numChannels +
embeddingLength*patchSize +
numPatches*numPatches*headCount)
case "llama4":
// vision graph is computed independently in the same schedule
// and is negligible compared to the worst case text graph
}
return weights, graphSize
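
The KV hunks swap a type-parameterized array[T], whose accessors return the typed slice directly, for an any-backed array whose accessors assert and convert element-by-element. A minimal sketch of the generic side — kvGet is a stand-in for the real keyValue helper minus its architecture-based key prefixing — which also shows why the non-generic Uints needs the uint32(v.(int32)) dance (GGUF stores token types as int32):

```go
package main

import "fmt"

// array is a typed GGUF array; values is nil when the on-disk array was
// larger than the configured maxArraySize.
type array[T any] struct {
	size   int
	values []T
}

// kvGet looks up a typed value with a default, the same shape as the
// generic keyValue helper in the hunk above.
func kvGet[T any](kv map[string]any, key string, def T) T {
	if v, ok := kv[key].(T); ok {
		return v
	}
	return def
}

func main() {
	kv := map[string]any{
		"general.parameter_count": uint64(7_000_000_000),
		"tokenizer.ggml.tokens":   &array[string]{size: 2, values: []string{"a", "b"}},
	}
	fmt.Println(kvGet(kv, "general.parameter_count", uint64(0)))
	fmt.Println(kvGet(kv, "tokenizer.ggml.tokens", &array[string]{}).values)
}
```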

View File

@@ -2,7 +2,6 @@ package ggml
import (
"maps"
"math"
"slices"
"strconv"
"strings"
@@ -211,61 +210,3 @@ func TestTensorTypes(t *testing.T) {
})
}
}
func TestKeyValue(t *testing.T) {
kv := KV{
"general.architecture": "test",
"test.strings": &array[string]{size: 3, values: []string{"a", "b", "c"}},
"test.float32s": &array[float32]{size: 3, values: []float32{1.0, 2.0, 3.0}},
"test.int32s": &array[int32]{size: 3, values: []int32{1, 2, 3}},
"test.uint32s": &array[uint32]{size: 3, values: []uint32{1, 2, 3}},
}
if diff := cmp.Diff(kv.Strings("strings"), []string{"a", "b", "c"}); diff != "" {
t.Errorf("unexpected strings (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Strings("nonexistent.strings"), []string(nil)); diff != "" {
t.Errorf("unexpected strings (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Strings("default.strings", []string{"ollama"}), []string{"ollama"}); diff != "" {
t.Errorf("unexpected strings (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Floats("float32s"), []float32{1.0, 2.0, 3.0}); diff != "" {
t.Errorf("unexpected float32s (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Floats("nonexistent.float32s"), []float32(nil)); diff != "" {
t.Errorf("unexpected float32s (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Floats("default.float32s", []float32{math.MaxFloat32}), []float32{math.MaxFloat32}); diff != "" {
t.Errorf("unexpected float32s (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Ints("int32s"), []int32{1, 2, 3}); diff != "" {
t.Errorf("unexpected int8s (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Ints("nonexistent.int32s"), []int32(nil)); diff != "" {
t.Errorf("unexpected int8s (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Ints("default.int32s", []int32{math.MaxInt32}), []int32{math.MaxInt32}); diff != "" {
t.Errorf("unexpected int8s (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Uints("uint32s"), []uint32{1, 2, 3}); diff != "" {
t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Uints("nonexistent.uint32s"), []uint32(nil)); diff != "" {
t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
}
if diff := cmp.Diff(kv.Uints("default.uint32s", []uint32{math.MaxUint32}), []uint32{math.MaxUint32}); diff != "" {
t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
}
}

View File

@@ -9,8 +9,12 @@ import (
"io"
"log/slog"
"maps"
"os"
"runtime"
"slices"
"strings"
"golang.org/x/sync/errgroup"
)
type containerGGUF struct {
@@ -36,6 +40,10 @@ type containerGGUF struct {
maxArraySize int
}
func (c *containerGGUF) canCollectArray(size int) bool {
return c.maxArraySize < 0 || size <= c.maxArraySize
}
func (c *containerGGUF) Name() string {
return "gguf"
}
@@ -291,23 +299,6 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
return b.String(), nil
}
func readGGUFV1StringsData(llm *gguf, r io.Reader, a *array[string]) (any, error) {
for i := range a.size {
if a.values != nil {
e, err := readGGUFV1String(llm, r)
if err != nil {
return nil, err
}
a.values[i] = e
} else {
discardGGUFString(llm, r)
}
}
return a, nil
}
func discardGGUFString(llm *gguf, r io.Reader) error {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
@@ -365,44 +356,78 @@ func writeGGUFString(w io.Writer, s string) error {
return err
}
func readGGUFStringsData(llm *gguf, r io.Reader, a *array[string]) (any, error) {
for i := range a.size {
if a.values != nil {
e, err := readGGUFString(llm, r)
if err != nil {
return nil, err
}
type array struct {
size int
values []any
}
func (a *array) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
}
n, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
}
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, 0, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
e, err = readGGUF[uint8](llm, r)
case ggufTypeInt8:
e, err = readGGUF[int8](llm, r)
case ggufTypeUint16:
e, err = readGGUF[uint16](llm, r)
case ggufTypeInt16:
e, err = readGGUF[int16](llm, r)
case ggufTypeUint32:
e, err = readGGUF[uint32](llm, r)
case ggufTypeInt32:
e, err = readGGUF[int32](llm, r)
case ggufTypeUint64:
e, err = readGGUF[uint64](llm, r)
case ggufTypeInt64:
e, err = readGGUF[int64](llm, r)
case ggufTypeFloat32:
e, err = readGGUF[float32](llm, r)
case ggufTypeFloat64:
e, err = readGGUF[float64](llm, r)
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
e, err = readGGUFV1String(llm, r)
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
if err != nil {
return nil, err
}
if a.values != nil {
a.values[i] = e
} else {
discardGGUFString(llm, r)
}
}
return a, nil
}
type array[T any] struct {
// size is the actual size of the array
size int
// values is the array of values. this is nil if the array is larger than configured maxSize
values []T
}
func (a *array[T]) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func newArray[T any](size, maxSize int) *array[T] {
a := array[T]{size: size}
if maxSize < 0 || size <= maxSize {
a.values = make([]T, size)
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 {
return readGGUFV1Array(llm, r)
}
return &a
}
func readGGUFArray(llm *gguf, r io.Reader) (any, error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
@@ -413,55 +438,45 @@ func readGGUFArray(llm *gguf, r io.Reader) (any, error) {
return nil, err
}
switch t {
case ggufTypeUint8:
a := newArray[uint8](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeInt8:
a := newArray[int8](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeUint16:
a := newArray[uint16](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeInt16:
a := newArray[int16](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeUint32:
a := newArray[uint32](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeInt32:
a := newArray[int32](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeUint64:
a := newArray[uint64](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeInt64:
a := newArray[int64](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeFloat32:
a := newArray[float32](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeFloat64:
a := newArray[float64](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeBool:
a := newArray[bool](int(n), llm.maxArraySize)
return readGGUFArrayData(llm, r, a)
case ggufTypeString:
a := newArray[string](int(n), llm.maxArraySize)
if llm.Version == 1 {
return readGGUFV1StringsData(llm, r, a)
}
return readGGUFStringsData(llm, r, a)
default:
return nil, fmt.Errorf("invalid array type: %d", t)
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
}
func readGGUFArrayData[T any](llm *gguf, r io.Reader, a *array[T]) (any, error) {
for i := range a.size {
e, err := readGGUF[T](llm, r)
for i := range n {
var e any
switch t {
case ggufTypeUint8:
e, err = readGGUF[uint8](llm, r)
case ggufTypeInt8:
e, err = readGGUF[int8](llm, r)
case ggufTypeUint16:
e, err = readGGUF[uint16](llm, r)
case ggufTypeInt16:
e, err = readGGUF[int16](llm, r)
case ggufTypeUint32:
e, err = readGGUF[uint32](llm, r)
case ggufTypeInt32:
e, err = readGGUF[int32](llm, r)
case ggufTypeUint64:
e, err = readGGUF[uint64](llm, r)
case ggufTypeInt64:
e, err = readGGUF[int64](llm, r)
case ggufTypeFloat32:
e, err = readGGUF[float32](llm, r)
case ggufTypeFloat64:
e, err = readGGUF[float64](llm, r)
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
if a.values != nil {
e, err = readGGUFString(llm, r)
} else {
err = discardGGUFString(llm, r)
}
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
if err != nil {
return nil, err
}
@@ -491,22 +506,22 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
return binary.Write(w, binary.LittleEndian, s)
}
func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
func WriteGGUF(f *os.File, kv KV, ts []Tensor) error {
alignment := kv.Uint("general.alignment", 32)
if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
return err
}
@@ -514,7 +529,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
slices.Sort(keys)
for _, key := range keys {
if err := ggufWriteKV(ws, key, kv[key]); err != nil {
if err := ggufWriteKV(f, key, kv[key]); err != nil {
return err
}
}
@@ -530,21 +545,34 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
})
var s uint64
for _, t := range ts {
t.Offset = s + uint64(ggufPadding(int64(s), int64(alignment)))
if err := ggufWriteTensorInfo(ws, t); err != nil {
for i := range ts {
ts[i].Offset = s + uint64(ggufPadding(int64(s), int64(alignment)))
if err := ggufWriteTensorInfo(f, ts[i]); err != nil {
return err
}
s += t.Size()
s += ts[i].Size()
}
offset, err := f.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
offset += ggufPadding(offset, int64(alignment))
slog.Debug("gguf", "offset", offset, "size", s, "alignment", alignment)
var g errgroup.Group
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range ts {
if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil {
t := t
w := io.NewOffsetWriter(f, offset+int64(t.Offset))
g.Go(func() error {
_, err := t.WriteTo(w)
return err
}
})
}
return nil
return g.Wait()
}
func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
@@ -616,8 +644,8 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
return err
}
for _, n := range t.Shape {
if err := binary.Write(ws, binary.LittleEndian, n); err != nil {
for i := range len(t.Shape) {
if err := binary.Write(ws, binary.LittleEndian, t.Shape[len(t.Shape)-i-1]); err != nil {
return err
}
}
@@ -629,20 +657,6 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
return binary.Write(ws, binary.LittleEndian, t.Offset)
}
func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
offset, err := ws.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
return err
}
_, err = t.WriteTo(ws)
return err
}
func ggufPadding(offset, align int64) int64 {
return (align - offset%align) % align
}
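
This is the "wip: write tensors in parallel" commit: tensor offsets are assigned while the info records are written, the data-section base offset is computed once with a Seek, and then every tensor body is written through its own io.NewOffsetWriter at base+offset on an errgroup. That is also why WriteGGUF now takes an *os.File instead of an io.WriteSeeker — io.NewOffsetWriter needs WriteAt, so concurrent writers never share a file cursor. A stripped-down sketch of the fan-out with stand-in payloads:

```go
package main

import (
	"fmt"
	"io"
	"os"
	"runtime"

	"golang.org/x/sync/errgroup"
)

// writeParallel writes each payload at its precomputed offset from base,
// mirroring the WriteGGUF change: one OffsetWriter per tensor, all
// backed by the file's WriteAt.
func writeParallel(f *os.File, base int64, payloads map[int64][]byte) error {
	var g errgroup.Group
	g.SetLimit(runtime.GOMAXPROCS(0))
	for off, data := range payloads {
		off, data := off, data // pre-1.22 loop capture, as in the hunk's t := t
		w := io.NewOffsetWriter(f, base+off)
		g.Go(func() error {
			_, err := w.Write(data)
			return err
		})
	}
	return g.Wait()
}

func main() {
	f, err := os.CreateTemp("", "gguf-*")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())

	err = writeParallel(f, 0, map[int64][]byte{
		0: []byte("aaaa"),
		4: []byte("bbbb"),
	})
	fmt.Println(err)
	b, _ := os.ReadFile(f.Name())
	fmt.Printf("%s\n", b) // aaaabbbb
}
```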

View File

@@ -34,15 +34,13 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
}
res, err := embeddingTestHelper(ctx, client, t, req)
res, err := embeddingTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -64,15 +62,13 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}
res, err := embedTestHelper(ctx, client, t, req)
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -102,15 +98,13 @@ func TestAllMiniLMEmbed(t *testing.T) {
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}
res, err := embedTestHelper(ctx, client, t, req)
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -150,8 +144,6 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
truncTrue, truncFalse := true, false
@@ -190,7 +182,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
res := make(map[string]*api.EmbedResponse)
for _, req := range reqs {
response, err := embedTestHelper(ctx, client, t, req.Request)
response, err := embedTestHelper(ctx, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
@@ -206,7 +198,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
// check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
@@ -218,7 +210,9 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
}
func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
@@ -232,7 +226,9 @@ func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T,
return response, nil
}
func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}

View File

@@ -21,7 +21,6 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e
type Causal struct {
DType ml.DType
windowSize int32
chunkSize int32
opts CausalOptions
@@ -98,17 +97,6 @@ func NewSWACache(windowSize int32, shift shiftFn) *Causal {
}
}
func NewChunkedAttentionCache(chunkSize int32, shift shiftFn) *Causal {
return &Causal{
windowSize: math.MaxInt32,
chunkSize: chunkSize,
shiftFn: shift,
ctxs: make(map[int]ml.Context),
keys: make(map[int]ml.Tensor),
values: make(map[int]ml.Tensor),
}
}
func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
if c.config == nil {
var config ml.CacheConfig
@@ -312,7 +300,6 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
(enabled && c.cells[j].pos > c.curPositions[i]) ||
c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize ||
c.cells[j].pos < c.curPositions[i]-c.windowSize {
mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
}
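
The removed chunkSize field implements chunked (llama4-style) attention: on top of the causal rule, a query at position p may only attend to cached positions in its own chunk, i.e. cells with pos >= p - p%chunkSize. A tiny sketch of just that visibility predicate, which reproduces the masks in the deleted TestChunkedAttention below:

```go
package main

import "fmt"

// visible reports whether a cached key at position cell can be attended
// to by a query at position pos under chunked attention: only positions
// in the same chunk, at or before pos, are visible. This mirrors the
// mask condition removed from (*Causal).buildMask.
func visible(cell, pos, chunkSize int32) bool {
	if cell > pos { // causal
		return false
	}
	return cell >= pos-pos%chunkSize // same chunk
}

func main() {
	// chunkSize 2: position 3 sees {2, 3}; position 4 starts a new
	// chunk and sees only {4}
	for _, pos := range []int32{3, 4} {
		var vis []int32
		for cell := int32(0); cell <= pos; cell++ {
			if visible(cell, pos, 2) {
				vis = append(vis, cell)
			}
		}
		fmt.Println(pos, vis)
	}
}
```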

View File

@@ -86,64 +86,6 @@ func TestSWA(t *testing.T) {
testCache(t, backend, cache, tests)
}
func TestChunkedAttention(t *testing.T) {
cache := NewChunkedAttentionCache(2, nil)
defer cache.Close()
var b testBackend
cache.Init(&b, ml.DTypeF16, 1, 16, 16)
x := float32(math.Inf(-1))
testCache(
t, &b, cache,
[]testCase{
{
name: "FirstBatch",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 0, 0},
pos: []int32{0, 1, 2, 3},
expected: []float32{1, 2, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{
0, x, x, x,
0, 0, x, x,
x, x, 0, x,
x, x, 0, 0,
},
},
{
name: "SecondBatch",
in: []float32{5, 6, 7},
inShape: []int{1, 1, 3},
seqs: []int{0, 0, 0},
pos: []int32{4, 5, 6},
expected: []float32{1, 2, 3, 4, 5, 6, 7},
expectedShape: []int{1, 1, 7},
expectedMask: []float32{
x, x, x, x, 0, x, x,
x, x, x, x, 0, 0, x,
x, x, x, x, x, x, 0,
},
},
{
name: "ThirdBatch",
in: []float32{8, 9},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{7, 8},
expected: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9},
expectedShape: []int{1, 1, 9},
expectedMask: []float32{
x, x, x, x, x, x, 0, 0, x,
x, x, x, x, x, x, x, x, 0,
},
},
},
)
}
func TestSequences(t *testing.T) {
backend := &testBackend{}
cache := NewCausalCache(nil)
@@ -351,16 +293,8 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
context.Forward(out, mask).Compute(out, mask)
if !slices.Equal(out.Floats(), test.expected) {
t.Errorf("TestCache: have %v; want %v", out.Floats(), test.expected)
}
if !slices.Equal(out.Shape(), test.expectedShape) {
t.Errorf("TestCache: has shape %v; want %v", out.Shape(), test.expectedShape)
}
if !slices.Equal(mask.Floats(), test.expectedMask) {
t.Errorf("TestCache: have mask: have %v want %v", mask.Floats(), test.expectedMask)
if !slices.Equal(out.Floats(), test.expected) || !slices.Equal(out.Shape(), test.expectedShape) || !slices.Equal(mask.Floats(), test.expectedMask) {
t.Errorf("TestCache: have %v (shape %v); want %v (shape %v); mask: have %v (shape %v) want %v", out.Floats(), out.Shape(), test.expected, test.expectedShape, mask.Floats(), mask.Shape(), test.expectedMask)
}
})
}

View File

@@ -414,7 +414,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
}
defer file.Close()
ggml, _, err := ggml.Decode(file, 1024)
ggml, _, err := ggml.Decode(file, 0)
if err != nil {
return 0, 0
}
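
projectorMemoryRequirements can pass 0 now that Decode treats 0 as "use the default cap of 1024 array entries" and a negative value as "collect everything", per the doc-comment hunk earlier; callers that need full arrays, like the convert tests above, pass math.MaxInt. A usage sketch with a stand-in path:

```go
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	f, err := os.Open("model.gguf") // stand-in path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// 0 = the default cap of 1024 entries per array, plenty for memory
	// estimation, which only reads scalar metadata.
	m, _, err := ggml.Decode(f, 0)
	if err != nil {
		panic(err)
	}
	fmt.Println(m.KV().Architecture())
}
```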

View File

@@ -133,7 +133,6 @@ type Tensor interface {
Mul(ctx Context, t2 Tensor) Tensor
Mulmat(ctx Context, t2 Tensor) Tensor
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
MulmatID(ctx Context, t2, ids Tensor) Tensor
Softmax(ctx Context) Tensor
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
@@ -151,7 +150,6 @@ type Tensor interface {
Tanh(ctx Context) Tensor
GELU(ctx Context) Tensor
SILU(ctx Context) Tensor
Sigmoid(ctx Context) Tensor
Reshape(ctx Context, shape ...int) Tensor
View(ctx Context, offset int, shape ...int) Tensor
@@ -170,8 +168,6 @@ type Tensor interface {
Rows(ctx Context, t2 Tensor) Tensor
Copy(ctx Context, t2 Tensor) Tensor
Duplicate(ctx Context) Tensor
TopK(ctx Context, k int) Tensor
}
// ScaledDotProductAttention implements a fused attention

View File

@@ -884,32 +884,17 @@ func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
}
}
func (t *Tensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_mul_mat_id(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, ids.(*Tensor).t),
}
}
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
tt := C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))
if w != nil {
tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
if b != nil {
tt = C.ggml_add(ctx.(*Context).ctx, tt, b.(*Tensor).t)
}
tt := (&Tensor{b: t.b, t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
if b != nil {
tt = tt.Add(ctx, b)
}
return &Tensor{b: t.b, t: tt}
return tt
}
func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
tt := C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))
if w != nil {
tt = C.ggml_mul(ctx.(*Context).ctx, tt, w.(*Tensor).t)
}
return &Tensor{b: t.b, t: tt}
return (&Tensor{b: t.b, t: C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
}
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
@@ -1010,13 +995,6 @@ func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
}
}
func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_sigmoid_inplace(ctx.(*Context).ctx, t.t),
}
}
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
@@ -1180,10 +1158,3 @@ func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
t: C.ggml_dup(ctx.(*Context).ctx, t.t),
}
}
func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
}
}
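
The three removed Tensor ops are exactly what a mixture-of-experts block needs: Sigmoid for router scores, TopK to pick expert indices per token, and MulmatID to matmul each token against only its selected experts (ggml_mul_mat_id). A schematic sketch against the ml interface, with invented shapes and routing details — it makes no claim to match the real llama4 text block, only to show how the removed ops compose:

```go
package moe

import "github.com/ollama/ollama/ml"

// moeForward routes each token to k experts. Scaling and normalization
// are omitted for illustration; only the use of the removed ops
// (Sigmoid, TopK, MulmatID) reflects the diff.
func moeForward(ctx ml.Context, hidden, router, gateExps, upExps, downExps ml.Tensor, k int) ml.Tensor {
	scores := router.Mulmat(ctx, hidden).Sigmoid(ctx) // per-expert router scores
	ids := scores.TopK(ctx, k)                        // top-k expert ids per token

	gate := gateExps.MulmatID(ctx, hidden, ids).SILU(ctx) // run only selected experts
	up := upExps.MulmatID(ctx, hidden, ids)
	return downExps.MulmatID(ctx, gate.Mul(ctx, up), ids)
}
```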

View File

@@ -42,7 +42,7 @@ func New(c fs.Config) (model.Model, error) {
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
Types: c.Uints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
},

View File

@@ -59,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
Types: c.Uints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(1),

View File

@@ -49,7 +49,7 @@ func newTextModel(c fs.Config) *TextModel {
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
Types: c.Uints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
},

View File

@@ -41,7 +41,7 @@ func New(c fs.Config) (model.Model, error) {
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),

View File

@@ -1,189 +0,0 @@
package llama4
import (
"bytes"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Model struct {
model.Base
model.BytePairEncoding
ImageProcessor
*VisionModel `gguf:"v,vision"`
*Projector `gguf:"mm"`
*TextModel
}
type Projector struct {
Linear1 *nn.Linear `gguf:"linear_1"`
}
func (p *Projector) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
return p.Linear1.Forward(ctx, visionOutputs)
}
func New(c fs.Config) (model.Model, error) {
m := Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer",
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
ImageProcessor: newImageProcessor(c),
VisionModel: newVisionModel(c),
TextModel: newTextModel(c),
}
m.Cache = kvcache.NewWrapperCache(
kvcache.NewChunkedAttentionCache(int32(c.Uint("attention.chunk_size", 8192)), m.Shift),
kvcache.NewCausalCache(m.Shift),
)
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) < 1 {
return nil, model.ErrNoVisionModel
}
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
pixelsLocal, pixelsGlobal, size, err := m.ProcessImage(img)
if err != nil {
return nil, err
}
tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
if err != nil {
return nil, err
}
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
tilesLocal = tilesLocal.Reshape(ctx, size.X/ratioW, ratioW, size.Y, m.numChannels).Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
tilesLocal = tilesLocal.Reshape(ctx, size.X/ratioW*size.Y/ratioH, ratioH, ratioW, m.numChannels).Permute(ctx, 0, 3, 2, 1).Contiguous(ctx)
tilesLocal = tilesLocal.Reshape(ctx, size.X/ratioW, size.Y/ratioH, m.numChannels, ratioH*ratioW)
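// Worked example, taking imageSize=336 (as in the package tests): a 672x336
// canvas gives ratioW=2, ratioH=1, so the reshapes above split the canvas into
// ratioH*ratioW = 2 tiles of 336x336 stacked on the final (tile) dimension.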
pixelValues := tilesLocal
if len(pixelsGlobal) > 0 {
tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
if err != nil {
return nil, err
}
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
}
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
}
type chunks struct {
*Model
ml.Tensor
aspectRatio image.Point
dataOnce sync.Once
data []float32
}
type chunk struct {
*chunks
s, n int
}
func (r *chunk) floats() []float32 {
r.dataOnce.Do(func() {
temp := r.Backend().NewContext()
defer temp.Close()
temp.Forward(r.Tensor).Compute(r.Tensor)
r.data = r.Floats()
})
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
}
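// floats lazily computes the projected vision tensor exactly once (guarded by
// dataOnce) on a throwaway context, then slices out this chunk's s..s+n patch
// embeddings.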
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if inp.Multimodal == nil {
result = append(result, inp)
continue
}
t := inp.Multimodal.(*chunks)
var imageInputs []input.Input
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
var offset int
patchesPerChunk := t.Dim(1)
if t.aspectRatio.Y*t.aspectRatio.X > 1 {
patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
for range t.aspectRatio.Y {
for x := range t.aspectRatio.X {
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if x < t.aspectRatio.X-1 {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
}
offset += patchesPerChunk
}
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
}
}
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
result = append(result, imageInputs...)
}
return result, nil
}
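// Sketch of the emitted sequence for a 2x1 tile grid: <|image_start|>, patch
// tokens for tile 0, <|tile_x_separator|>, patch tokens for tile 1,
// <|tile_y_separator|>, <|image|>, patch tokens for the global tile, and the
// closing image token.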
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
func init() {
model.Register("llama4", New)
}

View File

@@ -1,259 +0,0 @@
package llama4
import (
"cmp"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model/input"
)
type TextAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
RopeFactors ml.Tensor `gguf:"rope_factors"`
}
func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attentionScales ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor {
batchSize, headDim := hiddenStates.Dim(1), cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
query := sa.Query.Forward(ctx, hiddenStates)
key := sa.Key.Forward(ctx, hiddenStates)
value := sa.Value.Forward(ctx, hiddenStates)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
if useRope {
query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
}
if opts.useQKNorm {
query = query.RMSNorm(ctx, nil, opts.eps)
key = key.RMSNorm(ctx, nil, opts.eps)
}
if attentionScales != nil && !useRope {
query = query.Mul(ctx, attentionScales)
}
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), cache)
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
return sa.Output.Forward(ctx, attention)
}
type TextMLP struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor {
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, hiddenStates)
}
type TextExperts struct {
Gate ml.Tensor `gguf:"ffn_gate_exps.weight"`
Up ml.Tensor `gguf:"ffn_up_exps.weight"`
Down ml.Tensor `gguf:"ffn_down_exps.weight"`
}
func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor {
experts := routerLogits.TopK(ctx, opts.numExpertsUsed)
scores := routerLogits.Sigmoid(ctx).Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, experts)
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed)
hiddenStates = hiddenStates.Mul(ctx, scores)
upStates := e.Up.MulmatID(ctx, hiddenStates, experts)
gateStates := e.Gate.MulmatID(ctx, hiddenStates, experts)
downStates := e.Down.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
}
return nextStates
}
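// In short: TopK picks the numExpertsUsed expert indices per token, Sigmoid
// turns the router logits into gate scores, the hidden states are repeated per
// selected expert and pre-scaled by those gates, MulmatID applies each chosen
// expert's weights, and the per-expert Down outputs are summed into nextStates.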
// TextSharedExpert is TextMLP with different tensor names
type TextSharedExpert struct {
Gate *nn.Linear `gguf:"ffn_gate_shexp"`
Up *nn.Linear `gguf:"ffn_up_shexp"`
Down *nn.Linear `gguf:"ffn_down_shexp"`
}
func (mlp *TextSharedExpert) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor {
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, hiddenStates)
}
type TextMOE struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Experts *TextExperts
SharedExpert *TextSharedExpert
}
func (moe *TextMOE) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor {
hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
routerLogits := moe.Router.Forward(ctx, hiddenStates)
sharedStates := moe.SharedExpert.Forward(ctx, hiddenStates, opts)
routedStates := moe.Experts.Forward(ctx, hiddenStates, routerLogits, opts)
return sharedStates.Add(ctx, routedStates)
}
type TextFeedForward interface {
Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor
}
type TextLayer struct {
AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
Attention *TextAttention
FFNNorm *nn.LayerNorm `gguf:"ffn_norm"`
FeedForward TextFeedForward
}
func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, attentionScales, outputs ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor {
residual := hiddenStates
// self attention
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, attentionScales, cache, useRope, opts)
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = d.FFNNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.FeedForward.Forward(ctx, hiddenStates, opts)
return residual.Add(ctx, hiddenStates)
}
type TextOptions struct {
hiddenSize int
numHeads, numKVHeads, headDim int
numExperts, numExpertsUsed int
ropeDim int
ropeBase, ropeScale float32
eps float32
interleaveLayerStep int
noRopeInterval int
useQKNorm bool
attentionTemperatureTuning bool
attentionScale float64
attentionFloorScale float64
}
type TextModel struct {
Layers []TextLayer `gguf:"blk"`
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
OutputNorm *nn.LayerNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
*TextOptions
}
func newTextModel(c fs.Config) *TextModel {
layers := make([]TextLayer, c.Uint("block_count"))
interleaveLayerStep := c.Uint("interleave_moe_layer_step", 1)
for i := range layers {
if (i+1)%int(interleaveLayerStep) == 0 {
layers[i] = TextLayer{FeedForward: &TextMOE{}}
} else {
layers[i] = TextLayer{FeedForward: &TextMLP{}}
}
}
return &TextModel{
Layers: layers,
TextOptions: &TextOptions{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.head_dim", 128)),
numExperts: int(c.Uint("expert_count")),
numExpertsUsed: int(c.Uint("expert_used_count")),
ropeDim: int(c.Uint("rope.dimension_count")),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
eps: c.Float("attention.layer_norm_rms_epsilon"),
interleaveLayerStep: int(c.Uint("interleave_moe_layer_step", 1)),
noRopeInterval: int(c.Uint("no_rope_interval", 4)),
useQKNorm: c.Bool("use_qk_norm", true),
attentionTemperatureTuning: c.Bool("attention.temperature_tuning", true),
attentionScale: float64(c.Float("attention.scale", 0.1)),
attentionFloorScale: float64(c.Float("attention.floor_scale", 8192)),
},
}
}
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
for _, mi := range batch.Multimodal {
f32s := mi.Multimodal.(*chunk).floats()
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
if err != nil {
panic(err)
}
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
}
var attentionScales ml.Tensor
if m.attentionTemperatureTuning {
scales := make([]float32, len(batch.Positions))
for i, p := range batch.Positions {
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
}
var err error
attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
if err != nil {
panic(err)
}
}
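// Worked example with the config defaults below (attention.scale=0.1,
// attention.floor_scale=8192): position p=8191 gives
// log(floor(8192/8192+1))*0.1 + 1 = ln(2)*0.1 + 1 ≈ 1.069, so the query
// scaling grows logarithmically with position.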
for i, layer := range m.Layers {
cache.SetLayer(i)
wc := cache.(*kvcache.WrapperCache)
wc.SetLayerType(1)
useChunkedAttention := (i+1)%m.noRopeInterval != 0
if useChunkedAttention {
wc.SetLayerType(0)
}
var lastLayerOutputs ml.Tensor
if i == len(m.Layers)-1 {
lastLayerOutputs = outputs
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, attentionScales, lastLayerOutputs, cache, useChunkedAttention, m.TextOptions)
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates)
}
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil
}

View File

@@ -1,256 +0,0 @@
package llama4
import (
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
type VisionAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
// applyVisionRotaryEmbedding applies 2D rotary embedding to the input tensor.
// This is equivalent to the PyTorch implementation using half rotations:
//
// cos, sin = torch.cos(freqs), torch.sin(freqs)
// cos = cos.unsqueeze(-1)
// sin = sin.unsqueeze(-1)
// t = t.reshape(*t.shape[:-1], -1, 2)
// t_out = (t * cos) + (_rotate_half(t) * sin)
// t_out = t_out.flatten(3)
//
// This is in turn equivalent to the PyTorch implementation using complex numbers:
//
// t_ = torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))
// freqs_ci = reshape_for_broadcast(freqs_ci=freq_cis, t=t_) # freqs_ci[:,:,None,:]
// freqs_ci = freqs_ci.to(t_.device)
// t_out = torch.view_as_real(t_ * freqs_ci).flatten(3)
//
// Due to 1) the dimensional and 2) the datatype limitations of current backends,
// we need to use a different approach to achieve the same result.
func applyVisionRotaryEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
width, height, channels, tiles := t.Dim(0), t.Dim(1), t.Dim(2), t.Dim(3)
t = t.Reshape(ctx, 2, t.Dim(0)/2, t.Dim(1)*t.Dim(2)*t.Dim(3))
// t1 = t[..., 0::2]
t1 := t.View(ctx, 0, 1, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2)).Contiguous(ctx)
t1 = t1.Reshape(ctx, width/2, height, channels, tiles)
// t2 = t[..., 1::2]
t2 := t.View(ctx, t.Stride(0), 1, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2)).Contiguous(ctx)
t2 = t2.Reshape(ctx, width/2, height, channels, tiles)
// cos_out = torch.stack((t1 * cos, t2 * cos), dim=-1)
cosOut := t1.Mul(ctx, cos).Concat(ctx, t2.Mul(ctx, cos), 0)
cosOut = cosOut.Reshape(ctx, cosOut.Dim(0)/2, 2, cosOut.Dim(1)*cosOut.Dim(2)*cosOut.Dim(3))
cosOut = cosOut.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
cosOut = cosOut.Reshape(ctx, width, height, channels, tiles)
// sin_out = torch.stack((-t2 * sin, t1 * sin), dim=-1)
sinOut := t2.Neg(ctx).Mul(ctx, sin).Concat(ctx, t1.Mul(ctx, sin), 0)
sinOut = sinOut.Reshape(ctx, sinOut.Dim(0)/2, 2, sinOut.Dim(1)*sinOut.Dim(2)*sinOut.Dim(3))
sinOut = sinOut.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
sinOut = sinOut.Reshape(ctx, width, height, channels, tiles)
return cosOut.Add(ctx, sinOut)
}
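// Net effect per (t1, t2) channel pair: (t1*cos - t2*sin, t2*cos + t1*sin),
// the standard 2D rotation; the Concat/Permute/Reshape steps re-interleave the
// even and odd channels back into their original order.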
func (sa *VisionAttention) Forward(ctx ml.Context, hiddenState, cos, sin ml.Tensor, opts *VisionOptions) ml.Tensor {
headDim := opts.hiddenSize / opts.numHeads
query := sa.Query.Forward(ctx, hiddenState)
key := sa.Key.Forward(ctx, hiddenState)
value := sa.Value.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), query.Dim(2))
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), key.Dim(2))
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), value.Dim(2))
query = applyVisionRotaryEmbedding(ctx, query, cos, sin)
key = applyVisionRotaryEmbedding(ctx, key, cos, sin)
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), attention.Dim(3))
return sa.Output.Forward(ctx, attention)
}
type VisionMLP struct {
FC1 *nn.Linear `gguf:"fc1"`
FC2 *nn.Linear `gguf:"fc2"`
}
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionOptions) ml.Tensor {
hiddenStates = mlp.FC1.Forward(ctx, hiddenStates).GELU(ctx)
hiddenStates = mlp.FC2.Forward(ctx, hiddenStates)
return hiddenStates
}
type VisionLayer struct {
InputLayerNorm *nn.LayerNorm `gguf:"attn_norm"`
*VisionAttention
PostAttentionNorm *nn.LayerNorm `gguf:"ffn_norm"`
*VisionMLP `gguf:"mlp"`
}
func (e *VisionLayer) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts *VisionOptions) ml.Tensor {
residual := hiddenStates
// self attention
hiddenStates = e.InputLayerNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = e.VisionAttention.Forward(ctx, hiddenStates, cos, sin, opts)
hiddenStates = hiddenStates.Add(ctx, residual)
// MLP
residual = hiddenStates
hiddenStates = e.PostAttentionNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = e.VisionMLP.Forward(ctx, hiddenStates, opts)
hiddenStates = hiddenStates.Add(ctx, residual)
return hiddenStates
}
type VisionAdapter struct {
FC1 *nn.Linear `gguf:"mlp.fc1"`
FC2 *nn.Linear `gguf:"mlp.fc2"`
}
func (a *VisionAdapter) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionOptions) ml.Tensor {
patches := hiddenStates.Dim(1)
patchSize := int(math.Sqrt(float64(patches)))
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), patchSize, patchSize, hiddenStates.Dim(2))
channels, width, height, tiles := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2), hiddenStates.Dim(3)
channels, width = int(float32(channels)/opts.pixelShuffleRatio), int(float32(width)*opts.pixelShuffleRatio)
hiddenStates = hiddenStates.Reshape(ctx, channels, width, height, tiles)
hiddenStates = hiddenStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
channels, height = int(float32(channels)/opts.pixelShuffleRatio), int(float32(height)*opts.pixelShuffleRatio)
hiddenStates = hiddenStates.Reshape(ctx, channels, width, height, tiles)
hiddenStates = hiddenStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
hiddenStates = hiddenStates.Reshape(ctx, channels, width*height, tiles)
hiddenStates = a.FC1.Forward(ctx, hiddenStates).GELU(ctx)
hiddenStates = a.FC2.Forward(ctx, hiddenStates).GELU(ctx)
return hiddenStates
}
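// Assuming a pixelShuffleRatio of 0.5, each of the two shuffle rounds halves
// one spatial side and doubles the channel count, so a (C, W, H) patch grid
// becomes (4C, W/2, H/2): a quarter as many patches, each four times wider,
// before the two GELU-activated projections.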
type VisionOptions struct {
hiddenSize, numHeads int
imageSize, patchSize int
ropeTheta float32
eps float32
pixelShuffleRatio float32
}
type PatchEmbedding struct {
*nn.Linear
}
func (p *PatchEmbedding) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionOptions) ml.Tensor {
kernel := ctx.Input().Empty(ml.DTypeF32, opts.patchSize, opts.patchSize, hiddenStates.Dim(2))
hiddenStates = kernel.IM2Col(ctx, hiddenStates, opts.patchSize, opts.patchSize, 0, 0, 1, 1)
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), hiddenStates.Dim(1)*hiddenStates.Dim(2), hiddenStates.Dim(3))
return p.Linear.Forward(ctx, hiddenStates)
}
type VisionModel struct {
Layers []VisionLayer `gguf:"blk"`
*PatchEmbedding `gguf:"patch_embedding"`
ClassEmbedding ml.Tensor `gguf:"class_embedding"`
PositionalEmbedding ml.Tensor `gguf:"positional_embedding_vlm"`
LayerNormPre *nn.LayerNorm `gguf:"layernorm_pre"`
LayerNormPost *nn.LayerNorm `gguf:"layernorm_post"`
*VisionAdapter `gguf:"vision_adapter"`
*VisionOptions
}
func newVisionModel(c fs.Config) *VisionModel {
return &VisionModel{
Layers: make([]VisionLayer, c.Uint("vision.block_count")),
VisionOptions: &VisionOptions{
hiddenSize: int(c.Uint("vision.embedding_length")),
numHeads: int(c.Uint("vision.attention.head_count")),
imageSize: int(c.Uint("vision.image_size")),
patchSize: int(c.Uint("vision.patch_size")),
ropeTheta: float32(c.Float("vision.rope.freq_base")),
eps: c.Float("vision.layer_norm_epsilon"),
pixelShuffleRatio: float32(c.Float("vision.pixel_shuffle_ratio")),
},
}
}
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionOptions)
hiddenStates = hiddenStates.Concat(ctx, m.ClassEmbedding.Repeat(ctx, 2, hiddenStates.Dim(2)), 1)
hiddenStates = hiddenStates.Add(ctx, m.PositionalEmbedding)
hiddenStates = m.LayerNormPre.Forward(ctx, hiddenStates, m.eps)
cos, sin := m.rotaryEmbedding(ctx)
for _, layer := range m.Layers {
hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, m.VisionOptions)
}
hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps)
hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0)
hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions)
return hiddenStates
}
// floorDiv is a helper function to perform floor division. It mimics PyTorch's div(rounding_mode='floor'),
// which in turn mimics Python's // operator.
func floorDiv[T int | int16 | int32 | int64 | uint | uint16 | uint32 | uint64](a, b T) T {
if b == 0 {
panic("division by zero")
}
if (a >= 0 && b > 0) || (a <= 0 && b < 0) || a%b == 0 {
return a / b
}
return a/b - 1
}
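// For example, floorDiv(-3, 2) == -2, matching Python's -3 // 2, whereas Go's
// native integer division -3 / 2 truncates toward zero and yields -1.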
func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
patchesPerSide := m.imageSize / m.patchSize
numPatches := patchesPerSide*patchesPerSide + 1
headDim := m.hiddenSize / m.numHeads
freqDim := headDim / 2
freqs := make([]float32, numPatches*freqDim)
for i := range numPatches - 1 {
for j := 0; j < freqDim; j += 2 {
positionX := i*freqDim/2 + j/2
positionY := (i+numPatches)*freqDim/2 + j/2
ropeFreq := math.Pow(float64(m.ropeTheta), float64(j)*2/float64(headDim))
freqs[positionX] = float32(float64(1+i-floorDiv(i, patchesPerSide)*patchesPerSide) / ropeFreq)
freqs[positionY] = float32(float64(1+floorDiv(i, patchesPerSide)) / ropeFreq)
}
}
ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
if err != nil {
panic(err)
}
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
return ropeFreqs.Cos(ctx), ropeFreqs.Sin(ctx)
}
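// Here 1+i-floorDiv(i, patchesPerSide)*patchesPerSide is patch i's one-based x
// coordinate and 1+floorDiv(i, patchesPerSide) its one-based y coordinate.
// After the permute and reshape, the first half of each position's frequency
// vector rotates by x and the second half by y; the extra +1 position (the
// class token) is never written, so it keeps zero frequencies and is left
// unrotated.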

View File

@@ -1,167 +0,0 @@
package llama4
import (
"cmp"
"image"
"math"
"slices"
"sort"
"golang.org/x/image/draw"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
)
type ImageProcessor struct {
imageSize, patchSize, numChannels, maxUpscalingSize int
}
func newImageProcessor(c fs.Config) ImageProcessor {
return ImageProcessor{
imageSize: int(c.Uint("vision.image_size")),
patchSize: int(c.Uint("vision.patch_size")),
numChannels: int(c.Uint("vision.num_channels", 3)),
maxUpscalingSize: int(c.Uint("vision.max_upscaling_size", 448)),
}
}
func factors(n int) []int {
var result []int
seen := make(map[int]bool)
for i := 1; i <= n/2; i++ {
if n%i == 0 && !seen[i] {
result = append(result, i)
seen[i] = true
}
}
result = append(result, n)
sort.Ints(result)
return result
}
func (p ImageProcessor) supportedResolutions() []image.Point {
var resolutions []image.Point
aspectMap := make(map[float64][]image.Point)
for i := p.patchSize; i >= 1; i-- {
for _, f := range factors(i) {
x := f
y := i / f
k := float64(y) / float64(x)
aspectMap[k] = append(aspectMap[k], image.Point{x, y})
}
}
for _, v := range aspectMap {
for _, i := range v {
resolutions = append(resolutions, image.Point{i.X * p.imageSize, i.Y * p.imageSize})
}
}
return resolutions
}
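// That is, every w x h tile grid whose area is at most patchSize tiles, with
// each tile imageSize pixels square; aspectMap groups candidates by aspect
// ratio but each (w, h) pair is generated exactly once.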
func (p ImageProcessor) bestResolution(img image.Point, possibleResolutions []image.Point, resizeToMaxCanvas bool) image.Point {
w, h := img.X, img.Y
scales := make([]float64, len(possibleResolutions))
for i, res := range possibleResolutions {
scaleW := float64(res.X) / float64(w)
scaleH := float64(res.Y) / float64(h)
scale := math.Min(scaleW, scaleH)
scales[i] = scale
}
minAboveOne := func(scales []float64) (float64, bool) {
min := math.MaxFloat64
found := false
for _, s := range scales {
if s >= 1.0 && s < min {
min = s
found = true
}
}
return min, found
}
bestScale, ok := minAboveOne(scales)
if resizeToMaxCanvas || !ok {
bestScale = slices.Max(scales)
}
var bestOptions []image.Point
for i, scale := range scales {
if math.Abs(scale-bestScale) < 1e-6 {
bestOptions = append(bestOptions, possibleResolutions[i])
}
}
var chosenResolution image.Point
if len(bestOptions) > 1 {
chosenResolution = slices.MinFunc(bestOptions, func(a, b image.Point) int {
return cmp.Compare(a.X*a.Y, b.X*b.Y)
})
} else {
chosenResolution = bestOptions[0]
}
return chosenResolution
}
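// bestResolution prefers the smallest scale factor that still avoids
// downscaling (>= 1); if none exists, or resizeToMaxCanvas is set, it takes
// the largest factor, breaking ties toward the smaller pixel area.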
func (p ImageProcessor) maxResolution(imageRes, targetRes image.Point) image.Point {
scaleW := float64(targetRes.X) / float64(imageRes.X)
scaleH := float64(targetRes.Y) / float64(imageRes.Y)
var newRes image.Point
if scaleW < scaleH {
newRes = image.Point{
targetRes.X,
int(math.Min(math.Floor(float64(imageRes.Y)*scaleW), float64(targetRes.Y))),
}
} else {
newRes = image.Point{
int(math.Min(math.Floor(float64(imageRes.X)*scaleH), float64(targetRes.X))),
targetRes.Y,
}
}
return newRes
}
func (p ImageProcessor) pad(src image.Image, outputSize image.Point) image.Image {
dst := image.NewRGBA(image.Rect(0, 0, outputSize.X, outputSize.Y))
draw.Draw(dst, src.Bounds(), src, image.Point{}, draw.Over)
return dst
}
func (p ImageProcessor) ProcessImage(img image.Image) (pixelsLocal, pixelsGlobal []float32, targetSize image.Point, _ error) {
img = imageproc.Composite(img)
targetSize = p.bestResolution(img.Bounds().Max, p.supportedResolutions(), false)
targetSizeWithoutDistortion := targetSize
if p.maxUpscalingSize > 0 {
targetSizeWithoutDistortion = p.maxResolution(img.Bounds().Max, targetSize)
targetSizeWithoutDistortion.X = min(max(img.Bounds().Max.X, p.maxUpscalingSize), targetSize.X)
targetSizeWithoutDistortion.Y = min(max(img.Bounds().Max.Y, p.maxUpscalingSize), targetSize.Y)
}
newSizeWithoutDistortion := p.maxResolution(img.Bounds().Max, targetSizeWithoutDistortion)
padded := p.pad(imageproc.Resize(img, newSizeWithoutDistortion, imageproc.ResizeBilinear), targetSize)
pixelsLocal = imageproc.Normalize(padded, imageproc.ImageNetStandardMean, imageproc.ImageNetStandardSTD, true, true)
if targetSize.X/p.imageSize*targetSize.Y/p.imageSize > 1 {
padded := imageproc.Resize(img, image.Point{p.imageSize, p.imageSize}, imageproc.ResizeBilinear)
pixelsGlobal = imageproc.Normalize(padded, imageproc.ImageNetStandardMean, imageproc.ImageNetStandardSTD, true, true)
}
return pixelsLocal, pixelsGlobal, targetSize, nil
}

View File

@@ -1,300 +0,0 @@
package llama4
import (
"cmp"
"image"
"image/color"
"reflect"
"slices"
"testing"
gocmp "github.com/google/go-cmp/cmp"
)
func TestFactors(t *testing.T) {
tests := []struct {
name string
input int
expected []int
}{
{
name: "factors of 1",
input: 1,
expected: []int{1},
},
{
name: "factors of 2",
input: 2,
expected: []int{1, 2},
},
{
name: "factors of 6",
input: 6,
expected: []int{1, 2, 3, 6},
},
{
name: "factors of 28",
input: 28,
expected: []int{1, 2, 4, 7, 14, 28},
},
{
name: "factors of 49",
input: 49,
expected: []int{1, 7, 49},
},
{
name: "factors of 97 (prime)",
input: 97,
expected: []int{1, 97},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actual := factors(tt.input)
if !reflect.DeepEqual(actual, tt.expected) {
t.Errorf("factors(%d) = %v; want %v", tt.input, actual, tt.expected)
}
})
}
}
func TestSupportedResolutions(t *testing.T) {
expectedResolutions := []image.Point{
{X: 3360, Y: 336},
{X: 672, Y: 2688},
{X: 336, Y: 1344},
{X: 336, Y: 4032},
{X: 1008, Y: 1344},
{X: 1344, Y: 1008},
{X: 336, Y: 1680},
{X: 1680, Y: 336},
{X: 336, Y: 5040},
{X: 4032, Y: 336},
{X: 2352, Y: 336},
{X: 2688, Y: 672},
{X: 1344, Y: 336},
{X: 5376, Y: 336},
{X: 2352, Y: 672},
{X: 672, Y: 1008},
{X: 1008, Y: 672},
{X: 336, Y: 5376},
{X: 1680, Y: 1008},
{X: 5040, Y: 336},
{X: 336, Y: 3024},
{X: 3024, Y: 336},
{X: 336, Y: 2688},
{X: 672, Y: 1344},
{X: 336, Y: 672},
{X: 336, Y: 2352},
{X: 2016, Y: 672},
{X: 1008, Y: 336},
{X: 336, Y: 3360},
{X: 336, Y: 4368},
{X: 1008, Y: 1680},
{X: 336, Y: 4704},
{X: 4704, Y: 336},
{X: 1344, Y: 672},
{X: 672, Y: 336},
{X: 2688, Y: 336},
{X: 3696, Y: 336},
{X: 2016, Y: 336},
{X: 1344, Y: 1344},
{X: 1008, Y: 1008},
{X: 672, Y: 672},
{X: 336, Y: 336},
{X: 4368, Y: 336},
{X: 672, Y: 2016},
{X: 336, Y: 1008},
{X: 336, Y: 3696},
{X: 672, Y: 1680},
{X: 1680, Y: 672},
{X: 336, Y: 2016},
{X: 672, Y: 2352},
}
sortResolutionFunc := func(a, b image.Point) int {
return cmp.Or(cmp.Compare(a.X, b.X), cmp.Compare(a.Y, b.Y))
}
slices.SortStableFunc(expectedResolutions, sortResolutionFunc)
imgProc := ImageProcessor{
imageSize: 336,
patchSize: 16,
numChannels: 3,
maxUpscalingSize: 448,
}
actualResolutions := imgProc.supportedResolutions()
slices.SortStableFunc(actualResolutions, sortResolutionFunc)
if diff := gocmp.Diff(expectedResolutions, actualResolutions); diff != "" {
t.Errorf("supportedResolutions() mismatch (-want +got):\n%s", diff)
}
}
func TestBestResolution(t *testing.T) {
tests := []struct {
name string
size image.Point
resolutions []image.Point
max bool
expected image.Point
}{
{
"normal",
image.Point{800, 600},
[]image.Point{
{300, 200},
{640, 480},
{800, 600},
{1024, 768},
{1600, 1200},
},
false,
image.Point{800, 600},
},
{
"max",
image.Point{800, 600},
[]image.Point{
{300, 200},
{640, 480},
{800, 600},
{1024, 768},
{1600, 1200},
},
true,
image.Point{1600, 1200},
},
{
"mid",
image.Point{1000, 700},
[]image.Point{
{300, 200},
{640, 480},
{800, 600},
{1024, 768},
{1600, 1200},
},
false,
image.Point{1024, 768},
},
{
"smol",
image.Point{100, 100},
[]image.Point{
{300, 200},
{640, 480},
{800, 600},
{1024, 768},
{1600, 1200},
},
false,
image.Point{300, 200},
},
{
"huge",
image.Point{10000, 10000},
[]image.Point{
{300, 200},
{640, 480},
{800, 600},
{1024, 768},
{1600, 1200},
},
false,
image.Point{1600, 1200},
},
}
p := ImageProcessor{}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actual := p.bestResolution(tt.size, tt.resolutions, tt.max)
if diff := gocmp.Diff(tt.expected, actual); diff != "" {
t.Errorf("best resolution mismatch (-want +got):\n%s", diff)
}
})
}
}
func TestMaxResolution(t *testing.T) {
tests := []struct {
name string
origRes image.Point
targetRes image.Point
expected image.Point
}{
{
"normal",
image.Point{800, 600},
image.Point{800, 600},
image.Point{800, 600},
},
{
"skew",
image.Point{800, 600},
image.Point{1100, 700},
image.Point{933, 700},
},
}
p := ImageProcessor{}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actual := p.maxResolution(tt.origRes, tt.targetRes)
if !reflect.DeepEqual(actual, tt.expected) {
t.Errorf("max resolution; got %v want %v", actual, tt.expected)
}
})
}
}
func TestProcessImage(t *testing.T) {
imgProc := ImageProcessor{
imageSize: 336,
patchSize: 16,
numChannels: 3,
maxUpscalingSize: 448,
}
generateImage := func(seed int) image.Image {
width, height := 20, 10
img := image.NewRGBA(image.Rect(0, 0, width, height))
for x := range width {
// Use the seed to vary color generation
r := uint8((seed + x*11) % 256)
g := uint8((seed + x*17) % 256)
b := uint8((seed + x*23) % 256)
c := color.RGBA{R: r, G: g, B: b, A: 255}
for y := range height {
img.Set(x, y, c)
}
}
return img
}
pixelsLocal, pixelsGlobal, targetSize, err := imgProc.ProcessImage(generateImage(12))
if err != nil {
t.Error(err)
}
if n := len(pixelsLocal); n != 336*336*3 {
t.Errorf("unexpected size of f32s: %d", n)
}
if n := len(pixelsGlobal); n > 0 {
t.Errorf("unexpected size of f32s: %d", n)
}
if !targetSize.Eq(image.Point{336, 336}) {
t.Errorf("unexpected target size: %v", targetSize)
}
}

View File

@@ -152,7 +152,7 @@ func NewTextModel(c fs.Config) (*TextModel, error) {
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),

View File

@@ -43,7 +43,7 @@ func New(c fs.Config) (model.Model, error) {
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),

View File

@@ -177,7 +177,7 @@ type TextDecoder struct {
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
for i, layer := range d.Layers {
layerType := selfAttentionLayer
if slices.Contains(opts.crossAttentionLayers, int32(i)) {
if slices.Contains(opts.crossAttentionLayers, uint32(i)) {
layerType = crossAttentionLayer
}
@@ -202,7 +202,7 @@ type TextModelOptions struct {
eps, ropeBase, ropeScale float32
ropeDim uint32
crossAttentionLayers []int32
crossAttentionLayers []uint32
}
type TextModel struct {
@@ -225,7 +225,7 @@ func newTextModel(c fs.Config) *TextModel {
var decoderLayers []TextDecoderLayer
for i := range c.Uint("block_count") {
var textDecoderLayer TextDecoderLayer
if slices.Contains(c.Ints("attention.cross_attention_layers"), int32(i)) {
if slices.Contains(c.Uints("attention.cross_attention_layers"), i) {
textDecoderLayer = &TextCrossAttentionDecoderLayer{}
} else {
textDecoderLayer = &TextSelfAttentionDecoderLayer{}
@@ -244,7 +244,7 @@ func newTextModel(c fs.Config) *TextModel {
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
crossAttentionLayers: c.Uints("attention.cross_attention_layers"),
},
}
}

View File

@@ -96,10 +96,10 @@ type VisionEncoder struct {
Layers []VisionEncoderLayer
}
func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []int32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []uint32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
var intermediateHiddenStates []ml.Tensor
for i, layer := range e.Layers {
if slices.Contains(intermediateLayersIndices, int32(i)) {
if slices.Contains(intermediateLayersIndices, uint32(i)) {
intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int{1}, hiddenState.Shape()...)...))
}
@@ -154,7 +154,7 @@ type VisionModelOptions struct {
imageSize, patchSize int
eps float32
intermediateLayersIndices []int32
intermediateLayersIndices []uint32
}
type VisionModel struct {
@@ -229,7 +229,7 @@ func newVisionModel(c fs.Config) *VisionModel {
eps: c.Float("vision.attention.layer_norm_epsilon"),
intermediateLayersIndices: c.Ints("vision.intermediate_layers_indices"),
intermediateLayersIndices: c.Uints("vision.intermediate_layers_indices"),
},
}
}

View File

@@ -4,7 +4,6 @@ import (
_ "github.com/ollama/ollama/model/models/gemma2"
_ "github.com/ollama/ollama/model/models/gemma3"
_ "github.com/ollama/ollama/model/models/llama"
_ "github.com/ollama/ollama/model/models/llama4"
_ "github.com/ollama/ollama/model/models/mistral3"
_ "github.com/ollama/ollama/model/models/mllama"
)

View File

@@ -37,7 +37,7 @@ type TextProcessor interface {
type Vocabulary struct {
Values []string
Types []int32
Types []uint32
Scores []float32
Merges []string

View File

@@ -35,9 +35,9 @@ func loadSentencePieceVocab(t *testing.T) SentencePieceModel {
sentencepiece.ModelProto_SentencePiece_CONTROL,
sentencepiece.ModelProto_SentencePiece_UNUSED,
sentencepiece.ModelProto_SentencePiece_BYTE:
v.Types = append(v.Types, int32(t))
v.Types = append(v.Types, uint32(t))
default:
tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
tt := uint32(sentencepiece.ModelProto_SentencePiece_NORMAL)
// todo parse the special tokens file
// - this will roundtrip correctly but the <start_of_turn> and
// <end_of_turn> tokens aren't processed
@@ -124,7 +124,7 @@ func TestSentencePieceModelDecodeByteTokens(t *testing.T) {
"<0xC3>",
"<0xA3>",
},
Types: []int32{
Types: []uint32{
TOKEN_TYPE_NORMAL,
TOKEN_TYPE_BYTE,
TOKEN_TYPE_BYTE,

View File

@@ -28,7 +28,7 @@ func llama(t testing.TB) BytePairEncoding {
t.Fatal(err)
}
types := make([]int32, len(vocab))
types := make([]uint32, len(vocab))
tokens := make([]string, len(vocab))
for token, id := range vocab {
tokens[id] = token

View File

@@ -64,7 +64,7 @@ func formatDuration(d time.Duration) string {
func (b *Bar) String() string {
termWidth, _, err := term.GetSize(int(os.Stderr.Fd()))
if err != nil {
termWidth = 80
termWidth = defaultTermWidth
}
var pre strings.Builder

View File

@@ -4,8 +4,16 @@ import (
"bufio"
"fmt"
"io"
"os"
"sync"
"time"
"golang.org/x/term"
)
const (
defaultTermWidth = 80
defaultTermHeight = 24
)
type State interface {
@@ -83,6 +91,11 @@ func (p *Progress) Add(key string, state State) {
}
func (p *Progress) render() {
_, termHeight, err := term.GetSize(int(os.Stderr.Fd()))
if err != nil {
termHeight = defaultTermHeight
}
p.mu.Lock()
defer p.mu.Unlock()
@@ -102,8 +115,9 @@ func (p *Progress) render() {
fmt.Fprint(p.w, "\033[1G")
// render progress lines
for i, state := range p.states {
fmt.Fprint(p.w, state.String(), "\033[K")
maxHeight := min(len(p.states), termHeight)
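// render only the most recent states that fit within the terminal height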
for i := len(p.states) - maxHeight; i < len(p.states); i++ {
fmt.Fprint(p.w, p.states[i].String(), "\033[K")
if i < len(p.states)-1 {
fmt.Fprint(p.w, "\n")
}

View File

@@ -723,9 +723,7 @@ func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
// to the GPU
/*func (s *Server) reserveWorstCaseGraph() error {
func (s *Server) reserveWorstCaseGraph() error {
ctx := s.model.Backend().NewContext()
defer ctx.Close()
@@ -768,7 +766,7 @@ func (m *multiLPath) String() string {
}
return nil
}*/
}
func (s *Server) loadModel(
ctx context.Context,
@@ -805,10 +803,10 @@ func (s *Server) loadModel(
s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
/*err = s.reserveWorstCaseGraph()
err = s.reserveWorstCaseGraph()
if err != nil {
panic(err)
}*/
}
s.status = llm.ServerStatusReady
s.ready.Done()

View File

@@ -74,6 +74,7 @@ func modelHelper(t testing.TB) model.BytePairEncoding {
t.Fatal(err)
}
types := make([]uint32, len(vocab))
tokens := make([]string, len(vocab))
for token, id := range vocab {
tokens[id] = token
@@ -85,7 +86,7 @@ func modelHelper(t testing.TB) model.BytePairEncoding {
``,
&model.Vocabulary{
Values: tokens,
Types: make([]int32, len(vocab)),
Types: types,
Merges: merges,
},
)

View File

@@ -295,7 +295,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
}
defer bin.Close()
f, _, err := ggml.Decode(bin, 1024)
f, _, err := ggml.Decode(bin, 0)
if err != nil {
return nil, err
}
@@ -457,7 +457,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
return nil, err
}
f, _, err := ggml.Decode(temp, 1024)
f, _, err := ggml.Decode(temp, 0)
if err != nil {
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
return nil, err
@@ -499,7 +499,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
var offset int64
for offset < stat.Size() {
f, n, err := ggml.Decode(blob, 1024)
f, n, err := ggml.Decode(blob, 0)
if errors.Is(err, io.EOF) {
break
} else if err != nil {

View File

@@ -75,7 +75,7 @@ func (m *Model) Capabilities() []model.Capability {
if err == nil {
defer r.Close()
f, _, err := ggml.Decode(r, 1024)
f, _, err := ggml.Decode(r, 0)
if err == nil {
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
capabilities = append(capabilities, model.CapabilityEmbedding)

View File

@@ -73,13 +73,8 @@ type statusCodeRecorder struct {
func (r *statusCodeRecorder) WriteHeader(status int) {
if r._status == 0 {
r._status = status
r.ResponseWriter.WriteHeader(status)
}
}
func (r *statusCodeRecorder) Write(b []byte) (int, error) {
r._status = r.status()
return r.ResponseWriter.Write(b)
r.ResponseWriter.WriteHeader(status)
}
var (

View File

@@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
}
defer blob.Close()
f, _, err := ggml.Decode(blob, 1024)
f, _, err := ggml.Decode(blob, 0)
if err != nil {
return nil, err
}

View File

@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Help me write tests."},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {

View File

@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
req := &LlmRequest{
ctx: c,
model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
}()
}
const (
defaultContextLength = 4096
smallGpuContextLength = 2048
)
func (s *Scheduler) processPending(ctx context.Context) {
for {
select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
gpus = s.getGpuFn()
}
if pending.origNumCtx == -1 {
if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
pending.opts.NumCtx = smallGpuContextLength
pending.origNumCtx = smallGpuContextLength
} else {
pending.opts.NumCtx = defaultContextLength
pending.origNumCtx = defaultContextLength
}
}
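// origNumCtx == -1 is taken to mean the request did not set num_ctx: a single
// non-CPU GPU with at most 4 GiB of total memory falls back to the smaller
// 2048-token default, everything else to 4096.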
if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs

View File

@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
b.req.opts.NumCtx = 4096
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return b
}