ollamarunner: Multi-modal worst case graph

We currently preallocate compute graph memory for the worst-case batch of text tokens. This adds support for doing the same for images. Note that image models are more complicated than text models in how they process their inputs, so there may be cases where this approach isn't completely generic for all models. It covers all currently supported models, though.
ollamarunner: Separate text and multimodal graphs
2025-05-12 16:35:02 -07:00 · 2025-05-12 16:26:21 -07:00 · 2025-05-12 16:26:21 -07:00
107 changed files with 4747 additions and 2737 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -103,11 +103,6 @@ jobs:
        arch: [amd64]
        preset: ['CPU']
        include:
-          - os: windows
-            arch: amd64
-            preset: 'CUDA 11'
-            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
-            cuda-version: '11.3'
          - os: windows
            arch: amd64
            preset: 'CUDA 12'
@ -324,7 +319,6 @@ jobs:
            case "$COMPONENT" in
              bin/ollama)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_v11)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v12)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -46,7 +46,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
+            container: nvidia/cuda:12.8.1-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
@ -78,7 +78,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
          - preset: ROCm
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
@ -102,7 +102,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
          }

          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
--- a/CMakePresets.json
+++ b/CMakePresets.json
@ -17,14 +17,6 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
-    {
-      "name": "CUDA 11",
-      "inherits": [ "CUDA" ],
-      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
-      }
-    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
@ -78,11 +70,6 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
-    {
-      "name": "CUDA 11",
-      "inherits": [ "CUDA" ],
-      "configurePreset": "CUDA 11"
-    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
--- a/17
+++ b/17
@ -7,14 +7,10 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2

-# CUDA v11 requires gcc v10.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
-    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
-    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
-    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
+    && dnf install -y ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache
@ -38,15 +34,6 @@ RUN --mount=type=cache,target=/root/.ccache \
        && cmake --build --parallel --preset 'CPU' \
        && cmake --install build --component CPU --strip --parallel 8

-FROM base AS cuda-11
-ARG CUDA11VERSION=11.3
-RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
-ENV PATH=/usr/local/cuda-11/bin:$PATH
-RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 11' \
-        && cmake --build --parallel --preset 'CUDA 11' \
-        && cmake --install build --component CUDA --strip --parallel 8
-
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@ -98,11 +85,9 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12

 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
--- a/Makefile.sync
+++ b/Makefile.sync
@ -15,13 +15,11 @@ help:
 	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean sync"

 .PHONY: sync
-sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml

-llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
-	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
-
-ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
-	go generate ./$(@D)
+.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@

 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/
@ -32,13 +30,12 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

 PATCHES=$(wildcard llama/patches/*.patch)
-PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))

 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(PATCHED)
+apply-patches: $(addsuffix ed, $(PATCHES))

-llama/patches/.%.patched: llama/patches/%.patch
+%.patched: %.patch
 	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi

 .PHONY: checkout
@ -60,4 +57,4 @@ format-patches: llama/patches

 .PHONE: clean
 clean: checkout
-	$(RM) llama/patches/.*.patched
+	$(RM) $(addsuffix ed, $(PATCHES))
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")

 		if opts.MultiModal {
-			fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
+			fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
 		}

 		fmt.Fprintln(os.Stderr, "")
@ -511,7 +511,7 @@ func extractFileNames(input string) []string {
 	// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
 	// and followed by more characters and a file extension
 	// This will capture non filename strings, but we'll check for file existence to remove mismatches
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
+	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
 	re := regexp.MustCompile(regexPattern)

 	return re.FindAllString(input, -1)
@ -553,7 +553,7 @@ func getImageData(filePath string) ([]byte, error) {
 	}

 	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
 	if !slices.Contains(allowedTypes, contentType) {
 		return nil, fmt.Errorf("invalid image type: %s", contentType)
 	}
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@ -12,17 +12,14 @@ func TestExtractFilenames(t *testing.T) {
 	// Unix style paths
 	input := ` some preamble 
 ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
-/unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
 	res := extractFileNames(input)
-	assert.Len(t, res, 7)
+	assert.Len(t, res, 5)
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[1], "two.jpg")
 	assert.Contains(t, res[2], "three.jpeg")
 	assert.Contains(t, res[3], "four.png")
 	assert.Contains(t, res[4], "five.JPG")
-	assert.Contains(t, res[5], "six.webp")
-	assert.Contains(t, res[6], "seven.WEBP")
 	assert.NotContains(t, res[4], '"')
 	assert.NotContains(t, res, "inbetween1")
 	assert.NotContains(t, res, "./1.svg")
@ -33,12 +30,10 @@ func TestExtractFilenames(t *testing.T) {
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
 d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 
- d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
-c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
-d:\path with\spaces\thirteen.WEBP some ending
+ d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
 `
 	res = extractFileNames(input)
-	assert.Len(t, res, 13)
+	assert.Len(t, res, 10)
 	assert.NotContains(t, res, "inbetween2")
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[0], "c:")
@ -56,12 +51,6 @@ d:\path with\spaces\thirteen.WEBP some ending
 	assert.Contains(t, res[8], "d:")
 	assert.Contains(t, res[9], "ten.PNG")
 	assert.Contains(t, res[9], "E:")
-	assert.Contains(t, res[10], "eleven.webp")
-	assert.Contains(t, res[10], "c:")
-	assert.Contains(t, res[11], "twelve.WebP")
-	assert.Contains(t, res[11], "c:")
-	assert.Contains(t, res[12], "thirteen.WEBP")
-	assert.Contains(t, res[12], "d:")
 }

 // Ensure that file paths wrapped in single quotes are removed with the quotes.
--- a/convert/convert.go
+++ b/convert/convert.go
@ -1,7 +1,6 @@
 package convert

 import (
-	"cmp"
 	"encoding/json"
 	"errors"
 	"fmt"
@ -15,12 +14,13 @@ import (
 )

 type ModelParameters struct {
-	Architectures []string `json:"architectures"`
-	VocabSize     uint32   `json:"vocab_size"`
+	Architectures []string       `json:"architectures"`
+	VocabSize     uint32         `json:"vocab_size"`
+	TextModel     TextParameters `json:"text_config"`
+}

-	TextModel struct {
-		VocabSize uint32 `json:"vocab_size"`
-	} `json:"text_config"`
+type TextParameters struct {
+	VocabSize uint32 `json:"vocab_size"`
 }

 type AdapterParameters struct {
@ -173,8 +173,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 	switch p.Architectures[0] {
 	case "LlamaForCausalLM":
 		conv = &llamaModel{}
-	case "MllamaForConditionalGeneration":
-		conv = &mllamaModel{}
 	case "Llama4ForConditionalGeneration":
 		conv = &llama4Model{}
 	case "Mistral3ForConditionalGeneration":
@ -191,8 +189,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
 		conv = &qwen2Model{}
-	case "Qwen2_5_VLForConditionalGeneration":
-		conv = &qwen25VLModel{}
 	case "BertModel":
 		conv = &bertModel{}
 	case "CohereForCausalLM":
@ -216,22 +212,24 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		return err
 	}

-	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
+	vocabSize := int(p.VocabSize)
+	if vocabSize == 0 {
+		tVocabSize := int(p.TextModel.VocabSize)
+		vocabSize = tVocabSize
+	}

 	switch {
 	case vocabSize == 0:
-		slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+		slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
 	case vocabSize > len(t.Vocabulary.Tokens):
-		slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 		for i := range vocabSize - len(t.Vocabulary.Tokens) {
 			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 		}
 	case vocabSize < len(t.Vocabulary.Tokens):
-		slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
-		p.VocabSize = uint32(len(t.Vocabulary.Tokens))
-		p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
+		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
 	default:
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}
--- a/convert/convert_mllama.go
+++ b/convert/convert_mllama.go
@ -1,160 +0,0 @@
-package convert
-
-import (
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-)
-
-type mllamaModel struct {
-	ModelParameters
-	TextModel struct {
-		llamaModel
-
-		CrossAttentionLayers []int32 `json:"cross_attention_layers"`
-	} `json:"text_config"`
-	VisionModel struct {
-		NumHiddenLayers           uint32  `json:"num_hidden_layers"`
-		NumGlobalLayers           uint32  `json:"num_global_layers"`
-		IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`
-
-		HiddenSize       uint32 `json:"hidden_size"`
-		IntermediateSize uint32 `json:"intermediate_size"`
-
-		AttentionHeads uint32 `json:"attention_heads"`
-
-		ImageSize   uint32  `json:"image_size"`
-		PatchSize   uint32  `json:"patch_size"`
-		NumChannels uint32  `json:"num_channels"`
-		MaxNumTiles uint32  `json:"max_num_tiles"`
-		NormEpsilon float32 `json:"norm_eps"`
-		RopeTheta   float32 `json:"rope.freq_base"`
-	} `json:"vision_config"`
-}
-
-func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
-	kv := m.ModelParameters.KV(t)
-	kv["general.architecture"] = "mllama"
-
-	for k, v := range m.TextModel.KV(t) {
-		if strings.HasPrefix(k, "llama.") {
-			kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
-		}
-	}
-
-	kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers
-
-	kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
-	kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
-	kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices
-
-	kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
-	kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize
-
-	kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
-	kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon
-
-	kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
-	kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize
-	kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
-	kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels
-
-	return kv
-}
-
-func (m *mllamaModel) Replacements() []string {
-	return append(
-		m.TextModel.Replacements(),
-		"language_model.", "",
-		"gate_attn", "attn_gate",
-		"gate_ffn", "ffn_gate",
-		"cross_attn.", "cross_attn_",
-		"vision_model", "v",
-		"class_embedding", "class_embd",
-		"patch_embedding", "patch_embd",
-		"gated_positional_embedding.tile_embedding", "tile_position_embd",
-		"gated_positional_embedding.embedding", "position_embd.weight",
-		"gated_positional_embedding", "position_embd",
-		"embedding.weight", "weight",
-		"pre_tile_positional_embedding", "pre_tile_position_embd",
-		"post_tile_positional_embedding", "post_tile_position_embd",
-		"layernorm_pre", "pre_ln",
-		"layernorm_post", "post_ln",
-		"global_transformer.layers", "global.blk",
-		"transformer.layers", "blk",
-		"mlp.fc1", "ffn_up",
-		"mlp.fc2", "ffn_down",
-		"multi_modal_projector", "mm.0",
-	)
-}
-
-func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
-	var text []Tensor
-	for _, t := range ts {
-		if t.Name() == "v.position_embd.gate" {
-			for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
-				tt := t.Clone()
-				tt.SetRepacker(m.repack(name))
-				out = append(out, &ggml.Tensor{
-					Name:     name,
-					Kind:     t.Kind(),
-					Shape:    t.Shape(),
-					WriterTo: tt,
-				})
-			}
-		} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
-			t.SetRepacker(m.repack(t.Name()))
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-		} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-		} else {
-			text = append(text, t)
-		}
-	}
-
-	return append(out, m.TextModel.Tensors(text)...)
-}
-
-func (m *mllamaModel) repack(name string) Repacker {
-	return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
-		dims := make([]int, len(shape))
-		for i, dim := range shape {
-			dims[i] = int(dim)
-		}
-
-		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-
-		t, err = tensor.Tanh(t)
-		if err != nil {
-			return nil, err
-		}
-
-		if name == "v.position_embd.gate" {
-			t, err = tensor.Sub(float32(1), t)
-			if err != nil {
-				return nil, err
-			}
-		}
-
-		t = tensor.Materialize(t)
-		// flatten tensor so it can be return as a vector
-		if err := t.Reshape(t.Shape().TotalSize()); err != nil {
-			return nil, err
-		}
-
-		return native.VectorF32(t.(*tensor.Dense))
-	}
-}
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@ -15,7 +15,6 @@ type qwen2Model struct {
 		Type                          string     `json:"type"`
 		Factor                        ropeFactor `json:"factor"`
 		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
-		MropeSection                  []int32    `json:"mrope_section"`
 	} `json:"rope_scaling"`
 	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@ -40,8 +39,6 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	case "yarn":
 		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
 		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
-	case "mrope", "default":
-		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
 	default:
 		panic("unknown rope scaling type")
 	}
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@ -1,102 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-)
-
-type qwen25VLModel struct {
-	qwen2Model
-
-	VisionModel struct {
-		Depth               uint32  `json:"depth"`
-		HiddenSize          uint32  `json:"hidden_size"`
-		NumHeads            uint32  `json:"num_heads"`
-		InChannels          uint32  `json:"in_chans"`
-		PatchSize           uint32  `json:"patch_size"`
-		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
-		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
-		WindowSize          uint32  `json:"window_size"`
-		RMSNormEps          float32 `json:"layer_norm_epsilon"`
-		RopeTheta           float32 `json:"rope_theta"`
-		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
-		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
-	} `json:"vision_config"`
-}
-
-var _ ModelConverter = (*qwen25VLModel)(nil)
-
-func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
-	kv := q.ModelParameters.KV(t)
-	kv["general.architecture"] = "qwen25vl"
-
-	for k, v := range q.qwen2Model.KV(t) {
-		if strings.HasPrefix(k, "qwen2.") {
-			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
-		}
-	}
-
-	if q.VisionModel.FullAttentionBlocks == nil {
-		kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
-	}
-
-	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
-	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
-	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
-	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
-	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
-	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
-	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
-	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
-	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
-	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
-	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
-	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
-
-	return kv
-}
-
-func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
-
-	for _, t := range ts {
-		if strings.Contains(t.Name(), "patch_embed.proj") {
-			for t := range splitDim(t, 2,
-				strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
-				strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
-			) {
-				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
-				out = append(out, t)
-			}
-		} else if strings.Contains(t.Name(), "attn.qkv") {
-			out = append(out, slices.Collect(splitDim(t, 0,
-				strings.NewReplacer("attn.qkv", "attn_q"),
-				strings.NewReplacer("attn.qkv", "attn_k"),
-				strings.NewReplacer("attn.qkv", "attn_v"),
-			))...)
-		} else {
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-		}
-	}
-
-	return out
-}
-
-func (p *qwen25VLModel) Replacements() []string {
-	return append(
-		p.qwen2Model.Replacements(),
-		"visual", "v",
-		"blocks", "blk",
-		"attn.proj", "attn_out",
-		"norm1", "ln1",
-		"norm2", "ln2",
-	)
-}
--- a/convert/reader.go
+++ b/convert/reader.go
@ -38,10 +38,7 @@ const (
 func (t tensorBase) Kind() uint32 {
 	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
 		t.name == "token_types.weight" ||
-		t.name == "v.positional_embedding_vlm" ||
-		t.name == "v.tile_position_embd.weight" ||
-		t.name == "v.pre_tile_position_embd.weight" ||
-		t.name == "v.post_tile_position_embd.weight" {
+		t.name == "v.positional_embedding_vlm" {
 		// these tensors are always F32
 		return 0
 	}
--- a/convert/tensor.go
+++ b/convert/tensor.go
@ -1,56 +0,0 @@
-package convert
-
-import (
-	"iter"
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-)
-
-// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
-// is split evenly based on the number of replacers provided.
-func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
-	return func(yield func(*ggml.Tensor) bool) {
-		for i, replacer := range replacers {
-			shape := slices.Clone(t.Shape())
-			shape[dim] = shape[dim] / uint64(len(replacers))
-
-			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
-			slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
-
-			tt := t.Clone()
-			tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
-				dims := make([]int, len(shape))
-				for i := range shape {
-					dims[i] = int(shape[i])
-				}
-
-				var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-				t, err := t.Slice(slice...)
-				if err != nil {
-					return nil, err
-				}
-
-				t = tensor.Materialize(t)
-				// flatten tensor so it can be written as a vector
-				if err := t.Reshape(t.Shape().TotalSize()); err != nil {
-					return nil, err
-				}
-
-				return native.VectorF32(t.(*tensor.Dense))
-			})
-
-			if !yield(&ggml.Tensor{
-				Name:     replacer.Replace(t.Name()),
-				Kind:     t.Kind(),
-				Shape:    shape,
-				WriterTo: tt,
-			}) {
-				break
-			}
-		}
-	}
-}
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@ -3,6 +3,7 @@
 package discover

 import (
+	"fmt"
 	"log/slog"
 	"os"
 	"regexp"
@ -59,6 +60,8 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {

 	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
 	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
+		// The detected driver is older than Feb 2023
+		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
 		return "v11"
 	}
 	return "v12"
--- a/discover/path.go
+++ b/discover/path.go
@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v11', 'cuda_v12', 'rocm', etc.
+// 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
--- a/docs/gpu.md
+++ b/docs/gpu.md
@ -1,6 +1,6 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+.
+Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.

 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
 In the server log, you will see a message that looks something like this (varies from release to release):

 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
 ```

 **Experimental LLM Library Override**
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@ -6,7 +6,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"math"
 	"slices"
 	"strings"

@ -126,8 +125,6 @@ func (kv KV) OllamaEngineRequired() bool {
 		"gemma3",
 		"mistral3",
 		"llama4",
-		"mllama",
-		"qwen25vl",
 	}, kv.Architecture())
 }

@ -651,29 +648,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 		graphSize = 4 * (imageSize*imageSize*numChannels +
 			embeddingLength*patchSize +
 			numPatches*numPatches*headCount)
-	case "qwen25vl":
-		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
-		mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
-		temporalPatchSize := uint64(2)
-
-		// Calculate max possible patches based on max_pixels
-		maxHeight := uint64(math.Sqrt(float64(maxPixels)))
-		maxWidth := maxPixels / maxHeight
-		maxGridHeight := maxHeight / patchSize
-		maxGridWidth := maxWidth / patchSize
-		// Account for merged patches (2x2 grid)
-		numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
-
-		// Calculate graph size based on typical operations in ProcessImage and createPatches
-		graphSize = 4 * (maxPixels*numChannels + // Original image storage
-			// Normalized pixels
-			maxPixels*numChannels +
-			// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
-			numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
-			// Self-attention calculations (similar to other architectures)
-			numPatches*numPatches*headCount +
-			// Additional buffer for processing
-			embeddingLength*numPatches)
 	case "llama4":
 		// vision graph is computed independently in the same schedule
 		// and is negligible compared to the worst case text graph
--- a/llama/llama.cpp/include/llama.h
+++ b/llama/llama.cpp/include/llama.h
@ -258,6 +258,7 @@ extern "C" {

        llama_token  *  token;
        float        *  embd;
+        int32_t         n_embd;
        llama_pos    *  pos;
        int32_t      *  n_seq_id;
        llama_seq_id ** seq_id;
@ -365,6 +366,7 @@ extern "C" {
        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
        bool no_perf;     // whether to measure performance timings
        bool op_offload;  // whether to offload host tensor operations to device
+        bool cross_attn;  // whether to use cross attention
    };

    // model quantization parameters
@ -464,6 +466,10 @@ extern "C" {
            struct llama_context_params   params),
            "use llama_init_from_model instead");

+    // TODO (jmorganca): this should most likely be passed in as part of a batch
+    // and not set on the context for all batches.
+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
+
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@ -6,6 +6,7 @@

 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA,            "llama"            },
+    { LLM_ARCH_MLLAMA,           "mllama"           },
    { LLM_ARCH_LLAMA4,           "llama4"           },
    { LLM_ARCH_DECI,             "deci"             },
    { LLM_ARCH_FALCON,           "falcon"           },
@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,        "%s.attention.block_skip_connection"        },
+    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,       "%s.attention.cross_attention_layers"       },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },

@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
        },
    },
+    {
+        LLM_ARCH_MLLAMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_CROSS_ATTN_K_NORM,    "blk.%d.cross_attn_k_norm" },
+            { LLM_TENSOR_CROSS_ATTN_K_PROJ,    "blk.%d.cross_attn_k_proj" },
+            { LLM_TENSOR_CROSS_ATTN_O_PROJ,    "blk.%d.cross_attn_o_proj" },
+            { LLM_TENSOR_CROSS_ATTN_Q_NORM,    "blk.%d.cross_attn_q_norm" },
+            { LLM_TENSOR_CROSS_ATTN_Q_PROJ,    "blk.%d.cross_attn_q_proj" },
+            { LLM_TENSOR_CROSS_ATTN_V_PROJ,    "blk.%d.cross_attn_v_proj" },
+            { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
+            { LLM_TENSOR_CROSS_ATTN_MLP_GATE,  "blk.%d.cross_attn_mlp_gate" },
+        },
+    },
    {
        LLM_ARCH_DECI,
        {
@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    // this tensor is loaded for T5, but never used
    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
    {LLM_TENSOR_BSKCN_TV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_O_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_Q_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_Q_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_V_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_ATTN_GATE,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_MLP_GATE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CONV1D,                     {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
    {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@ -11,6 +11,7 @@
 enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_LLAMA4,
+    LLM_ARCH_MLLAMA,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
@ -148,6 +149,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

@ -349,6 +351,14 @@ enum llm_tensor {
    LLM_TENSOR_CLS,
    LLM_TENSOR_CLS_OUT,
    LLM_TENSOR_BSKCN_TV,
+    LLM_TENSOR_CROSS_ATTN_K_NORM,
+    LLM_TENSOR_CROSS_ATTN_K_PROJ,
+    LLM_TENSOR_CROSS_ATTN_O_PROJ,
+    LLM_TENSOR_CROSS_ATTN_Q_NORM,
+    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+    LLM_TENSOR_CROSS_ATTN_V_PROJ,
+    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
    LLM_TENSOR_CONV1D,
    LLM_TENSOR_CONVNEXT_DW,
    LLM_TENSOR_CONVNEXT_NORM,
--- a/llama/llama.cpp/src/llama-batch.cpp
+++ b/llama/llama.cpp/src/llama-batch.cpp
@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one(
        /*n_tokens       =*/ n_tokens,
        /*tokens         =*/ tokens,
        /*embd           =*/ nullptr,
+        /*n_embd         =*/ 0,
        /*pos            =*/ nullptr,
        /*n_seq_id       =*/ nullptr,
        /*seq_id         =*/ nullptr,
@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
        /*n_tokens       =*/ 0,
        /*tokens         =*/ nullptr,
        /*embd           =*/ nullptr,
+        /*n_embd         =*/ 0,
        /*pos            =*/ nullptr,
        /*n_seq_id       =*/ nullptr,
        /*seq_id         =*/ nullptr,
@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

    if (embd) {
        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+        batch.n_embd = embd;
    } else {
        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
    }
--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
        }

-        return logits + j*model.vocab.n_tokens();
+        return logits + j*model.hparams.n_vocab;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) {
    cparams.warmup = value;
 }

+// Stores the cross-attention flag in the context parameters (cparams.cross_attn).
+void llama_context::set_cross_attn(bool value) {
+    cparams.cross_attn = value;
+}
+
 void llama_context::set_adapter_lora(
            llama_adapter_lora * adapter,
            float scale) {
@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) {

    const int64_t n_embd = hparams.n_embd;

-    llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+    llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);

    const llama_ubatch ubatch = sbatch.split_simple(n_tokens);

@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) {

    const llama_batch & batch = batch_allocr.batch;

-    const auto & vocab   = model.vocab;
    const auto & hparams = model.hparams;

-    const int32_t n_vocab = vocab.n_tokens();
+    const int32_t n_vocab = hparams.n_vocab;

    const int64_t n_tokens_all = batch.n_tokens;
    const int64_t n_embd       = hparams.n_embd;
@ -947,12 +950,9 @@ int llama_context::decode(llama_batch & inp_batch) {

        // find KV slot
        if (!kv_self->find_slot(ubatch)) {
-            kv_self->defrag_sched(-1.0f);
-            kv_self->update(*this);
-            if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-                return 1;
-            }
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+
+            return 1;
        }

        ggml_backend_sched_reset(sched.get());
@ -1090,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
        // make the outputs have the same order they had in the user-provided batch
        // note: this is mostly relevant for recurrent models atm
        if (!sorted_output) {
-            const uint32_t n_vocab = model.vocab.n_tokens();
+            const uint32_t n_vocab = model.hparams.n_vocab;
            const uint32_t n_embd  = model.hparams.n_embd;

            GGML_ASSERT((size_t) n_outputs == out_ids.size());
@ -1145,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) {

 int32_t llama_context::output_reserve(int32_t n_outputs) {
    const auto & hparams = model.hparams;
-    const auto & vocab   = model.vocab;

    const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());

    const auto n_batch = cparams.n_batch;
-    const auto n_vocab = vocab.n_tokens();
+    const auto n_vocab = hparams.n_vocab;
    const auto n_embd  = hparams.n_embd;

    // TODO: use a per-batch flag for logits presence instead
@ -1685,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
    {
        LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);

-        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
+        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);

        io.write(&logits_size, sizeof(logits_size));

@ -1968,12 +1967,9 @@ void llama_context::opt_epoch_iter(

            // TODO: not sure if this is needed
            if (!kv_self->find_slot(ubatch)) {
-                kv_self->defrag_sched(-1.0f);
-                kv_self->update(*this);
-                if (!kv_self->find_slot(ubatch)) {
-                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-                    GGML_ABORT("TODO: handle this error");
-                }
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+
+                GGML_ABORT("TODO: handle this error");
            }

            auto * gf = graph_init();
@ -2097,6 +2093,7 @@ llama_context_params llama_context_default_params() {
        /*.flash_attn                  =*/ false,
        /*.no_perf                     =*/ true,
        /*.op_offload                  =*/ true,
+        /*.cross_attn                  =*/ false,
    };

    return result;
@ -2222,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
    ctx->set_warmup(warmup);
 }

+// Public entry point: forwards the flag to llama_context::set_cross_attn() on ctx.
+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+    ctx->set_cross_attn(cross_attention);
+}
+
 void llama_synchronize(llama_context * ctx) {
    ctx->synchronize();
 }
--- a/llama/llama.cpp/src/llama-context.h
+++ b/llama/llama.cpp/src/llama-context.h
@ -72,6 +72,7 @@ struct llama_context {
    void set_embeddings (bool value);
    void set_causal_attn(bool value);
    void set_warmup(bool value);
+    void set_cross_attn(bool value);

    void set_adapter_lora(
            llama_adapter_lora * adapter,
--- a/llama/llama.cpp/src/llama-cparams.h
+++ b/llama/llama.cpp/src/llama-cparams.h
@ -31,6 +31,7 @@ struct llama_cparams {
    bool no_perf;
    bool warmup;
    bool op_offload;
+    bool cross_attn;

    enum llama_pooling_type pooling_type;

--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
    }
 }

+// Uploads the batch embeddings into the cross-attention state tensor.
+// A batch without embeddings leaves the tensor untouched.
+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
+    if (!ubatch->embd) {
+        return;
+    }
+
+    const size_t n_bytes = ggml_nbytes(cross_attn_state);
+    ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, n_bytes);
+}
+
 //
 // llm_graph_context
 //
@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
    return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }

+// Creates the F32 [n_embd, 1601, 4] cross-attention state tensor, marks it as a
+// graph input, registers its input object with the graph result and returns it.
+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
+    const int64_t n_embd = hparams.n_embd;
+
+    ggml_tensor * state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
+    ggml_set_input(state);
+
+    cb(state, "inp_cross_attn_state", -1);
+
+    // hand ownership of the input object to the graph result
+    auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
+    inp->cross_attn_state = state;
+    res->add_input(std::move(inp));
+
+    return state;
+}
+
 ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_cross * inp,
        ggml_cgraph * gf,
--- a/llama/llama.cpp/src/llama-graph.h
+++ b/llama/llama.cpp/src/llama-graph.h
@ -87,6 +87,7 @@ public:

    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+    ggml_tensor * cross_attn_state; // F32 [n_embd, 1601, 4] (assumed to match build_inp_cross_attn_state(); confirm)
 };

 class llm_graph_input_pos : public llm_graph_input_i {
@ -284,6 +285,16 @@ public:
    const llama_cross * cross = nullptr;
 };

+// Graph input that owns the pointer to the cross-attention state tensor.
+// set_input() copies ubatch->embd into the tensor when the batch provides embeddings.
+class llm_graph_input_cross_attn_state : public llm_graph_input_i {
+public:
+    llm_graph_input_cross_attn_state()          = default;
+    virtual ~llm_graph_input_cross_attn_state() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cross_attn_state; // F32 [n_embd, 1601, 4]
+};
+
 //
 // llm_graph_result
 //
@ -495,6 +506,7 @@ struct llm_graph_context {
    ggml_tensor * build_inp_cls() const;
    ggml_tensor * build_inp_s_copy() const;
    ggml_tensor * build_inp_s_mask() const;
+    ggml_tensor * build_inp_cross_attn_state() const;

    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
--- a/llama/llama.cpp/src/llama-hparams.cpp
+++ b/llama/llama.cpp/src/llama-hparams.cpp
@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {

    GGML_ABORT("fatal error");
 }
+
+// Returns true when layer index il appears in the cross_attn_layers list.
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+    const auto first = cross_attn_layers.begin();
+    const auto last  = cross_attn_layers.end();
+
+    return std::find(first, last, il) != last;
+}
--- a/llama/llama.cpp/src/llama-hparams.h
+++ b/llama/llama.cpp/src/llama-hparams.h
@ -2,6 +2,8 @@

 #include "llama.h"

+#include <algorithm>
+
 #include <array>

 // bump if necessary
@ -42,6 +44,7 @@ struct llama_hparams {
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;
+    uint32_t n_vocab = 0;

    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
    uint32_t n_embd_head_k_mla = 0;
@ -56,6 +59,7 @@ struct llama_hparams {
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

    std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+    std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;

    uint32_t n_layer_dense_lead = 0;
    uint32_t n_lora_q           = 0;
@ -159,6 +163,9 @@ struct llama_hparams {
    // Block skip connection
    bool n_bskcn(uint32_t n, uint32_t il) const;

+    // cross attention layers
+    bool cross_attention_layers(uint32_t il) const;
+
    bool is_swa(uint32_t il) const;
 };

--- a/llama/llama.cpp/src/llama-kv-cache.cpp
+++ b/llama/llama.cpp/src/llama-kv-cache.cpp
@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified(
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_tensor * k, *v;
+
+        // for cross attention layers
+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+            k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+            v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+        } else {
+            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        }
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        k_l.push_back(k);
@ -451,7 +459,7 @@ void llama_kv_cache_unified::set_full() {
 llama_sbatch llama_kv_cache_unified::sbatch_init(
        const llama_batch & batch,
        bool logits_all) {
-    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+    return llama_sbatch(batch, batch.n_embd, true, logits_all);
 }

 llama_ubatch llama_kv_cache_unified::ubatch_next(
--- a/llama/llama.cpp/src/llama-model-loader.cpp
+++ b/llama/llama.cpp/src/llama-model-loader.cpp
@ -315,6 +315,8 @@ namespace GGUFMeta {
        return true;
    }

+    template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
+
    template<typename T, size_t N_MAX>
    bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
        const int kid = gguf_find_key(meta.get(), key.c_str());
--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE,        hparams.n_vocab,       false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
+    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;
@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

-        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.use_kq_norm = false;
                }
            } break;
+        case LLM_ARCH_MLLAMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_11B; break;
+                    case 100: type = LLM_TYPE_90B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        const int64_t n_embd_head_v = hparams.n_embd_head_v;
        const int64_t n_ff          = hparams.n_ff();
        const int64_t n_embd_gqa    = n_embd_v_gqa;
-        const int64_t n_vocab       = vocab.n_tokens();
+        const int64_t n_vocab       = hparams.n_vocab;
        const int64_t n_token_types = vocab.n_token_types();
        const int64_t n_rot         = hparams.n_rot;
        const int64_t n_expert      = hparams.n_expert;
@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
                    }
                } break;
+            case LLM_ARCH_MLLAMA:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+                    // output
+                    {
+                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        // if output is NULL, init from the input tok embed
+                        if (output == NULL) {
+                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        if (hparams.cross_attention_layers(i)) {
+                            layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM,   "weight", i), {128}, 0);
+                            layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ,   "weight", i), {n_embd, 1024}, 0);
+                            layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ,   "weight", i), {n_embd, n_embd}, 0);
+                            layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
+                            layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
+                            layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
+                            layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
+                            layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        } else {
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        }
+                    }
+                } break;
            case LLM_ARCH_DECI:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context {
    }
 };

+// Graph builder for LLM_ARCH_MLLAMA.
+// Layers for which hparams.cross_attention_layers(il) is true are built as
+// cross-attention layers over the "inp_cross_attn_state" input; all other
+// layers are built as RoPE self-attention layers. Both kinds are followed by
+// a SiLU gated feed-forward block. The final norm and lm_head outputs are
+// stored in res->t_embd and res->t_logits.
+struct llm_build_mllama: public llm_graph_context {
+    llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+        ggml_tensor * inpCAS;
+
+        inpL = build_inp_embd(model.tok_embd);
+        inpCAS = build_inp_cross_attn_state();
+
+          // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+        // the per-layer cache tensors (k_l / v_l) are accessed directly by the cross-attention layers below
+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            if (hparams.cross_attention_layers(il)) {
+                // no embeddings in this batch and cross attention is disabled on the context:
+                // skip the layer entirely, inpL is passed through unchanged
+                if (!ubatch.embd && !cparams.cross_attn) {
+                    continue;
+                }
+
+                // cross attention layer
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
+                cb(Qcur, "Qcur", il);
+
+                Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur, * Vcur;
+                if (ubatch.embd) {
+                    // the batch carries embeddings: project the cross-attention state to K/V
+                    // and copy the results into this layer's cache tensors
+                    // note: 6404 == 1601 * 4, the trailing dims of the tensor created by build_inp_cross_attn_state()
+                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Kcur, "Kcur", il);
+
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
+
+                    Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
+                    cb(Vcur, "Vcur", il);
+
+                    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
+                    cb(Vcur, "Vcur", il);
+
+                    Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
+                    cb(Vcur, "Vcur", il);
+
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
+                } else {
+                    // no embeddings in this batch: read K/V from this layer's cache tensors
+                    Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
+                    cb(Kcur, "Kcur (view)", il);
+
+                    Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
+                    cb(Vcur, "Vcur (view)", il);
+                }
+
+                struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
+                cb(kq, "kq", il);
+
+                // TODO: apply causal masks
+                struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+                cb(kq_soft_max, "kq_soft_max", il);
+
+                Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
+                cb(Vcur, "Vcur", il);
+
+                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
+                cb(kqv, "kqv", il);
+
+                struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
+
+                cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
+
+                cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
+                cb(cur, "cur", il);
+
+                // scale the attention output by tanh(cross_attn_attn_gate) before the residual add
+                // TODO: do this in place once?
+                cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+
+                // feed-forward network
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+                // scale the FFN output by tanh(cross_attn_mlp_gate), then add the residual
+                // TODO: do this inplace once?
+                cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
+                cb(cur, "ffn_out", il);
+
+                cur = build_cvec(cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            } else {
+                // self attention layer
+
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+                // NOTE(review): this output trimming exists only in the self-attention branch,
+                // so it is not applied if the last layer is a cross-attention layer
+                if (il == n_layer - 1) {
+                    // skip computing output for unused tokens
+                    struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                    n_tokens = n_outputs;
+                    cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                    inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                }
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+
+                // feed-forward network
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_inp);
+                cb(cur, "ffn_out", il);
+
+                cur = build_cvec(cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            }
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_deci : public llm_graph_context {
    llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph(
            {
                llm = std::make_unique<llm_build_llama>(*this, params, gf);
            } break;
+        case LLM_ARCH_MLLAMA:
+            {
+                llm = std::make_unique<llm_build_mllama>(*this, params, gf);
+            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params, gf);
@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLAMA4:
+        case LLM_ARCH_MLLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
--- a/llama/llama.cpp/src/llama-model.h
+++ b/llama/llama.cpp/src/llama-model.h
@ -11,6 +11,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <stdexcept>

 struct llama_cparams;
 struct llama_ubatch;
@ -74,6 +75,7 @@ enum llm_type {
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
+    LLM_TYPE_90B,
    LLM_TYPE_236B,
    LLM_TYPE_290B,
    LLM_TYPE_314B,
@ -318,6 +320,16 @@ struct llama_layer {

    struct ggml_tensor * bskcn_tv = nullptr;

+    // cross attention
+    struct ggml_tensor * cross_attn_k_norm = nullptr;
+    struct ggml_tensor * cross_attn_k_proj = nullptr;
+    struct ggml_tensor * cross_attn_o_proj = nullptr;
+    struct ggml_tensor * cross_attn_q_norm = nullptr;
+    struct ggml_tensor * cross_attn_q_proj = nullptr;
+    struct ggml_tensor * cross_attn_v_proj = nullptr;
+    struct ggml_tensor * cross_attn_attn_gate = nullptr;
+    struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        if (llama_model_has_encoder(&model)) {
            n_attn_layer *= 3;
        }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        if (qs.n_attention_wv != n_attn_layer) {
+            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+        }
    }

    size_t total_size_org = 0;
--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@ -1469,6 +1469,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
                const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+                GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
                const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
                const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
--- a/llama/llama.cpp/tools/mtmd/llava.cpp
+++ b/llama/llama.cpp/tools/mtmd/llava.cpp
@ -462,7 +462,7 @@ struct llava_embd_batch {
    std::vector<llama_seq_id *> seq_ids;
    std::vector<int8_t>         logits;
    llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
        pos     .resize(n_tokens);
        n_seq_id.resize(n_tokens);
        seq_ids .resize(n_tokens + 1);
@ -474,6 +474,7 @@ struct llava_embd_batch {
            /*n_tokens       =*/ n_tokens,
            /*tokens         =*/ nullptr,
            /*embd           =*/ embd,
+            /*n_embd         =*/ n_embd,
            /*pos            =*/ pos.data(),
            /*n_seq_id       =*/ n_seq_id.data(),
            /*seq_id         =*/ seq_ids.data(),
@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
            n_eval = n_batch;
        }
        float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
        if (llama_decode(ctx_llama, llava_batch.batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
--- a/llama/llama.go
+++ b/llama/llama.go
@ -17,6 +17,7 @@ package llama
 #include "llava.h"
 #include "gguf.h"

+#include "mllama.h"
 #include "sampling_ext.h"

 extern bool llamaProgressCallback(float progress, void *user_data);
@ -509,6 +510,63 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
 	return embed, nil
 }

+// MllamaContext wraps the C mllama_ctx handle returned by mllama_model_load.
+type MllamaContext struct {
+	c *C.struct_mllama_ctx
+}
+
+// NewMllamaContext loads the mllama vision model stored at modelPath and
+// verifies that its embedding size (mllama_n_embd) equals the embedding size
+// of llamaContext's model. It returns an error if the model cannot be loaded
+// or the sizes differ. On success the returned context owns the C handle.
+func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
+	mp := C.CString(modelPath)
+	defer C.free(unsafe.Pointer(mp))
+	c := C.mllama_model_load(mp, 1)
+	if c == nil {
+		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
+	}
+
+	projEmbedSize := int(C.mllama_n_embd(c))
+	modelEmbedSize := llamaContext.Model().NEmbd()
+	if projEmbedSize != modelEmbedSize {
+		// The handle is not handed to the caller on this path, so it has to be
+		// released here or it would leak.
+		C.mllama_free(c)
+		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
+	}
+
+	return &MllamaContext{c: c}, nil
+}
+
+// Free passes the wrapped C handle to mllama_free.
+func (m *MllamaContext) Free() {
+	C.mllama_free(m.c)
+}
+
+// NewEmbed decodes the image bytes in data and runs the mllama vision encoder
+// on them, using llamaContext's thread count. The result is a slice holding a
+// single row of EmbedSize float32 values. An error is returned when data is
+// empty, the embedding size is not positive, the image cannot be loaded, or
+// encoding fails.
+func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
+	// &data[0] below would panic on an empty slice.
+	if len(data) == 0 {
+		return nil, errors.New("mllama image data is empty")
+	}
+
+	// Validate the size before allocating: make panics on a negative length
+	// and &rows[0] panics on an empty slice.
+	size := m.EmbedSize(llamaContext)
+	if size <= 0 {
+		return nil, fmt.Errorf("invalid mllama embedding size (%d)", size)
+	}
+
+	img := C.mllama_image_init()
+	defer C.mllama_image_free(img)
+
+	// NOTE(review): 560, 560, 3, 4 are presumably width, height, channels and
+	// maximum tile count; confirm against mllama.h.
+	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
+	if !ok {
+		return nil, errors.New("unable to load mllama image data")
+	}
+
+	rows := make([]float32, size)
+	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
+	if !ok {
+		return nil, errors.New("unable to make mllama embedding from image")
+	}
+
+	return [][]float32{rows}, nil
+}
+
+// EmbedSize returns mllama_n_positions times mllama_n_tiles for this context,
+// multiplied by the embedding size of llamaContext's model.
+func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
+	tokens := C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c)
+
+	return int(tokens) * llamaContext.Model().NEmbd()
+}
+
+// SetCrossAttention forwards state to llama_set_cross_attention on the
+// underlying llama context.
+func (c *Context) SetCrossAttention(state bool) {
+	C.llama_set_cross_attention(c.c, C.bool(state))
+}
+
 func (c *Context) Synchronize() {
 	C.llama_synchronize(c.c)
 }
--- a/llama/mllama.cpp
+++ b/llama/mllama.cpp
@ -0,0 +1,887 @@
+// NOTE: This is modified from clip.cpp for Mllama only
+#include "mllama.h"
+
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#include "ggml.h"
+#include "gguf.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <cstdarg>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <stdexcept>
+#include <vector>
+
+// Throws std::runtime_error carrying the stringified condition when `x` evaluates to false.
+#define REQUIRE(x)                                           \
+    do {                                                     \
+        if (!(x)) {                                          \
+            throw std::runtime_error("REQUIRE failed: " #x); \
+        }                                                    \
+    } while (0)
+
+// Writes a printf-style message to stderr, prefixed with the calling function's name and followed by a newline.
+#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
+#include <windows.h>
+#if __GLIBCXX__
+#include <cstdio>
+#include <ext/stdio_filebuf.h>
+#include <fcntl.h>
+#endif
+#endif
+
+// Image input for the mllama vision encoder.
+// Every scalar member has a default so a default-constructed image never
+// carries indeterminate values.
+struct mllama_image {
+    int width = 0;
+    int height = 0;
+
+    // mllama_image_build_graph uses these when they are positive and falls
+    // back to 3 channels / 4 tiles otherwise
+    int num_channels = 3;
+    int num_tiles = 4;
+
+    int aspect_ratio_id = 0;
+
+    std::vector<float> data;
+};
+
+// printf-style formatting into a std::string.
+//
+// The output is measured with a first vsnprintf pass and then written into a
+// buffer of exactly that size, so there is no fixed length limit and the
+// returned string contains only the formatted characters (no trailing NUL
+// padding). Throws std::runtime_error (via REQUIRE) if vsnprintf reports an
+// error.
+static std::string format(const char *fmt, ...) {
+    va_list args;
+    va_list args_copy;
+    va_start(args, fmt);
+    // a va_list may only be traversed once, so keep a copy for the second pass
+    va_copy(args_copy, args);
+
+    const int n = vsnprintf(nullptr, 0, fmt, args);
+    va_end(args);
+
+    int written = -1;
+    std::vector<char> b;
+    if (n >= 0) {
+        b.resize(static_cast<size_t>(n) + 1);
+        written = vsnprintf(b.data(), b.size(), fmt, args_copy);
+    }
+    // release the copy before REQUIRE can throw
+    va_end(args_copy);
+
+    REQUIRE(n >= 0 && written == n);
+    return std::string(b.data(), static_cast<size_t>(n));
+}
+
+//
+// utilities to get data from a gguf file
+//
+
+// Returns the index of `key` in the GGUF metadata.
+// Throws std::runtime_error (via REQUIRE) when gguf_find_key returns -1.
+static int get_key_index(const gguf_context *ctx, const char *key) {
+    int key_index = gguf_find_key(ctx, key);
+    REQUIRE(key_index != -1);
+    return key_index;
+}
+
+// Reads the array stored under `key` as a vector of 32-bit values.
+// Throws std::runtime_error (via REQUIRE) when the key is missing or when the
+// array elements are not 32-bit integers, since the data is copied out as
+// uint32_t without conversion.
+static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
+    const int i = get_key_index(ctx, key.c_str());
+
+    // reading any other element width through a uint32_t pointer would yield
+    // garbage or run past the end of the array data
+    const gguf_type type = gguf_get_arr_type(ctx, i);
+    REQUIRE(type == GGUF_TYPE_UINT32 || type == GGUF_TYPE_INT32);
+
+    const size_t n = gguf_get_arr_n(ctx, i);
+    const uint32_t *data = (const uint32_t *)gguf_get_arr_data(ctx, i);
+
+    return std::vector<uint32_t>(data, data + n);
+}
+
+// Looks up `key` with get_key_index and returns its value via gguf_get_val_u32.
+static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
+    const int idx = get_key_index(ctx, key.c_str());
+    return gguf_get_val_u32(ctx, idx);
+}
+
+// Looks up `key` with get_key_index and returns its value via gguf_get_val_f32.
+static float get_f32(const gguf_context *ctx, const std::string &key) {
+    const int idx = get_key_index(ctx, key.c_str());
+    return gguf_get_val_f32(ctx, idx);
+}
+
+// Returns the ggml type name for `ftype`.
+// NOTE(review): the value is cast straight to ggml_type; confirm that callers
+// pass a value that uses ggml_type numbering.
+static std::string get_ftype(int ftype) {
+    const ggml_type type = static_cast<ggml_type>(ftype);
+    return std::string(ggml_type_name(type));
+}
+
+//
+// mllama layers
+//
+
+// Hyperparameters of the mllama vision encoder. mllama_model_load fills them
+// from the "mllama.vision.*" GGUF keys.
+struct mllama_hparams {
+    uint32_t image_size;
+    uint32_t patch_size;
+    uint32_t hidden_size;
+    uint32_t n_intermediate;
+    uint32_t projection_dim;
+    uint32_t n_head;
+    uint32_t n_layer;
+    uint32_t n_global_layer;
+    uint32_t n_tiles;
+
+    float eps;
+
+    // indexed by encoder layer; when true, mllama_image_build_graph keeps the
+    // input of that layer as an intermediate embedding
+    std::vector<bool> intermediate_layers;
+};
+
+// Tensors of one vision encoder block, looked up by name in
+// mllama_layers_load. The attention biases (k_b, q_b, v_b, o_b) and the two
+// gates (attn_gate, ff_gate) are loaded as optional and are checked against
+// nullptr before use.
+struct mllama_layer {
+    // attention
+    struct ggml_tensor *k_w;
+    struct ggml_tensor *k_b;
+    struct ggml_tensor *q_w;
+    struct ggml_tensor *q_b;
+    struct ggml_tensor *v_w;
+    struct ggml_tensor *v_b;
+
+    struct ggml_tensor *o_w;
+    struct ggml_tensor *o_b;
+
+    // multiplied into the attention output when present
+    struct ggml_tensor *attn_gate;
+
+    // layernorm 1
+    struct ggml_tensor *ln_1_w;
+    struct ggml_tensor *ln_1_b;
+
+    // ff
+    struct ggml_tensor *ff_i_w;
+    struct ggml_tensor *ff_i_b;
+
+    struct ggml_tensor *ff_o_w;
+    struct ggml_tensor *ff_o_b;
+
+    // multiplied into the feed-forward output when present
+    struct ggml_tensor *ff_gate;
+
+    // layernorm 2
+    struct ggml_tensor *ln_2_w;
+    struct ggml_tensor *ln_2_b;
+};
+
+// Weights of the mllama vision model: hyperparameters, input embeddings, the
+// two encoder layer stacks and the final projector. mllama_image_build_graph
+// treats several of the embedding and layernorm tensors as optional by
+// checking them against nullptr.
+struct mllama_vision_model {
+    struct mllama_hparams hparams;
+
+    // embeddings
+    struct ggml_tensor *class_embedding;
+    struct ggml_tensor *patch_embeddings;
+    struct ggml_tensor *position_embeddings;
+    struct ggml_tensor *position_embeddings_gate;
+    struct ggml_tensor *tile_position_embeddings;
+    struct ggml_tensor *tile_position_embeddings_gate;
+    struct ggml_tensor *pre_tile_position_embeddings;
+    struct ggml_tensor *pre_tile_position_embeddings_gate;
+    struct ggml_tensor *post_tile_position_embeddings;
+    struct ggml_tensor *post_tile_position_embeddings_gate;
+
+    // layernorm applied before the encoder layers
+    struct ggml_tensor *pre_ln_w;
+    struct ggml_tensor *pre_ln_b;
+
+    // `layers` run first, then `global_layers`
+    std::vector<mllama_layer> layers;
+    std::vector<mllama_layer> global_layers;
+
+    // layernorm applied after `layers`
+    struct ggml_tensor *post_ln_w;
+    struct ggml_tensor *post_ln_b;
+
+    // projector weight and bias, applied as the last graph operation
+    struct ggml_tensor *mm_0_w;
+    struct ggml_tensor *mm_0_b;
+};
+
+// State of a loaded mllama vision model: the weights plus the ggml/gguf
+// handles used to hold and evaluate them. Every handle defaults to nullptr so
+// a partially constructed context can be inspected safely.
+struct mllama_ctx {
+    struct mllama_vision_model vision_model;
+
+    uint32_t ftype = 1;
+
+    struct gguf_context *ctx_gguf = nullptr;
+    // ggml context that holds the model tensors
+    struct ggml_context *ctx_data = nullptr;
+
+    // backing memory for the ggml context created in mllama_image_build_graph
+    std::vector<uint8_t> buf_compute_meta;
+
+    // memory buffers to evaluate the model
+    ggml_backend_buffer_t params_buffer = nullptr;
+
+    ggml_backend_t backend = nullptr;
+    ggml_gallocr_t compute_alloc = nullptr;
+};
+
+// Appends one encoder block to the graph in ctx0 and returns its output tensor.
+//
+// The block is: layernorm -> self-attention (optionally gated) -> residual ->
+// layernorm -> GELU feed-forward (optionally gated) -> residual.
+//
+//   il          layer index, used only to name the intermediate tensors
+//   layer       weights of this block; optional tensors may be nullptr
+//   embeddings  block input, also used as the first residual
+//   eps         layernorm epsilon
+//   hidden_size, batch_size, n_head, d_head  dimensions used for the reshapes
+static ggml_tensor *mllama_image_build_encoder_layer(
+    struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
+    const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
+    struct ggml_tensor *cur = embeddings;
+
+    // NOTE: il is a size_t, so the tensor names below are formatted with %zu;
+    // passing a size_t for %d through a variadic call is undefined behaviour.
+
+    {
+        // layernorm1
+        cur = ggml_norm(ctx0, cur, eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
+        ggml_set_name(cur, format("%zu pre layernorm", il).c_str());
+    }
+
+    {
+        // self-attention
+        struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
+        if (layer.q_b != nullptr) {
+            Q = ggml_add(ctx0, Q, layer.q_b);
+        }
+
+        Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
+        Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+        ggml_set_name(Q, format("%zu query", il).c_str());
+
+        struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
+        if (layer.k_b != nullptr) {
+            K = ggml_add(ctx0, K, layer.k_b);
+        }
+
+        K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
+        K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+        ggml_set_name(K, format("%zu key", il).c_str());
+
+        struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
+        if (layer.v_b != nullptr) {
+            V = ggml_add(ctx0, V, layer.v_b);
+        }
+
+        V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
+        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+        ggml_set_name(V, format("%zu value", il).c_str());
+
+        // scaled dot-product attention; no mask is applied
+        struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
+        KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
+        KQ = ggml_soft_max_inplace(ctx0, KQ);
+        ggml_set_name(KQ, format("%zu KQ", il).c_str());
+
+        struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
+        KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
+        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+        KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
+        ggml_set_name(KQV, format("%zu KQV", il).c_str());
+
+        cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
+        if (layer.o_b != nullptr) {
+            cur = ggml_add(ctx0, cur, layer.o_b);
+        }
+        ggml_set_name(cur, format("%zu self attention", il).c_str());
+
+        if (layer.attn_gate != nullptr) {
+            cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
+            ggml_set_name(cur, format("%zu self attention gate", il).c_str());
+        }
+    }
+
+    // residual 1
+    cur = ggml_add(ctx0, cur, embeddings);
+    ggml_set_name(cur, format("%zu residual", il).c_str());
+
+    embeddings = cur;
+
+    {
+        // layernorm2
+        cur = ggml_norm(ctx0, cur, eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
+        ggml_set_name(cur, format("%zu post layernorm", il).c_str());
+    }
+
+    {
+        // feed forward
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
+        cur = ggml_gelu_inplace(ctx0, cur);
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
+        ggml_set_name(cur, format("%zu feed forward", il).c_str());
+
+        if (layer.ff_gate != nullptr) {
+            cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
+            ggml_set_name(cur, format("%zu feed forward gate", il).c_str());
+        }
+    }
+
+    // residual 2
+    cur = ggml_add(ctx0, cur, embeddings);
+    ggml_set_name(cur, format("%zu residual", il).c_str());
+
+    embeddings = cur;
+
+    return embeddings;
+}
+
+// Builds the compute graph of the mllama vision encoder for a batch of exactly
+// one image (enforced with REQUIRE) and returns it.
+//
+// Graph inputs, identified by tensor name: "inp_raw" (F32 pixel data),
+// "aspect_ratios" (I32), "positions" (I32) and, when the model has a class
+// embedding, "embeddings" (F32).
+//
+// The number of tiles and channels default to 4 and 3 and are taken from the
+// first image when imgs->data is set and the image's values are positive.
+static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
+    const auto &model = ctx->vision_model;
+    const auto &hparams = model.hparams;
+
+    const int image_size = hparams.image_size;
+    const int image_size_width = image_size;
+    const int image_size_height = image_size;
+
+    const int patch_size = hparams.patch_size;
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
+    const int hidden_size = hparams.hidden_size;
+    const int n_head = hparams.n_head;
+    const int d_head = hidden_size / n_head;
+
+    const int batch_size = imgs->size;
+    REQUIRE(batch_size == 1);
+
+    int num_tiles = 4;
+    int num_channels = 3;
+    if (imgs->data != nullptr) {
+        num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
+        num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
+    }
+
+    // the graph context lives in ctx->buf_compute_meta and allocates no tensor data
+    struct ggml_init_params params = {
+        ctx->buf_compute_meta.size(), // mem_size
+        ctx->buf_compute_meta.data(), // mem_buffer
+        true,                         // no_alloc
+    };
+
+    struct ggml_context *ctx0 = ggml_init(params);
+    struct ggml_cgraph *gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    // patch embedding: convolution with kernel stride patch_size in both directions
+    struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
+    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+
+    struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
+    ggml_set_name(aspect_ratios, "aspect_ratios");
+    ggml_set_input(aspect_ratios);
+
+    if (model.pre_tile_position_embeddings != nullptr) {
+        struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
+        ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");
+
+        pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
+        if (model.pre_tile_position_embeddings_gate != nullptr) {
+            pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
+        }
+
+        inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
+    }
+
+    struct ggml_tensor *embeddings = inp;
+
+    if (model.class_embedding != nullptr) {
+        // concat class_embeddings and patch_embeddings
+        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
+        ggml_set_name(embeddings, "embeddings");
+        ggml_set_input(embeddings);
+        for (int i = 0; i < num_tiles; ++i) {
+            // repeat class embeddings for each tile
+            embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
+        }
+
+        embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    }
+
+    struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+    if (model.position_embeddings_gate != nullptr) {
+        position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
+    }
+
+    embeddings = ggml_add(ctx0, embeddings, position_embd);
+
+    if (model.tile_position_embeddings != nullptr) {
+        struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
+        ggml_set_name(tile_position_embeddings, "tile_position_embeddings");
+
+        tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
+        if (model.tile_position_embeddings_gate != nullptr) {
+            tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
+        }
+
+        embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
+    }
+
+    // pre-layernorm
+    if (model.pre_ln_w != nullptr) {
+        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
+        if (model.pre_ln_b != nullptr) {
+            embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
+        }
+
+        ggml_set_name(embeddings, "pre layernorm");
+    }
+
+    // pad the per-tile row count up to the next multiple of 8; the outer % 8
+    // keeps the padding at 0 (not 8) when the count is already a multiple of 8
+    const int num_padding_patches = (8 - (embeddings->ne[1] % 8)) % 8;
+
+    embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
+    embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);
+
+    std::vector<struct ggml_tensor *> intermediate_embeddings;
+
+    // encoder
+    for (size_t il = 0; il < model.layers.size(); il++) {
+        // the input of a flagged layer is kept for the stacked output below
+        if (hparams.intermediate_layers[il]) {
+            intermediate_embeddings.push_back(embeddings);
+        }
+
+        embeddings = mllama_image_build_encoder_layer(
+            ctx0, il, model.layers[il], embeddings,
+            hparams.eps, hidden_size, batch_size, n_head, d_head);
+    }
+
+    // post-layernorm
+    if (model.post_ln_w != nullptr) {
+        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
+        if (model.post_ln_b != nullptr) {
+            embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
+        }
+
+        ggml_set_name(embeddings, "post layernorm");
+    }
+
+    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
+
+    if (model.post_tile_position_embeddings != nullptr) {
+        struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
+        ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");
+
+        post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
+        if (model.post_tile_position_embeddings_gate != nullptr) {
+            post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
+        }
+
+        embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
+    }
+
+    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);
+
+    // global encoder
+    for (size_t il = 0; il < model.global_layers.size(); il++) {
+        embeddings = mllama_image_build_encoder_layer(
+            ctx0, il, model.global_layers[il], embeddings,
+            hparams.eps, hidden_size, batch_size, n_head, d_head);
+    }
+
+    // stack the captured intermediate embeddings along dimension 0, starting from an empty tensor
+    struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
+    for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
+        stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
+    }
+
+    stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
+    stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);
+
+    // drop the padding rows again and append the stacked intermediates to the final embeddings
+    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
+    embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
+    embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);
+
+    // mllama projector
+    embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
+    ggml_set_name(embeddings, "multi modal projector");
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    // NOTE(review): gf was created inside ctx0, whose memory is
+    // ctx->buf_compute_meta; presumably the graph stays usable after this call
+    // because that buffer is owned by ctx - confirm.
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+// Looks up the tensor called `name` in `ctx` and returns the result of
+// ggml_get_tensor. Throws std::runtime_error (via REQUIRE) when the lookup
+// yields nullptr and `optional` is false; with `optional` set, a nullptr
+// result is returned to the caller.
+static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
+    struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
+    REQUIRE(cur != nullptr || optional);
+    return cur;
+}
+
+// Loads the tensors of `n` encoder blocks named "<prefix>.blk.<i>.*" from `ctx`.
+// Layernorm, attention weight and feed-forward tensors are required; the
+// attention biases and the attn/ffn gates are optional and stay nullptr when
+// absent. A missing required tensor makes mllama_tensor_load throw.
+static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
+    std::vector<struct mllama_layer> layers(n);
+    for (size_t i = 0; i < layers.size(); i++) {
+        auto &layer = layers[i];
+        // i is a size_t, so it is formatted with %zu; passing a size_t for %d
+        // through a variadic call is undefined behaviour
+        layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%zu.ln1.weight", prefix, i).c_str(), false);
+        layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%zu.ln1.bias", prefix, i).c_str(), false);
+        layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%zu.ln2.weight", prefix, i).c_str(), false);
+        layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%zu.ln2.bias", prefix, i).c_str(), false);
+
+        layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_k.weight", prefix, i).c_str(), false);
+        layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_k.bias", prefix, i).c_str(), true);
+        layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_q.weight", prefix, i).c_str(), false);
+        layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_q.bias", prefix, i).c_str(), true);
+        layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_v.weight", prefix, i).c_str(), false);
+        layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_v.bias", prefix, i).c_str(), true);
+        layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_out.weight", prefix, i).c_str(), false);
+        layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_out.bias", prefix, i).c_str(), true);
+
+        // the first feed-forward projection (ff_i) is stored under "ffn_down"
+        // and the second (ff_o) under "ffn_up"
+        layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%zu.ffn_down.weight", prefix, i).c_str(), false);
+        layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%zu.ffn_down.bias", prefix, i).c_str(), false);
+        layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%zu.ffn_up.weight", prefix, i).c_str(), false);
+        layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%zu.ffn_up.bias", prefix, i).c_str(), false);
+
+        layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%zu.attn_gate", prefix, i).c_str(), true);
+        layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%zu.ffn_gate", prefix, i).c_str(), true);
+    }
+
+    return layers;
+}
+
+// read and create ggml_context containing the tensors and their data
+struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
+    struct ggml_context *meta = nullptr;
+
+    struct gguf_init_params params = {
+        true,  // no_alloc
+        &meta, // ctx
+    };
+
+    struct gguf_context *ctx = gguf_init_from_file(fname, params);
+    REQUIRE(ctx != nullptr);
+
+    if (verbosity >= 1) {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+        const int n_kv = gguf_get_n_kv(ctx);
+        const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
+        const int idx_desc = get_key_index(ctx, "general.description");
+        const std::string description = gguf_get_val_str(ctx, idx_desc);
+        const int idx_name = gguf_find_key(ctx, "general.name");
+        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
+            const std::string name = gguf_get_val_str(ctx, idx_name);
+            LOG("model name:   %s", name.c_str());
+        }
+        LOG("description:  %s", description.c_str());
+        LOG("GGUF version: %d", gguf_get_version(ctx));
+        LOG("alignment:    %zu", gguf_get_alignment(ctx));
+        LOG("n_tensors:    %d", n_tensors);
+        LOG("n_kv:         %d", n_kv);
+        LOG("ftype:        %s", ftype.c_str());
+        LOG("");
+    }
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    mllama_ctx *new_mllama = new mllama_ctx{};
+
+    ggml_backend_t backend = ggml_backend_init_best();
+    if (backend == nullptr) {
+        LOG("%s: failed to initialize backend\n", __func__);
+        mllama_free(new_mllama);
+        gguf_free(ctx);
+        return nullptr;
+    }
+    LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
+    new_mllama->backend = backend;
+
+    // load tensors
+    {
+        std::vector<uint8_t> read_buf;
+        struct ggml_init_params params = {
+            (n_tensors + 1) * ggml_tensor_overhead(), // mem_size
+            nullptr,                                  // mem_buffer
+            true,                                     // no_alloc
+        };
+
+        new_mllama->ctx_data = ggml_init(params);
+        if (!new_mllama->ctx_data) {
+            LOG("ggml_init() failed");
+            mllama_free(new_mllama);
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+#ifdef _WIN32
+        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+        if (!wlen) {
+            return NULL;
+        }
+        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
+        if (!wlen) {
+            free(wbuf);
+            return NULL;
+        }
+#if __GLIBCXX__
+        int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
+        __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
+        std::istream fin(&buffer);
+#else // MSVC
+        // unused in our current build
+        auto fin = std::ifstream(wbuf, std::ios::binary);
+#endif
+        free(wbuf);
+#else
+        auto fin = std::ifstream(fname, std::ios::binary);
+#endif
+        if (!fin) {
+            LOG("cannot open model file for loading tensors\n");
+            mllama_free(new_mllama);
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        // add tensors to context
+        for (int i = 0; i < n_tensors; ++i) {
+            const char *name = gguf_get_tensor_name(ctx, i);
+            struct ggml_tensor *t = ggml_get_tensor(meta, name);
+            struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
+            ggml_set_name(cur, name);
+        }
+
+        // alloc memory and offload data
+        new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
+        for (int i = 0; i < n_tensors; ++i) {
+            const char *name = gguf_get_tensor_name(ctx, i);
+            struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
+            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
+            fin.seekg(offset, std::ios::beg);
+            if (!fin) {
+                LOG("failed to seek for tensor %s\n", name);
+                mllama_free(new_mllama);
+                gguf_free(ctx);
+                return nullptr;
+            }
+            int num_bytes = ggml_nbytes(cur);
+            if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
+                // for the CPU and Metal backend, we can read directly into the tensor
+                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
+            } else {
+                // read into a temporary buffer first, then copy to device memory
+                read_buf.resize(num_bytes);
+                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
+                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+            }
+        }
+
+#if defined(_WIN32) && defined(__GLIBCXX__)
+        close(fd);
+#else
+        fin.close();
+#endif
+    }
+
+    // vision model
+    // load vision model
+    auto &vision_model = new_mllama->vision_model;
+    auto &hparams = vision_model.hparams;
+    hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
+    hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
+    hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
+    hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
+    hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
+    hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
+    hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
+    hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
+    hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
+    hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");
+
+    std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
+    hparams.intermediate_layers.resize(hparams.n_layer);
+    for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
+        hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
+    }
+
+    if (verbosity >= 2) {
+        LOG("");
+        LOG("vision model hparams");
+        LOG("image_size         %d", hparams.image_size);
+        LOG("patch_size         %d", hparams.patch_size);
+        LOG("v_hidden_size      %d", hparams.hidden_size);
+        LOG("v_n_intermediate   %d", hparams.n_intermediate);
+        LOG("v_projection_dim   %d", hparams.projection_dim);
+        LOG("v_n_head           %d", hparams.n_head);
+        LOG("v_n_layer          %d", hparams.n_layer);
+        LOG("v_n_global_layer   %d", hparams.n_global_layer);
+        LOG("v_eps              %f", hparams.eps);
+    }
+
+    vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
+    vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);
+
+    vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
+    vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);
+
+    vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
+    vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
+    vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
+    vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);
+
+    vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
+    vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);
+
+    vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
+    vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);
+
+    vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
+    vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);
+
+    vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
+    vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);
+
+    vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
+    vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);
+
+    ggml_free(meta);
+
+    new_mllama->ctx_gguf = ctx;
+
+    {
+        // measure mem requirement and allocate
+        new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
+        new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
+        struct mllama_image_batch batch;
+        batch.size = 1;
+        ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
+        ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
+        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
+        LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
+    }
+
+    return new_mllama;
+}
+
+// Allocates a default-constructed mllama_image with `new` and hands ownership
+// of it to the caller.
+struct mllama_image *mllama_image_init() {
+    auto *fresh_image = new mllama_image();
+    return fresh_image;
+}
+
+// Destroys a single image with `delete`; a null pointer is accepted because
+// deleting nullptr is a no-op in C++.
+void mllama_image_free(struct mllama_image *img) {
+    delete img;
+}
+// Releases the image array held by a batch and marks the batch empty.
+//
+// A null batch or a batch whose size is 0 is left untouched. The array is
+// released with delete[], so batch->data must have been allocated with new[].
+void mllama_image_batch_free(struct mllama_image_batch *batch) {
+    if (batch == nullptr || batch->size == 0) {
+        return;
+    }
+
+    delete[] batch->data;
+    // clear the pointer as well so a repeated call or a later read cannot
+    // touch the freed array
+    batch->data = nullptr;
+    batch->size = 0;
+}
+
+// Copies raw image bytes and the accompanying metadata into `img`.
+//
+//   data            - source buffer; `n` bytes are read from it
+//   n               - byte count to copy; img->data is resized to `n` elements
+//   width, height   - stored in img->width / img->height
+//   num_channels    - stored in img->num_channels
+//   num_tiles       - stored in img->num_tiles
+//   aspect_ratio_id - stored in img->aspect_ratio_id
+//   img             - destination image
+//
+// Returns false without modifying `img` when `img` is null, `n` is negative,
+// or `data` is null while `n` is positive; returns true otherwise.
+bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
+    // a negative n would be converted to a huge unsigned size by resize()
+    if (img == nullptr || n < 0 || (data == nullptr && n > 0)) {
+        return false;
+    }
+
+    img->width = width;
+    img->height = height;
+    img->num_channels = num_channels;
+    img->num_tiles = num_tiles;
+    img->aspect_ratio_id = aspect_ratio_id;
+    img->data.resize(n);
+
+    // memcpy must not be handed a null source, even for a zero-length copy
+    if (n > 0) {
+        memcpy(img->data.data(), data, n);
+    }
+    return true;
+}
+
+// Clamps x to the range [lower, upper]: the value is first capped at `upper`
+// and the result is then raised to at least `lower`.
+inline int mllama(int x, int lower, int upper) {
+    const int capped = (upper < x) ? upper : x;
+    if (lower < capped) {
+        return capped;
+    }
+    return lower;
+}
+
+// Releases every resource owned by a context and then the context itself:
+// the tensor data context, the gguf metadata, the parameter buffer, the
+// backend and the compute graph allocator. Passing nullptr is a no-op.
+void mllama_free(mllama_ctx *ctx) {
+    // free-style functions are expected to tolerate null; without this guard
+    // the member accesses below would dereference a null pointer
+    if (ctx == nullptr) {
+        return;
+    }
+
+    ggml_free(ctx->ctx_data);
+    gguf_free(ctx->ctx_gguf);
+
+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
+    delete ctx;
+}
+
+// Encodes one image by wrapping it in a batch of size 1 and forwarding to
+// mllama_image_batch_encode(); the result of that call is returned unchanged.
+bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
+    // aggregate order is {data, size}; the batch only borrows `img`
+    mllama_image_batch single_image{img, 1};
+    const bool encoded = mllama_image_batch_encode(ctx, n_threads, &single_image, vec);
+    return encoded;
+}
+
+// Runs the vision graph on a batch of images and copies the output of the
+// last graph node into `vec`.
+//
+//   ctx       - model context providing the backend, graph allocator and weights
+//   n_threads - thread count applied when the backend is the CPU backend
+//   imgs      - input batch; exactly one image is supported (enforced by REQUIRE)
+//   vec       - destination; receives ggml_nbytes() bytes of the last graph node
+//
+// Returns true on success. Returns false when the graph cannot be allocated,
+// the "inp_raw" input tensor is missing, the image holds fewer bytes than the
+// input tensor needs, or the backend reports a compute failure.
+bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
+    int batch_size = imgs->size;
+    REQUIRE(batch_size == 1);
+
+    // build the inference graph
+    ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
+    if (!ggml_gallocr_alloc_graph(ctx->compute_alloc, gf)) {
+        LOG("failed to allocate compute graph");
+        return false;
+    }
+
+    // set inputs
+    const auto &model = ctx->vision_model;
+    const auto &hparams = model.hparams;
+
+    // the image is square, so the patch grid has the same extent on both axes
+    const int patches_per_side = hparams.image_size / hparams.patch_size;
+    const int num_patches = patches_per_side * patches_per_side;
+    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
+
+    {
+        struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
+        if (inp_raw == nullptr) {
+            LOG("graph has no inp_raw tensor");
+            return false;
+        }
+
+        // refuse to read past the end of the image buffer
+        const auto &pixels = imgs->data[0].data;
+        const size_t n_bytes = ggml_nbytes(inp_raw);
+        if (pixels.size() * sizeof(*pixels.data()) < n_bytes) {
+            LOG("image data is smaller than the input tensor");
+            return false;
+        }
+        ggml_backend_tensor_set(inp_raw, pixels.data(), 0, n_bytes);
+    }
+
+    {
+        struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
+        if (embeddings != nullptr) {
+            // value-initialised staging buffer: all bytes are zero
+            const std::vector<char> zeros(ggml_nbytes(embeddings), 0);
+            ggml_backend_tensor_set(embeddings, zeros.data(), 0, zeros.size());
+        }
+    }
+
+    {
+        struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
+        if (positions != nullptr) {
+            const size_t n_bytes = ggml_nbytes(positions);
+            // round up so the staging buffer always covers n_bytes
+            std::vector<int> positions_data((n_bytes + sizeof(int) - 1) / sizeof(int), 0);
+            // never write more entries than the staging buffer holds
+            const size_t n_fill = std::min(positions_data.size(), (size_t) num_positions);
+            for (size_t i = 0; i < n_fill; i++) {
+                positions_data[i] = (int) i;
+            }
+            ggml_backend_tensor_set(positions, positions_data.data(), 0, n_bytes);
+        }
+    }
+
+    {
+        struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
+        if (aspect_ratios != nullptr) {
+            const size_t n_bytes = ggml_nbytes(aspect_ratios);
+            // zero-filled so no uninitialised bytes are uploaded
+            std::vector<int> aspect_ratios_data((n_bytes + sizeof(int) - 1) / sizeof(int), 0);
+            if (!aspect_ratios_data.empty()) {
+                aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
+            }
+            ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data.data(), 0, n_bytes);
+        }
+    }
+
+    if (ggml_backend_is_cpu(ctx->backend)) {
+        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
+    }
+
+    if (ggml_backend_graph_compute(ctx->backend, gf) != GGML_STATUS_SUCCESS) {
+        LOG("graph compute failed");
+        return false;
+    }
+
+    // the last node is the embedding tensor
+    struct ggml_tensor *output = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
+
+    // copy the embeddings to the location passed by the user
+    ggml_backend_tensor_get(output, vec, 0, ggml_nbytes(output));
+
+    return true;
+}
+
+// Returns the image_size hyperparameter of the loaded vision model.
+int32_t mllama_image_size(const struct mllama_ctx *ctx) {
+    const auto &vision_hparams = ctx->vision_model.hparams;
+    return vision_hparams.image_size;
+}
+
+// Returns the patch_size hyperparameter of the loaded vision model.
+int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
+    const auto &vision_hparams = ctx->vision_model.hparams;
+    return vision_hparams.patch_size;
+}
+
+// Returns the hidden_size hyperparameter of the loaded vision model.
+int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
+    const auto &vision_hparams = ctx->vision_model.hparams;
+    return vision_hparams.hidden_size;
+}
+
+// Returns the number of patches in one image: (image_size / patch_size)
+// squared, using integer division.
+int mllama_n_patches(const struct mllama_ctx *ctx) {
+    const auto &hp = ctx->vision_model.hparams;
+    // same extent along both axes because a single image_size is used
+    const auto per_side = hp.image_size / hp.patch_size;
+    return per_side * per_side;
+}
+
+// Returns the patch count plus one extra position when the model has a class
+// embedding tensor.
+int mllama_n_positions(const struct mllama_ctx *ctx) {
+    int n_positions = mllama_n_patches(ctx);
+    if (ctx->vision_model.class_embedding != nullptr) {
+        n_positions += 1;
+    }
+    return n_positions;
+}
+
+// Returns the n_tiles hyperparameter of the loaded vision model.
+int mllama_n_tiles(const struct mllama_ctx *ctx) {
+    const auto &vision_hparams = ctx->vision_model.hparams;
+    return vision_hparams.n_tiles;
+}
+
+// Returns the projection_dim hyperparameter of the loaded vision model.
+int mllama_n_embd(const struct mllama_ctx *ctx) {
+    const auto &vision_hparams = ctx->vision_model.hparams;
+    return vision_hparams.projection_dim;
+}
+
+// Returns positions * embedding width * tiles * sizeof(float), in bytes.
+size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
+    // widen each factor to size_t first; multiplying the three int values
+    // directly would be evaluated in int and could overflow before the
+    // conversion to size_t
+    const size_t n_positions = (size_t) mllama_n_positions(ctx);
+    const size_t n_embd = (size_t) mllama_n_embd(ctx);
+    const size_t n_tiles = (size_t) mllama_n_tiles(ctx);
+    return n_positions * n_embd * n_tiles * sizeof(float);
+}
--- a/llama/mllama.h
+++ b/llama/mllama.h
@ -0,0 +1,61 @@
+#ifndef MLLAMA_H
+#define MLLAMA_H
+
+// the API below uses `bool`; C translation units need <stdbool.h> for it
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+// MLLAMA_API marks the symbols exported from / imported into a shared build
+#ifdef LLAMA_SHARED
+#if defined(_WIN32) && !defined(__MINGW32__)
+#ifdef LLAMA_BUILD
+#define MLLAMA_API __declspec(dllexport)
+#else
+#define MLLAMA_API __declspec(dllimport)
+#endif
+#else
+#define MLLAMA_API __attribute__((visibility("default")))
+#endif
+#else
+#define MLLAMA_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// opaque model context; created by the load functions, released by mllama_free
+struct mllama_ctx;
+
+// an array of `size` images
+struct mllama_image_batch {
+    struct mllama_image *data;
+    size_t size;
+};
+
+MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity);
+MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity);
+
+MLLAMA_API void mllama_free(struct mllama_ctx *ctx);
+
+// vision model hyperparameters
+MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx);
+MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx);
+MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx);
+
+// sizes derived from the hyperparameters
+MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx);
+MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx);
+MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx);
+MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx);
+MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx);
+
+// `(void)` makes this a real prototype in C; an empty list would leave the
+// parameters unspecified there
+MLLAMA_API struct mllama_image *mllama_image_init(void);
+
+MLLAMA_API void mllama_image_free(struct mllama_image *img);
+MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch);
+
+// copies `n` bytes from `data` into `img` together with the given metadata
+MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img);
+
+// run the vision encoder and write the resulting embeddings to `vec`
+MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec);
+MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // MLLAMA_H
--- a/llama/patches/0005-solar-pro.patch
+++ b/llama/patches/0005-solar-pro.patch
@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644
 +            // self-attention
 +            {
 +                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 +
 +                // compute Q and K and RoPE them
 +                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
--- a/llama/patches/0006-add-mllama-support.patch
+++ b/llama/patches/0006-add-mllama-support.patch
--- a/llama/patches/0007-add-unpad-operator.patch
+++ b/llama/patches/0007-add-unpad-operator.patch
@ -0,0 +1,419 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Sun, 13 Apr 2025 22:10:06 -0400
+Subject: [PATCH] add unpad operator
+
+adds the unpad operator to GGML
+---
+ ggml/include/ggml.h                  | 10 +++++
+ ggml/src/ggml-cpu/ggml-cpu.c         |  5 +++
+ ggml/src/ggml-cpu/ops.cpp            | 55 ++++++++++++++++++++++++++++
+ ggml/src/ggml-cpu/ops.h              |  1 +
+ ggml/src/ggml-cuda/ggml-cuda.cu      |  4 ++
+ ggml/src/ggml-cuda/pad.cu            | 46 +++++++++++++++++++++++
+ ggml/src/ggml-cuda/pad.cuh           |  1 +
+ ggml/src/ggml-metal/ggml-metal.m     | 33 +++++++++++++++++
+ ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
+ ggml/src/ggml.c                      | 25 ++++++++++++-
+ 10 files changed, 223 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
+index e91dedf1..8dc107ba 100644
+--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
+@@ -489,6 +489,7 @@ extern "C" {
+         GGML_OP_UPSCALE, // nearest interpolate
+         GGML_OP_PAD,
+         GGML_OP_PAD_REFLECT_1D,
+        GGML_OP_UNPAD,
+         GGML_OP_ARANGE,
+         GGML_OP_TIMESTEP_EMBEDDING,
+         GGML_OP_ARGSORT,
+@@ -1781,6 +1782,15 @@ extern "C" {
+             int                   p0,
+             int                   p1);
+ 
+    // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+    GGML_API struct ggml_tensor * ggml_unpad(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                  p0,
+            int                  p1,
+            int                  p2,
+            int                  p3);
+
+     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+     // timesteps: [N,]
+     // return: [N, dim]
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
+index a30e67f2..835e6495 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
+@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
+             {
+                 ggml_compute_forward_pad_reflect_1d(params, tensor);
+             } break;
+        case GGML_OP_UNPAD:
+            {
+                ggml_compute_forward_unpad(params, tensor);
+            } break;
+         case GGML_OP_ARANGE:
+             {
+                 ggml_compute_forward_arange(params, tensor);
+@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+         case GGML_OP_UPSCALE:
+         case GGML_OP_PAD:
+         case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_UNPAD:
+         case GGML_OP_ARANGE:
+         case GGML_OP_TIMESTEP_EMBEDDING:
+         case GGML_OP_ARGSORT:
+diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
+index 955fec59..1868a10c 100644
+--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
+@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
+     }
+ }
+ 
+// ggml_compute_forward_unpad
+
+static void ggml_compute_forward_unpad_f32(
+    const struct ggml_compute_params *params,
+    struct ggml_tensor *dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    }
+                }
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_unpad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_unpad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+ // ggml_compute_forward_arange
+ 
+ static void ggml_compute_forward_arange_f32(
+diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
+index dc081b9e..a7125555 100644
+--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
+@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
+ void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index cb0d8528..6fe86674 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+         case GGML_OP_PAD:
+             ggml_cuda_op_pad(ctx, dst);
+             break;
+        case GGML_OP_UNPAD:
+            ggml_cuda_op_unpad(ctx, dst);
+            break;
+         case GGML_OP_ARANGE:
+             ggml_cuda_op_arange(ctx, dst);
+             break;
+@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+         case GGML_OP_UPSCALE:
+             return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
+         case GGML_OP_PAD:
+        case GGML_OP_UNPAD:
+         case GGML_OP_ARANGE:
+         case GGML_OP_TIMESTEP_EMBEDDING:
+         case GGML_OP_LEAKY_RELU:
+diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
+index 77432b04..7d45a7e1 100644
+--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
+@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+ }
+
+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIdx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // copy one element, but only when its index lies within the source extents
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    }
+}
+
+static void unpad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    unpad_f32_cuda(src0_d, dst_d,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
+\ No newline at end of file
+diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
+index 8fd386b0..e2ededc3 100644
+--- a/ggml/src/ggml-cuda/pad.cuh
+++ b/ggml/src/ggml-cuda/pad.cuh
+@@ -3,3 +3,4 @@
+ #define CUDA_PAD_BLOCK_SIZE 256
+ 
+ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
+index 1b56f858..7641247e 100644
+--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
+@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
+     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
+     GGML_METAL_KERNEL_TYPE_PAD_F32,
+     GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+    GGML_METAL_KERNEL_TYPE_UNPAD_F32,
+     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
+     GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
+     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
+@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                     upscale_f32,                     true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                         pad_f32,                         true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,              pad_reflect_1d_f32,              true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32,                       unpad_f32,                       true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,          timestep_embedding_f32,          true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                      arange_f32,                      true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,             argsort_f32_i32_asc,             true);
+@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
+         case GGML_OP_POOL_2D:
+         case GGML_OP_PAD:
+         case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_UNPAD:
+         case GGML_OP_TIMESTEP_EMBEDDING:
+         case GGML_OP_ARGSORT:
+         case GGML_OP_LEAKY_RELU:
+@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
+ 
+                 const int nth = MIN(1024, ne0);
+ 
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_UNPAD:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+
+                const int nth = MIN(1024, ne0);
+
+                 [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+             } break;
+         case GGML_OP_ARANGE:
+diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
+index 9cfddf45..080a943b 100644
+--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
+@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
+     }
+ }
+ 
+kernel void kernel_unpad_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3;
+    const int64_t i02 = i2;
+    const int64_t i01 = i1;
+
+    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+
+    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+            if (i0 < ne00) {
+                dst_ptr[i0] = src0_ptr[i0];
+            }
+        }
+
+        return;
+    }
+}
+
+ kernel void kernel_arange_f32(
+     device        char * dst,
+     constant   ggml_metal_kargs_arange & args,
+diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
+index 8a654624..6b034d35 100644
+--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
+@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+     "UPSCALE",
+     "PAD",
+     "PAD_REFLECT_1D",
+    "UNPAD",
+     "ARANGE",
+     "TIMESTEP_EMBEDDING",
+     "ARGSORT",
+@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+     "OPT_STEP_ADAMW",
+ };
+ 
+-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+ 
+ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+     "none",
+@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+     "upscale(x)",
+     "pad(x)",
+     "pad_reflect_1d(x)",
+    "unpad(x)",
+     "arange(start, stop, step)",
+     "timestep_embedding(timesteps, dim, max_period)",
+     "argsort(x)",
+@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+     "adamw(x)",
+ };
+ 
+-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+ 
+ static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
+ 
+@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
+     return result;
+ }
+ 
+// ggml_unpad
+
+struct ggml_tensor * ggml_unpad(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    int p0, int p1, int p2, int p3) {
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] - p0,
+            a->ne[1] - p1,
+            a->ne[2] - p2,
+            a->ne[3] - p3);
+
+    result->op = GGML_OP_UNPAD;
+    result->src[0] = a;
+
+    return result;
+}
+
+ // ggml_arange
+ 
+ struct ggml_tensor * ggml_arange(
--- a/llama/patches/0008-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0008-fix-deepseek-deseret-regex.patch
--- a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch
+++ b/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch
--- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
@ -15,50 +15,13 @@ but this can leave a cache that still does not have adequate space
 even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
- src/llama-context.cpp  |  18 ++++---
 src/llama-context.h    |   1 +
 src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
 src/llama-kv-cache.h   |  12 ++++-
- 4 files changed, 59 insertions(+), 79 deletions(-)
+ 3 files changed, 47 insertions(+), 73 deletions(-)

-diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index c22687e4..c5948e8f 100644
--- a/src/llama-context.cpp
-+++ b/src/llama-context.cpp
-@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
- 
-         // find KV slot
-         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-            return 1;
-+            kv_self->defrag_sched(-1.0f);
-+            kv_self->update(*this);
-+            if (!kv_self->find_slot(ubatch)) {
-+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-+                return 1;
-+            }
-         }
- 
-         ggml_backend_sched_reset(sched.get());
-@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
- 
-             // TODO: not sure if this is needed
-             if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                GGML_ABORT("TODO: handle this error");
-+                kv_self->defrag_sched(-1.0f);
-+                kv_self->update(*this);
-+                if (!kv_self->find_slot(ubatch)) {
-+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-+                    GGML_ABORT("TODO: handle this error");
-+                }
-             }
- 
-             auto * gf = graph_init();
 diff --git a/src/llama-context.h b/src/llama-context.h
-index c0ceacb1..0264e937 100644
+index c4ab242a..9970dfc6 100644
 --- a/src/llama-context.h
 +++ b/src/llama-context.h
@@ -5,6 +5,7 @@
@ -70,10 +33,10 @@ index c0ceacb1..0264e937 100644
 #include "ggml-cpp.h"
 #include "ggml-opt.h"
 diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
-index 3dcad65b..60e67b03 100644
+index a7b0a7eb..1a50c034 100644
 --- a/src/llama-kv-cache.cpp
 +++ b/src/llama-kv-cache.cpp
-@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() {
+@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() {
 }
 
 bool llama_kv_cache_unified::update(llama_context & lctx) {
@ -82,7 +45,7 @@ index 3dcad65b..60e67b03 100644
     auto * sched = lctx.get_sched();
 
     if (has_shift) {
-@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
+@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
             res->set_inputs(nullptr);
 
             lctx.graph_compute(gf, false);
@ -91,7 +54,7 @@ index 3dcad65b..60e67b03 100644
         }
 
         {
-@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
+@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
 
     if (do_defrag) {
         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@ -133,7 +96,7 @@ index 3dcad65b..60e67b03 100644
 }
 
 void llama_kv_cache_unified::defrag_sched(float thold) {
-@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
+@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
 llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
         const llama_cparams & cparams,
                ggml_context * ctx,
@ -147,7 +110,7 @@ index 3dcad65b..60e67b03 100644
 #if 0
     // CPU defrag
     //
-@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
+@@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
         ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
     }
 #else
@ -185,7 +148,7 @@ index 3dcad65b..60e67b03 100644
 
             ggml_tensor * view_v_src;
             ggml_tensor * view_v_dst;
-@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
+@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
             if (cparams.flash_attn) {
                 // NOTE: the V cache is not transposed when using flash attention
                 view_v_src = ggml_view_2d(ctx, v_l[il],
@ -225,7 +188,7 @@ index 3dcad65b..60e67b03 100644
     }
 
     //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
 
     assert(n_used <= n_kv);
 
@ -244,7 +207,7 @@ index 3dcad65b..60e67b03 100644
 
     // determine which KV cells to move where
     //
-@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
     //
     //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
     //
@ -256,7 +219,7 @@ index 3dcad65b..60e67b03 100644
 
     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
         const auto & cell0 = cells[i0];
-@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
@ -276,7 +239,7 @@ index 3dcad65b..60e67b03 100644
                 cont = false;
                 continue;
             }
-@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
             head = n_used;
 
             if (!cont) {
@ -288,7 +251,7 @@ index 3dcad65b..60e67b03 100644
             }
 
             nf++;
-@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
             }
         }
 
--- a/llama/patches/0011-sort-devices-by-score.patch
+++ b/llama/patches/0011-sort-devices-by-score.patch
@ -11,7 +11,7 @@ with the fastest acceleration is loaded
 1 file changed, 13 insertions(+), 8 deletions(-)

 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 405d8e31..4e67d243 100644
+index 82ae1b5b..1487f322 100644
 --- a/ggml/src/ggml-backend-reg.cpp
 +++ b/ggml/src/ggml-backend-reg.cpp
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {
--- a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
+++ b/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
--- a/llama/patches/0013-remove-amx.patch
+++ b/llama/patches/0013-remove-amx.patch
--- a/llama/patches/0014-fix-string-arr-kv-loading.patch
+++ b/llama/patches/0014-fix-string-arr-kv-loading.patch
@ -9,8 +9,8 @@ such as vocab fields
 ---
 ggml/include/gguf.h | 1 +
 ggml/src/gguf.cpp   | 7 +++++--
- src/llama-vocab.cpp | 4 +---
- 3 files changed, 7 insertions(+), 5 deletions(-)
+ src/llama-vocab.cpp | 2 +-
+ 3 files changed, 7 insertions(+), 3 deletions(-)

 diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
 index 79ee2020..3efb22f0 100644
@ -53,15 +53,13 @@ index 381a9c7d..e45b453d 100644
 }
 
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 10f34d33..9f5fd57b 100644
+index 10f34d33..b098bb25 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-             const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
-             if (precompiled_charsmap_keyidx != -1) {
+@@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
-                GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
-
+                 GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+ 
 -                const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
 +                const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
                 const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
--- a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
+++ b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
@ -1,277 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <git@mxy.ng>
-Date: Thu, 1 May 2025 13:45:12 -0700
-Subject: [PATCH] add argsort and cuda copy for i32
-
---
- ggml/src/ggml-cpu/ops.cpp     |  43 ++++++++++++++
- ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++-
- ggml/src/ggml-cuda/cpy.cu     |  49 ++++++++++++++++
- 3 files changed, 192 insertions(+), 2 deletions(-)
-
-diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index becdae07..7a44b6cf 100644
--- a/ggml/src/ggml-cpu/ops.cpp
-+++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
-     }
- }
- 
-+static void ggml_compute_forward_argsort_i32(
-+    const ggml_compute_params * params,
-+    ggml_tensor * dst) {
-+
-+    const ggml_tensor * src0 = dst->src[0];
-+
-+    GGML_TENSOR_UNARY_OP_LOCALS
-+
-+    GGML_ASSERT(nb0 == sizeof(int32_t));
-+
-+    const int ith = params->ith;
-+    const int nth = params->nth;
-+
-+    const int64_t nr = ggml_nrows(src0);
-+
-+    ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
-+
-+    for (int64_t i = ith; i < nr; i += nth) {
-+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
-+        const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
-+
-+        for (int64_t j = 0; j < ne0; j++) {
-+            dst_data[j] = j;
-+        }
-+
-+        // C doesn't have a functional sort, so we do a bubble sort instead
-+        for (int64_t j = 0; j < ne0; j++) {
-+            for (int64_t k = j + 1; k < ne0; k++) {
-+                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
-+                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
-+                    int32_t tmp = dst_data[j];
-+                    dst_data[j] = dst_data[k];
-+                    dst_data[k] = tmp;
-+                }
-+            }
-+        }
-+    }
-+}
-+
- void ggml_compute_forward_argsort(
-     const ggml_compute_params * params,
-     ggml_tensor * dst) {
-@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
-             {
-                 ggml_compute_forward_argsort_f32(params, dst);
-             } break;
-+        case GGML_TYPE_I32:
-+            {
-+                ggml_compute_forward_argsort_i32(params, dst);
-+            } break;
-         default:
-             {
-                 GGML_ABORT("fatal error");
-diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
-index 607ded85..53b02634 100644
--- a/ggml/src/ggml-cuda/argsort.cu
-+++ b/ggml/src/ggml-cuda/argsort.cu
-@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
-     }
- }
- 
-+
-+template<ggml_sort_order order>
-+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
-+    extern __shared__ int shared_mem[];
-+    int * indices = shared_mem;
-+
-+    const int tid = threadIdx.x;
-+    const int row = blockIdx.y;
-+
-+    // Initialize all indices, handling the case where threads < ncols_pad
-+    for (int i = tid; i < ncols_pad; i += blockDim.x) {
-+        indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
-+    }
-+    __syncthreads();
-+
-+    // Bitonic sort
-+    for (int k = 2; k <= ncols_pad; k *= 2) {
-+        for (int j = k/2; j > 0; j /= 2) {
-+            for (int i = tid; i < ncols_pad; i += blockDim.x) {
-+                const int ij = i ^ j;
-+                if (ij > i) {
-+                    // Only compare values within the actual data range
-+                    if (i < ncols && ij < ncols) {
-+                        if ((i & k) == 0) {
-+                            if (order == GGML_SORT_ORDER_ASC) {
-+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
-+                                    int tmp = indices[i];
-+                                    indices[i] = indices[ij];
-+                                    indices[ij] = tmp;
-+                                }
-+                            } else {
-+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
-+                                    int tmp = indices[i];
-+                                    indices[i] = indices[ij];
-+                                    indices[ij] = tmp;
-+                                }
-+                            }
-+                        } else {
-+                            if (order == GGML_SORT_ORDER_ASC) {
-+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
-+                                    int tmp = indices[i];
-+                                    indices[i] = indices[ij];
-+                                    indices[ij] = tmp;
-+                                }
-+                            } else {
-+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
-+                                    int tmp = indices[i];
-+                                    indices[i] = indices[ij];
-+                                    indices[ij] = tmp;
-+                                }
-+                            }
-+                        }
-+                    }
-+                }
-+            }
-+            __syncthreads();
-+        }
-+    }
-+
-+    // Write sorted indices to output, only threads handling valid data
-+    for (int i = tid; i < ncols; i += blockDim.x) {
-+        dst[row * ncols + i] = indices[i];
-+    }
-+}
-+
-+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
-+    // Bitonic sort requires ncols to be power of 2
-+    const int ncols_pad = next_power_of_2(ncols);
-+
-+    // Ensure thread count doesn't exceed maximum (typically 1024)
-+    const int max_threads = 1024;  // This is the typical max for most GPUs
-+    const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
-+
-+    const dim3 block_dims(threads_per_block, 1, 1);
-+    const dim3 block_nums(1, nrows, 1);
-+    const size_t shared_mem = ncols_pad * sizeof(int);
-+
-+    // Check if shared memory size is within limits
-+    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
-+
-+    // Instead of logging an error, use GGML_ASSERT with a descriptive message
-+    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
-+
-+    // Launch kernels with the updated thread configuration
-+    if (order == GGML_SORT_ORDER_ASC) {
-+        k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-+    } else if (order == GGML_SORT_ORDER_DESC) {
-+        k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-+    } else {
-+        GGML_ABORT("fatal error");
-+    }
-+}
-+
-+
- void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-     const ggml_tensor * src0 = dst->src[0];
-     const float * src0_d = (const float *)src0->data;
-     float * dst_d = (float *)dst->data;
-     cudaStream_t stream = ctx.stream();
- 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
-     GGML_ASSERT( dst->type == GGML_TYPE_I32);
-     GGML_ASSERT(ggml_is_contiguous(src0));
- 
-@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- 
-     enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
- 
-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
-+    if (src0->type == GGML_TYPE_I32) {
-+        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
-+    } else {
-+        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
-+    }
- }
-diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
-index 2d46176e..47383486 100644
--- a/ggml/src/ggml-cuda/cpy.cu
-+++ b/ggml/src/ggml-cuda/cpy.cu
-@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
-     *dsti = *xi;
- }
- 
-+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
-+    const int32_t * xi = (const int32_t *) cxi;
-+    int32_t * dsti = (int32_t *) cdsti;
-+
-+    *dsti = *xi;
-+}
-+
- template <cpy_kernel_t cpy_1>
- static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
-                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
-     cpy_1(cx + x_offset, cdst + dst_offset);
- }
- 
-+// First, add this template function after the other template functions
-+template <cpy_kernel_t cpy_1>
-+static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
-+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-+                                 const int nb12, const int nb13) {
-+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
-+
-+    if (i >= ne) {
-+        return;
-+    }
-+
-+    const int64_t i03 = i/(ne00 * ne01 * ne02);
-+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-+    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
-+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-+
-+    const int64_t i13 = i/(ne10 * ne11 * ne12);
-+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
-+
-+    cpy_1(cx + x_offset, cdst + dst_offset);
-+}
-+
-+// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
-+static void ggml_cpy_i32_i32_cuda(
-+    const char * cx, char * cdst, const int ne,
-+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
-+
-+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-+    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-+}
-+
- static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-     const float * xi = (const float *) cxi;
-     block_q8_0 * dsti = (block_q8_0 *) cdsti;
-@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
-         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
-+        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-     } else {
-         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
-                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
-         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
-     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
-+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
-+        return (void*) cpy_i32_i32<cpy_1_i32_i32>;
-     } else {
-         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
-                 ggml_type_name(src0->type), ggml_type_name(src1->type));
--- a/llama/patches/0015-ollama-debug-tensor.patch
+++ b/llama/patches/0015-ollama-debug-tensor.patch
@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
 1 file changed, 6 insertions(+)

 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index a30e67f2..2462d2b8 100644
+index 835e6495..3902894b 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@ -20,7 +20,7 @@ index a30e67f2..2462d2b8 100644
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         ggml_compute_forward(&params, node);
 
--- a/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch
+++ b/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@ -114,9 +114,6 @@ void grammar_free(struct llama_grammar *g) {
        if (g->vocab != nullptr) {
            delete g->vocab;
        }
-        if (g->o_vocab != nullptr) {
-                delete g->o_vocab;
-        }
        llama_grammar_free_impl(g);
    }
 }
--- a/llm/memory.go
+++ b/llm/memory.go
@ -1,12 +1,9 @@
 package llm

 import (
-	"cmp"
 	"fmt"
 	"log/slog"
-	"maps"
 	"os"
-	"slices"
 	"strconv"
 	"strings"

@ -111,8 +108,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

 	for _, projector := range projectors {
-		weight := projectorMemoryRequirements(projector)
+		weight, graph := projectorMemoryRequirements(projector)
 		projectorWeights += weight
+		projectorGraph += graph

 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@ -122,10 +120,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	layers := f.Tensors().GroupLayers()
-	// add one layer (chosing the max layer) worth of memory as a buffer
-	layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
-		return cmp.Compare(a.Size(), b.Size())
-	}).Size()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		layerSize = blk0.Size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
+	}

 	var kvct string
 	if envconfig.FlashAttention() &&
@ -219,7 +219,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	// For all the layers, find where they can fit on the GPU(s)
-	for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
+	for i := range int(f.KV().BlockCount()) {
 		// Some models have inconsistent layer sizes
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 			layerSize = blk.Size()
@ -229,7 +229,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin

 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
 			// Stop allocating on GPU(s) once we hit the users target NumGPU
-			overflow += layerSize
 			continue
 		}

@ -246,13 +245,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
-
-		if len(gpusWithSpace) == 0 {
-			overflow += layerSize
-		}
 	}
 	if layerCount >= int(f.KV().BlockCount()) {
 		fullyLoaded = true
+	} else {
+		for i := layerCount; i < int(f.KV().BlockCount()); i++ {
+			overflow += layerSize
+		}
 	}

 	// Determine if we need to consider output then find where it fits
@ -408,21 +407,51 @@ func (m MemoryEstimate) LogValue() slog.Value {
 	return slog.GroupValue(attrs...)
 }

-func projectorMemoryRequirements(filename string) (weights uint64) {
+func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 	file, err := os.Open(filename)
 	if err != nil {
-		return 0
+		return 0, 0
 	}
 	defer file.Close()

 	ggml, _, err := ggml.Decode(file, 1024)
 	if err != nil {
-		return 0
+		return 0, 0
 	}

 	for _, layer := range ggml.Tensors().GroupLayers() {
 		weights += layer.Size()
 	}

-	return weights
+	switch arch := ggml.KV().Architecture(); arch {
+	case "mllama":
+		kv := func(n string) uint64 {
+			if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
+				return uint64(v)
+			}
+
+			return 0
+		}
+
+		imageSize := kv("image_size")
+
+		maxNumTiles := kv("max_num_tiles")
+		embeddingLength := kv("embedding_length")
+		headCount := kv("attention.head_count")
+
+		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
+		if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
+			numPatches++
+		}
+
+		numPaddedPatches := numPatches + 8 - (numPatches%8)%8
+
+		graphSize = 4 * (8 +
+			imageSize*imageSize*kv("num_channels")*maxNumTiles +
+			embeddingLength*numPatches*maxNumTiles +
+			9*embeddingLength*numPaddedPatches*maxNumTiles +
+			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
+	}
+
+	return weights, graphSize
 }
--- a/llm/server.go
+++ b/llm/server.go
@ -311,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--mmproj", projectors[0])
 	}

-	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
+	// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
 	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
 	// without any LD_LIBRARY_PATH flags
 	for {
@ -679,8 +679,9 @@ ws ::= ([ \t\n] ws)?
 const maxBufferSize = 512 * format.KiloByte

 type ImageData struct {
-	Data []byte `json:"data"`
-	ID   int    `json:"id"`
+	Data          []byte `json:"data"`
+	ID            int    `json:"id"`
+	AspectRatioID int    `json:"aspect_ratio_id"`
 }

 type CompletionRequest struct {
--- a/ml/backend.go
+++ b/ml/backend.go
@ -119,21 +119,6 @@ type Context interface {
 	Layer(int) Context
 }

-// RopeOptions contains optional parameters for RoPE function
-type RopeOptions struct {
-	OriginalContextLen uint32
-}
-
-// RopeOption defines a function that modifies RopeOpts
-type RopeOption func(*RopeOptions)
-
-// WithContextLen sets a custom context length
-func WithContextLen(len uint32) RopeOption {
-	return func(opts *RopeOptions) {
-		opts.OriginalContextLen = len
-	}
-}
-
 type Tensor interface {
 	Dim(n int) int
 	Stride(n int) int
@ -159,7 +144,7 @@ type Tensor interface {
 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

 	Sin(ctx Context) Tensor
@ -176,6 +161,7 @@ type Tensor interface {
 	Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor

 	Pad(ctx Context, shape ...int) Tensor
+	Unpad(ctx Context, shape ...int) Tensor

 	Stack(ctx Context, dim int, s ...Tensor) Tensor

@ -187,7 +173,6 @@ type Tensor interface {
 	Duplicate(ctx Context) Tensor

 	TopK(ctx Context, k int) Tensor
-	Argsort(ctx Context) Tensor
 }

 // ScaledDotProductAttention implements a fused attention
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@ -1017,6 +1017,17 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
 	}
 }

+func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
+	if len(shape) != 4 {
+		panic("expected 4 dimensions")
+	}
+
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
+	}
+}
+
 func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	switch len(shape) {
 	case 1:
@ -1060,17 +1071,7 @@ const (
 	ropeTypeVision C.int = 24
 )

-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
-	// Default options
-	opts := &ml.RopeOptions{
-		OriginalContextLen: 131072,
-	}
-
-	// Apply any provided options
-	for _, option := range options {
-		option(opts)
-	}
-
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
 	if ropeFactors == nil {
 		ropeFactors = &Tensor{b: t.b}
 	}
@ -1083,19 +1084,16 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 	return &Tensor{
 		b: t.b,
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx,
-			dequant,
-			positionIDs.(*Tensor).t,
-			ropeFactors.(*Tensor).t,
+			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
 			C.int(ropeDim),
 			C.int(ropeType),
-			C.int(opts.OriginalContextLen),
+			131072, // YaRN n_ctx_train
 			C.float(ropeBase),
 			C.float(ropeScale),
-			C.float(0.0),
-			C.float(1.0),
-			C.float(32.0),
-			C.float(1.0),
+			0.,  // YaRN ext_factor
+			1.,  // YaRN attn_factor
+			32., // YaRN beta_fast
+			1.,  // YaRN beta_slow
 		),
 	}
 }
@ -1189,10 +1187,3 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
 		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
 	}
 }
-
-func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
-	}
-}
--- a/ml/backend/ggml/ggml/include/ggml.h
+++ b/ml/backend/ggml/ggml/include/ggml.h
@ -489,6 +489,7 @@ extern "C" {
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_PAD_REFLECT_1D,
+        GGML_OP_UNPAD,
        GGML_OP_ARANGE,
        GGML_OP_TIMESTEP_EMBEDDING,
        GGML_OP_ARGSORT,
@ -1781,6 +1782,15 @@ extern "C" {
            int                   p0,
            int                   p1);

+    // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+    GGML_API struct ggml_tensor * ggml_unpad(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                  p0,
+            int                  p1,
+            int                  p2,
+            int                  p3);
+
    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
    // timesteps: [N,]
    // return: [N, dim]
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@ -178,9 +178,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_CANN
        register_backend(ggml_backend_cann_reg());
 #endif
-#ifdef GGML_USE_BLAS
-        register_backend(ggml_backend_blas_reg());
-#endif
+// #ifdef GGML_USE_BLAS
+//         register_backend(ggml_backend_blas_reg());
+// #endif
 #ifdef GGML_USE_RPC
        register_backend(ggml_backend_rpc_reg());
 #endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
@ -1953,6 +1953,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_pad_reflect_1d(params, tensor);
            } break;
+        case GGML_OP_UNPAD:
+            {
+                ggml_compute_forward_unpad(params, tensor);
+            } break;
        case GGML_OP_ARANGE:
            {
                ggml_compute_forward_arange(params, tensor);
@ -2276,6 +2280,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_UNPAD:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_ARGSORT:
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
    }
 }

+// ggml_compute_forward_unpad
+
+static void ggml_compute_forward_unpad_f32(
+    const struct ggml_compute_params *params,
+    struct ggml_tensor *dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    }
+                }
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_unpad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_unpad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_arange

 static void ggml_compute_forward_arange_f32(
@ -6822,45 +6877,6 @@ static void ggml_compute_forward_argsort_f32(
    }
 }

-static void ggml_compute_forward_argsort_i32(
-    const ggml_compute_params * params,
-    ggml_tensor * dst) {
-
-    const ggml_tensor * src0 = dst->src[0];
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(nb0 == sizeof(int32_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nr = ggml_nrows(src0);
-
-    ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
-
-    for (int64_t i = ith; i < nr; i += nth) {
-        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
-        const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
-
-        for (int64_t j = 0; j < ne0; j++) {
-            dst_data[j] = j;
-        }
-
-        // C doesn't have a functional sort, so we do a bubble sort instead
-        for (int64_t j = 0; j < ne0; j++) {
-            for (int64_t k = j + 1; k < ne0; k++) {
-                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
-                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
-                    int32_t tmp = dst_data[j];
-                    dst_data[j] = dst_data[k];
-                    dst_data[k] = tmp;
-                }
-            }
-        }
-    }
-}
-
 void ggml_compute_forward_argsort(
    const ggml_compute_params * params,
    ggml_tensor * dst) {
@ -6872,10 +6888,6 @@ void ggml_compute_forward_argsort(
            {
                ggml_compute_forward_argsort_f32(params, dst);
            } break;
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_argsort_i32(params, dst);
-            } break;
        default:
            {
                GGML_ABORT("fatal error");
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h
@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
 void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
@ -85,107 +85,13 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
    }
 }

-
-template<ggml_sort_order order>
-static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
-    extern __shared__ int shared_mem[];
-    int * indices = shared_mem;
-
-    const int tid = threadIdx.x;
-    const int row = blockIdx.y;
-
-    // Initialize all indices, handling the case where threads < ncols_pad
-    for (int i = tid; i < ncols_pad; i += blockDim.x) {
-        indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
-    }
-    __syncthreads();
-
-    // Bitonic sort
-    for (int k = 2; k <= ncols_pad; k *= 2) {
-        for (int j = k/2; j > 0; j /= 2) {
-            for (int i = tid; i < ncols_pad; i += blockDim.x) {
-                const int ij = i ^ j;
-                if (ij > i) {
-                    // Only compare values within the actual data range
-                    if (i < ncols && ij < ncols) {
-                        if ((i & k) == 0) {
-                            if (order == GGML_SORT_ORDER_ASC) {
-                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
-                                    int tmp = indices[i];
-                                    indices[i] = indices[ij];
-                                    indices[ij] = tmp;
-                                }
-                            } else {
-                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
-                                    int tmp = indices[i];
-                                    indices[i] = indices[ij];
-                                    indices[ij] = tmp;
-                                }
-                            }
-                        } else {
-                            if (order == GGML_SORT_ORDER_ASC) {
-                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
-                                    int tmp = indices[i];
-                                    indices[i] = indices[ij];
-                                    indices[ij] = tmp;
-                                }
-                            } else {
-                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
-                                    int tmp = indices[i];
-                                    indices[i] = indices[ij];
-                                    indices[ij] = tmp;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            __syncthreads();
-        }
-    }
-
-    // Write sorted indices to output, only threads handling valid data
-    for (int i = tid; i < ncols; i += blockDim.x) {
-        dst[row * ncols + i] = indices[i];
-    }
-}
-
-static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
-    // Bitonic sort requires ncols to be power of 2
-    const int ncols_pad = next_power_of_2(ncols);
-
-    // Ensure thread count doesn't exceed maximum (typically 1024)
-    const int max_threads = 1024;  // This is the typical max for most GPUs
-    const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
-
-    const dim3 block_dims(threads_per_block, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
-    const size_t shared_mem = ncols_pad * sizeof(int);
-
-    // Check if shared memory size is within limits
-    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
-
-    // Instead of logging an error, use GGML_ASSERT with a descriptive message
-    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
-
-    // Launch kernels with the updated thread configuration
-    if (order == GGML_SORT_ORDER_ASC) {
-        k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-    } else if (order == GGML_SORT_ORDER_DESC) {
-        k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-    } else {
-        GGML_ABORT("fatal error");
-    }
-}
-
-
 void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

@ -194,9 +100,5 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

-    if (src0->type == GGML_TYPE_I32) {
-        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
-    } else {
-        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
-    }
+    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
 }
--- a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
@ -38,13 +38,6 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
    *dsti = *xi;
 }

-static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
-    const int32_t * xi = (const int32_t *) cxi;
-    int32_t * dsti = (int32_t *) cdsti;
-
-    *dsti = *xi;
-}
-
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@ -75,44 +68,6 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
    cpy_1(cx + x_offset, cdst + dst_offset);
 }

-// First, add this template function after the other template functions
-template <cpy_kernel_t cpy_1>
-static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
-                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                                 const int nb12, const int nb13) {
-    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= ne) {
-        return;
-    }
-
-    const int64_t i03 = i/(ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
-    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-
-    const int64_t i13 = i/(ne10 * ne11 * ne12);
-    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
-
-    cpy_1(cx + x_offset, cdst + dst_offset);
-}
-
-// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
-static void ggml_cpy_i32_i32_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q8_0 * dsti = (block_q8_0 *) cdsti;
@ -678,8 +633,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
-        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
@ -735,8 +688,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
-    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
-        return (void*) cpy_i32_i32<cpy_1_i32_i32>;
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_PAD:
            ggml_cuda_op_pad(ctx, dst);
            break;
+        case GGML_OP_UNPAD:
+            ggml_cuda_op_unpad(ctx, dst);
+            break;
        case GGML_OP_ARANGE:
            ggml_cuda_op_arange(ctx, dst);
            break;
@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_UPSCALE:
            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_PAD:
+        case GGML_OP_UNPAD:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
--- a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu
@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
 }
+
+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3 (dst dims; the grid is launched over dst, not ne02*ne03)
+    // blockIdx.y: idx of ne1
+    // blockIdx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    }
+}
+
+static void unpad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    unpad_f32_cuda(src0_d, dst_d,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh
@ -3,3 +3,4 @@
 #define CUDA_PAD_BLOCK_SIZE 256

 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
@ -5599,6 +5599,51 @@ kernel void kernel_pad_reflect_1d_f32(
    }
 }

+kernel void kernel_unpad_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3;
+    const int64_t i02 = i2;
+    const int64_t i01 = i1;
+
+    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+
+    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+            if (i0 < ne00) {
+                dst_ptr[i0] = src0_ptr[i0];
+            }
+        }
+
+        return;
+    }
+}
+
 kernel void kernel_arange_f32(
    device        char * dst,
    constant   ggml_metal_kargs_arange & args,
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@ -347,6 +347,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
    GGML_METAL_KERNEL_TYPE_PAD_F32,
    GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+    GGML_METAL_KERNEL_TYPE_UNPAD_F32,
    GGML_METAL_KERNEL_TYPE_ARANGE_F32,
    GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@ -1294,6 +1295,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                     upscale_f32,                     true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                         pad_f32,                         true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,              pad_reflect_1d_f32,              true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32,                       unpad_f32,                       true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,          timestep_embedding_f32,          true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                      arange_f32,                      true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,             argsort_f32_i32_asc,             true);
@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_POOL_2D:
        case GGML_OP_PAD:
        case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_UNPAD:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_ARGSORT:
        case GGML_OP_LEAKY_RELU:
@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(

                const int nth = MIN(1024, ne0);

+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_UNPAD:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+
+                const int nth = MIN(1024, ne0);
+
                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
            } break;
        case GGML_OP_ARANGE:
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
    }
 }

+kernel void kernel_unpad_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3;
+    const int64_t i02 = i2;
+    const int64_t i01 = i1;
+
+    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+
+    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+            if (i0 < ne00) {
+                dst_ptr[i0] = src0_ptr[i0];
+            }
+        }
+
+        return;
+    }
+}
+
 kernel void kernel_arange_f32(
    device        char * dst,
    constant   ggml_metal_kargs_arange & args,
--- a/ml/backend/ggml/ggml/src/ggml.c
+++ b/ml/backend/ggml/ggml/src/ggml.c
@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "UPSCALE",
    "PAD",
    "PAD_REFLECT_1D",
+    "UNPAD",
    "ARANGE",
    "TIMESTEP_EMBEDDING",
    "ARGSORT",
@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "OPT_STEP_ADAMW",
 };

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "upscale(x)",
    "pad(x)",
    "pad_reflect_1d(x)",
+    "unpad(x)",
    "arange(start, stop, step)",
    "timestep_embedding(timesteps, dim, max_period)",
    "argsort(x)",
@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "adamw(x)",
 };

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
    return result;
 }

+// ggml_unpad
+
+struct ggml_tensor * ggml_unpad(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    int p0, int p1, int p2, int p3) {
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] - p0,
+            a->ne[1] - p1,
+            a->ne[2] - p2,
+            a->ne[3] - p3);
+
+    result->op = GGML_OP_UNPAD;
+    result->src[0] = a;
+
+    return result;
+}
+
 // ggml_arange

 struct ggml_tensor * ggml_arange(
--- a/model/input/input.go
+++ b/model/input/input.go
@ -2,16 +2,30 @@ package input

 import "github.com/ollama/ollama/ml"

+// Multimodal is a multimodal embedding or a component of one.
+// For example, it could be a row of an image that can be processed
+// independently.
+type Multimodal struct {
+	// Tensor is the embedding data. Implementations may choose what to
+	// store here or it may be nil if not needed. However, any ml.Tensor
+	// objects must be stored here and not in Data.
+	Tensor ml.Tensor
+
+	// Data is implementation-specific opaque data, such as metadata on how
+	// to lay out Tensor. It may be nil if not needed. It may also store larger
+	// objects such as complete images if they are to be processed later.
+	Data any
+}
+
 // Input represents one token in the input stream
 type Input struct {
 	// Token is a single element of text.
 	Token int32

-	// Multimodal is opaque data representing a non-text
-	// element such as an image (or part of one if the image
-	// can be processed in pieces). It may be either together
-	// with Token or on its own.
-	Multimodal any
+	// Multimodal represents a non-text element such as an
+	// image (or part of one if the image can be processed in pieces).
+	// It may be used either together with Token or on its own.
+	Multimodal []Multimodal

 	// MultimodalHash is a unique representation of the data
 	// stored in Multimodal, used for caching and comparing
@ -32,7 +46,7 @@ type Input struct {
 // Positions slice.
 type MultimodalIndex struct {
 	Index      int
-	Multimodal any
+	Multimodal []Multimodal
 }

 // Batch contains the inputs for a model forward pass
--- a/model/model.go
+++ b/model/model.go
@ -40,12 +40,13 @@ type MultimodalProcessor interface {
 	// EncodeMultimodal processes a single input (such as an image) and
 	// generates an output (typically an embedding) that can be used by the model.
 	//
-	// The return value is most typically an ml.Tensor, however, different
-	// type are possible, such as an object containing a tensor plus
-	// additional metadata, a slice of tensors or even just the original input.
+	// The return value is one or more tensors, each with optional model-specific
+	// opaque metadata. Typically, the tensors might be views into an embedding
+	// with each view representing a chunk of data that can be processed independently
+	// in different batches.
 	//
 	// The result may be cached by the runner.
-	EncodeMultimodal(ml.Context, []byte) (any, error)
+	EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)

 	// PostTokenize is called after tokenization to allow the model to edit the
 	// input stream to correctly arrange multimodal elements.
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@ -45,8 +45,6 @@ func New(c fs.Config) (model.Model, error) {
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
--- a/model/models/gemma3/model.go
+++ b/model/models/gemma3/model.go
@ -82,7 +82,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@ -108,22 +108,22 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
-	return visionOutputs, nil
+	return []input.Multimodal{{Tensor: visionOutputs}}, nil
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input

 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 		} else {
-			inputMultimodal := inp.Multimodal.(ml.Tensor)
+			inputMultimodal := inp.Multimodal[0].Tensor

 			result = append(result,
-				input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3},               // "\n\n"
-				input.Input{Token: 255999},                                                   // "<start_of_image>""
-				input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
+				input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
+				input.Input{Token: 255999},                                     // "<start_of_image>"
+				input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
 			)

 			// add image token placeholders
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@ -7,6 +7,7 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )

@ -19,6 +20,9 @@ type TextConfig struct {
 }

 type TextModel struct {
+	model.Base
+	model.SentencePieceModel
+
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []TextLayer   `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
@ -41,6 +45,15 @@ func newTextModel(c fs.Config) *TextModel {
 	numBlocks := int(c.Uint("block_count"))

 	m := TextModel{
+		SentencePieceModel: model.NewSentencePieceModel(
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Scores: c.Floats("tokenizer.ggml.scores"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+			},
+		),
 		Layers: make([]TextLayer, numBlocks),
 		TextConfig: &TextConfig{
 			hiddenSize:     int(c.Uint("embedding_length")),
@ -165,7 +178,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	// set image embeddings
 	var except []int
 	for _, image := range batch.Multimodal {
-		visionOutputs := image.Multimodal.(ml.Tensor)
+		visionOutputs := image.Multimodal[0].Tensor
 		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

 		for i := range visionOutputs.Dim(1) {
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@ -47,9 +47,6 @@ func New(c fs.Config) (model.Model, error) {
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
--- a/model/models/llama4/model.go
+++ b/model/models/llama4/model.go
@ -4,7 +4,6 @@ import (
 	"bytes"
 	"image"
 	"slices"
-	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@ -45,9 +44,6 @@ func New(c fs.Config) (model.Model, error) {
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@ -63,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) < 1 {
 		return nil, model.ErrNoVisionModel
 	}
@ -103,70 +99,79 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
 	projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
-	return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
+
+	var multimodal []input.Multimodal
+	aspectRatio := image.Point{ratioW, ratioH}
+
+	var offset int
+	patchesPerChunk := projectedOutputs.Dim(1)
+	if aspectRatio.Y*aspectRatio.X > 1 {
+		patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
+
+		for range aspectRatio.Y {
+			for x := range aspectRatio.X {
+				view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
+					projectedOutputs.Dim(0), projectedOutputs.Stride(1),
+					patchesPerChunk)
+				var separator separator
+				if x < aspectRatio.X-1 {
+					separator.x = true // <|tile_x_separator|>
+				} else {
+					separator.y = true // <|tile_y_separator|>
+				}
+				multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
+				offset += patchesPerChunk
+			}
+		}
+	}
+
+	view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
+		projectedOutputs.Dim(0), projectedOutputs.Stride(1),
+		patchesPerChunk)
+	multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
+
+	return multimodal, nil
 }

-type chunks struct {
-	*Model
-	ml.Tensor
-	aspectRatio image.Point
-
-	dataOnce sync.Once
-	data     []float32
-}
-
-type chunk struct {
-	*chunks
-	s, n int
-}
-
-func (r *chunk) floats() []float32 {
-	r.dataOnce.Do(func() {
-		temp := r.Backend().NewContext()
-		defer temp.Close()
-		temp.Forward(r.Tensor).Compute(r.Tensor)
-		r.data = r.Floats()
-	})
-
-	return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
+// separator records which tile separator token follows a chunk of image
+// patches. EncodeMultimodal sets x for chunks that are not last in their row
+// and y for the last chunk of a row; PostTokenize appends
+// <|tile_x_separator|> when x is set and <|tile_y_separator|> when y is set.
+type separator struct {
+	x bool
+	y bool
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 			continue
 		}

-		t := inp.Multimodal.(*chunks)
 		var imageInputs []input.Input
 		imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>

-		var offset int
-		patchesPerChunk := t.Dim(1)
-		if t.aspectRatio.Y*t.aspectRatio.X > 1 {
-			patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
+		for i, mm := range inp.Multimodal {
+			patchesPerChunk := mm.Tensor.Dim(1)

-			for range t.aspectRatio.Y {
-				for x := range t.aspectRatio.X {
-					imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
-					imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
-					if x < t.aspectRatio.X-1 {
-						imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
-					}
-					offset += patchesPerChunk
+			if i < len(inp.Multimodal)-1 {
+				separator := mm.Data.(*separator)
+
+				imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
+				imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
+
+				if separator.x {
+					imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
 				}
-
-				imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
+				if separator.y {
+					imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
+				}
+			} else {
+				imageInputs = append(imageInputs, input.Input{Token: 200090})                                                                                                                      // <|image|>
+				imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
+				imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
+				imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
 			}
 		}

-		imageInputs = append(imageInputs, input.Input{Token: 200090})                                                                                                                 // <|image|>
-		imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
-		imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
-		imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
-
 		result = append(result, imageInputs...)
 	}

--- a/model/models/llama4/model_text.go
+++ b/model/models/llama4/model_text.go
@ -210,12 +210,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)

 	for _, mi := range batch.Multimodal {
-		f32s := mi.Multimodal.(*chunk).floats()
-		img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
-		if err != nil {
-			panic(err)
-		}
-
+		img := mi.Multimodal[0].Tensor
 		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
 	}

--- a/model/models/llama4/model_vision.go
+++ b/model/models/llama4/model_vision.go
@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 	}

 	hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps)
-	hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0)
+	hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0)
 	hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions)
 	return hiddenStates
 }
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@ -4,7 +4,6 @@ import (
 	"bytes"
 	"image"
 	"slices"
-	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@ -16,8 +15,6 @@ import (

 type Model struct {
 	model.Base
-	model.BytePairEncoding
-
 	*TextModel
 	*VisionModel         `gguf:"v,vision"`
 	*MultiModalProjector `gguf:"mm"`
@ -42,21 +39,6 @@ func New(c fs.Config) (model.Model, error) {
 		VisionModel:         newVisionModel(c),
 		ImageProcessor:      newImageProcessor(c),
 		MultiModalProjector: newMultiModalProjector(c),
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
-			},
-		),
 	}

 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@ -105,7 +87,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
 	}
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@ -129,37 +111,14 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)

 	// split into patches to be sent to the text transformer
-	parent := imageFeatures{tensor: features}
-	rows := make([]*imageRow, size.Y)
+	rows := make([]input.Multimodal, size.Y)
 	for i := range rows {
-		rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
+		rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
 	}

 	return rows, nil
 }

-type imageFeatures struct {
-	tensor ml.Tensor
-
-	dataOnce sync.Once
-	data     []float32
-}
-
-type imageRow struct {
-	parent *imageFeatures
-	s      int
-	shape  []int
-}
-
-func (r *imageRow) data() []float32 {
-	n := 1
-	for _, s := range r.shape {
-		n *= s
-	}
-
-	return r.parent.data[r.s*n : (r.s+1)*n]
-}
-
 // PostTokenize arranges Mistral 3's inputs for the forward pass
 // In Mistral 3 and Pixtral, the input patches are arranged as follows:
 // [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
@ -168,15 +127,14 @@ func (r *imageRow) data() []float32 {
 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 		} else {
-			inputMultimodal := inp.Multimodal.([]*imageRow)
-			for i, row := range inputMultimodal {
+			for i, row := range inp.Multimodal {
 				// [IMG]
-				result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
-				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
-				if i == len(inputMultimodal)-1 {
+				result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
+				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
+				if i == len(inp.Multimodal)-1 {
 					// [IMG_END]
 					result = append(result, input.Input{Token: 13})
 				} else {
--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@ -21,6 +21,7 @@ type TextOptions struct {

 type TextModel struct {
 	model.Base
+	model.BytePairEncoding

 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
@ -109,20 +110,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor

 	// image embeddings
 	for _, image := range batch.Multimodal {
-		row := image.Multimodal.(*imageRow)
-		row.parent.dataOnce.Do(func() {
-			// use a new, throwaway context so the image tensor is not added to the graph
-			temp := m.Backend().NewContext()
-			temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
-			row.parent.data = row.parent.tensor.Floats()
-			temp.Close()
-		})
-
-		imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
-		if err != nil {
-			panic(err)
-		}
-
+		imageFeature := image.Multimodal[0].Tensor
 		ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
 	}

@ -147,6 +135,18 @@ func NewTextModel(c fs.Config) (*TextModel, error) {
 	}

 	textModel := &TextModel{
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+			},
+		),
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
 			hiddenSize: int(c.Uint("embedding_length")),
--- a/model/models/mllama/imageproc.go
+++ b/model/models/mllama/imageproc.go
@ -0,0 +1,201 @@
+package mllama
+
+import (
+	"fmt"
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+	"io"
+	"math"
+	"slices"
+
+	"golang.org/x/image/draw"
+
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+// getSupportedAspectRatios lists every tile grid, as (tiles wide, tiles high),
+// whose total tile count does not exceed maxTiles. Grids are ordered by width
+// first and then by height.
+func getSupportedAspectRatios(maxTiles int) []image.Point {
+	supported := []image.Point{}
+
+	for cols := 1; cols <= maxTiles; cols++ {
+		for rows := 1; rows <= maxTiles; rows++ {
+			if cols*rows > maxTiles {
+				continue
+			}
+			supported = append(supported, image.Point{cols, rows})
+		}
+	}
+
+	return supported
+}
+
+// clip limits a to the range [a_min, a_max]. The lower bound is tested before
+// the upper bound.
+func clip(a, a_min, a_max int) int {
+	switch {
+	case a < a_min:
+		return a_min
+	case a > a_max:
+		return a_max
+	default:
+		return a
+	}
+}
+
+// getOptimalTiledCanvas picks the canvas that best fits an image of imageSize
+// without distortion. Candidate canvases are the layouts returned by
+// getSupportedAspectRatios(maxImageTiles), with each side multiplied by
+// tileSize.
+//
+// For every candidate the limiting scale factor is the smaller of the width
+// and height ratios between canvas and image. If any candidate holds the image
+// at its original size or larger (scale >= 1), the smallest such scale is
+// selected; otherwise the largest downscale is selected. When several
+// candidates share the selected scale, the one with the smallest area wins.
+// The zero Point is returned when no candidate matches, for example when there
+// are no candidates at all.
+func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
+	possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
+
+	possibleCanvasSizes := make([]image.Point, 0, len(possibleTileArrangements))
+	scales := make([]float64, 0, len(possibleTileArrangements))
+	for _, pta := range possibleTileArrangements {
+		pcs := image.Point{pta.X * tileSize, pta.Y * tileSize}
+		possibleCanvasSizes = append(possibleCanvasSizes, pcs)
+
+		// the limiting side decides how far the image can scale without distortion
+		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
+		scaleWidth := float64(pcs.X) / float64(imageSize.X)
+		if scaleWidth > scaleHeight {
+			scales = append(scales, scaleHeight)
+		} else {
+			scales = append(scales, scaleWidth)
+		}
+	}
+
+	var minUpscale, maxDownscale float64
+	var upscale bool
+
+	for _, s := range scales {
+		if s >= 1.0 {
+			// a scale of exactly 1 means the image already fits this canvas, so
+			// it must count as an upscaling option; treating it as a downscale
+			// would push exact-fit images onto a needlessly larger canvas
+			if !upscale || s < minUpscale {
+				minUpscale = s
+			}
+			upscale = true
+		} else {
+			maxDownscale = math.Max(maxDownscale, s)
+		}
+	}
+
+	selectedScale := maxDownscale
+	if upscale {
+		selectedScale = minUpscale
+	}
+
+	var selectedCanvas image.Point
+	for n, pcs := range possibleCanvasSizes {
+		if scales[n] != selectedScale {
+			continue
+		}
+
+		// choose the smallest possible canvas
+		if selectedCanvas == (image.Point{}) || pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
+			selectedCanvas = pcs
+		}
+	}
+	return selectedCanvas
+}
+
+// getImageSizeFitToCanvas computes the size an image should be resized to so
+// that it fits canvasSize while keeping its aspect ratio. Each target
+// dimension is the image dimension clipped to [tileSize, canvas dimension];
+// the smaller of the two resulting scale factors is then applied to the other
+// side, rounded down and capped at its target.
+func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
+	target := image.Point{
+		X: clip(imageSize.X, tileSize, canvasSize.X),
+		Y: clip(imageSize.Y, tileSize, canvasSize.Y),
+	}
+
+	scaleX := float64(target.X) / float64(imageSize.X)
+	scaleY := float64(target.Y) / float64(imageSize.Y)
+
+	if scaleX < scaleY {
+		// width is the limiting side
+		scaledHeight := int(math.Floor(float64(imageSize.Y) * scaleX))
+		return image.Point{target.X, min(scaledHeight, target.Y)}
+	}
+
+	// height is the limiting side, or both sides scale equally
+	scaledWidth := int(math.Floor(float64(imageSize.X) * scaleY))
+	return image.Point{min(scaledWidth, target.X), target.Y}
+}
+
+// resizeImage scales img to fit the best tiled canvas for its dimensions and
+// returns the resized image together with the canvas layout in tiles
+// (X tiles wide, Y tiles high).
+//
+// Input whose format is "png" is first passed through imageproc.Composite.
+// Only outputSize.Y is used as the tile edge length, so tiles are treated as
+// square. maxImageTiles bounds the total number of tiles in the canvas.
+func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
+	if format == "png" {
+		img = imageproc.Composite(img)
+	}
+
+	// measure the extent of the bounds rather than reading its Max corner so
+	// images whose origin is not (0, 0) are sized correctly
+	size := img.Bounds().Size()
+	tileSize := outputSize.Y
+
+	canvasSize := getOptimalTiledCanvas(size, maxImageTiles, tileSize)
+	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
+	newSize := getImageSizeFitToCanvas(size, canvasSize, tileSize)
+
+	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
+}
+
+// padImage draws img into the top-left corner of a new RGBA canvas measuring
+// outputSize.X*aspectRatio.X by outputSize.Y*aspectRatio.Y pixels and returns
+// the canvas. Pixels not covered by img keep the zero value that
+// image.NewRGBA allocates.
+func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
+	paddedSize := image.Point{
+		X: outputSize.X * aspectRatio.X,
+		Y: outputSize.Y * aspectRatio.Y,
+	}
+	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
+
+	// read from the source's own origin and write at the canvas origin so an
+	// image whose bounds do not start at (0, 0) still lands in the top-left
+	// corner instead of being shifted or skipped
+	b := img.Bounds()
+	draw.Draw(dst, image.Rectangle{Max: b.Size()}, img, b.Min, draw.Over)
+
+	return dst
+}
+
+// splitToTiles cuts img into numTilesSize.X columns by numTilesSize.Y rows of
+// equally sized tiles and returns them row by row, left to right. Each tile is
+// produced by the image's SubImage method, so the function panics if img does
+// not provide one. A zero tile count in either direction panics with a
+// division by zero.
+func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
+	b := img.Bounds()
+	tileHeight := b.Dy() / numTilesSize.Y
+	tileWidth := b.Dx() / numTilesSize.X
+
+	images := []image.Image{}
+
+	for h := range numTilesSize.Y {
+		for w := range numTilesSize.X {
+			// offset by b.Min so images whose origin is not (0, 0) are tiled
+			// inside their own bounds
+			x0 := b.Min.X + tileWidth*w
+			y0 := b.Min.Y + tileHeight*h
+			rect := image.Rect(x0, y0, x0+tileWidth, y0+tileHeight)
+			images = append(images, img.(interface {
+				SubImage(image.Rectangle) image.Image
+			}).SubImage(rect))
+		}
+	}
+
+	return images
+}
+
+// packImages splits img into the tile grid given by aspectRatio, normalizes
+// each tile with imageproc.ClipDefaultMean and imageproc.ClipDefaultSTD, and
+// concatenates the per-tile values in tile order.
+func packImages(img image.Image, aspectRatio image.Point) []float32 {
+	const (
+		rescale      = true
+		channelFirst = true
+	)
+
+	var packed []float32
+	for _, tile := range splitToTiles(img, aspectRatio) {
+		normalized := imageproc.Normalize(tile, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
+		packed = append(packed, normalized...)
+	}
+
+	return packed
+}
+
+// Preprocess decodes an image from imageData, resizes and pads it onto a
+// canvas of at most four 560x560 tiles, and returns the packed pixel values
+// together with an options map. The map holds "aspectRatioIndex": the 1-based
+// position of the chosen tile layout in getSupportedAspectRatios. An error is
+// returned if the image cannot be decoded.
+func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
+	const maxTiles = 4
+	tileSize := image.Point{560, 560}
+
+	decoded, format, err := image.Decode(imageData)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
+	}
+
+	resized, aspectRatio := resizeImage(decoded, format, tileSize, maxTiles)
+	padded := padImage(resized, tileSize, aspectRatio)
+	pixels := packImages(padded, aspectRatio)
+
+	// aspect ratio ids are 1-based
+	aspectRatioIndex := 1 + slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio)
+
+	return pixels, map[string]any{
+		"aspectRatioIndex": aspectRatioIndex,
+	}, nil
+}
--- a/model/models/mllama/imageproc_test.go
+++ b/model/models/mllama/imageproc_test.go
@ -0,0 +1,420 @@
+package mllama
+
+import (
+	"bytes"
+	"image"
+	"image/png"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+// TestAspectRatios checks the tile layouts produced for tile budgets 1 to 4.
+func TestAspectRatios(t *testing.T) {
+	tests := []struct {
+		maxTiles int
+		want     []image.Point
+	}{
+		{1, []image.Point{{1, 1}}},
+		{2, []image.Point{{1, 1}, {1, 2}, {2, 1}}},
+		{3, []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}}},
+		{4, []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}}},
+	}
+
+	for _, tt := range tests {
+		got := getSupportedAspectRatios(tt.maxTiles)
+		if diff := cmp.Diff(got, tt.want); diff != "" {
+			t.Errorf("mismatch (-got +want):\n%s", diff)
+		}
+	}
+}
+
+// TestGetImageSizeFitToCanvas checks the resize target computed for several
+// image, canvas and tile size combinations.
+func TestGetImageSizeFitToCanvas(t *testing.T) {
+	tests := []struct {
+		imageSize  image.Point
+		canvasSize image.Point
+		tileSize   int
+		want       image.Point
+	}{
+		{image.Point{400, 400}, image.Point{640, 480}, 200, image.Point{400, 400}},
+		{image.Point{1024, 768}, image.Point{640, 480}, 200, image.Point{640, 480}},
+		{image.Point{500, 500}, image.Point{1000, 1000}, 750, image.Point{750, 750}},
+		{image.Point{500, 1000}, image.Point{2000, 2000}, 2000, image.Point{1000, 2000}},
+		{image.Point{4000, 3000}, image.Point{2000, 1000}, 1000, image.Point{1333, 1000}},
+		{image.Point{667, 1000}, image.Point{1000, 1000}, 560, image.Point{667, 1000}},
+	}
+
+	for _, tt := range tests {
+		got := getImageSizeFitToCanvas(tt.imageSize, tt.canvasSize, tt.tileSize)
+		if got == tt.want {
+			continue
+		}
+		t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", got, tt.want)
+	}
+}
+
+// TestGetOptimalTiledCanvas checks the canvas chosen for landscape, portrait,
+// tiny and oversized images with a budget of four tiles.
+func TestGetOptimalTiledCanvas(t *testing.T) {
+	tests := []struct {
+		imageSize image.Point
+		maxTiles  int
+		tileSize  int
+		want      image.Point
+	}{
+		{image.Point{1024, 768}, 4, 1000, image.Point{2000, 1000}},
+		{image.Point{1024, 768}, 4, 560, image.Point{1120, 1120}},
+		{image.Point{800, 600}, 4, 560, image.Point{1120, 1120}},
+		{image.Point{640, 480}, 4, 560, image.Point{1120, 560}},
+		{image.Point{320, 200}, 4, 560, image.Point{560, 560}},
+		{image.Point{1320, 200}, 4, 560, image.Point{1680, 560}},
+		{image.Point{2000, 200}, 4, 560, image.Point{2240, 560}},
+		{image.Point{10000, 200}, 4, 560, image.Point{2240, 560}},
+		{image.Point{480, 640}, 4, 560, image.Point{560, 1120}},
+		{image.Point{200, 320}, 4, 560, image.Point{560, 560}},
+		{image.Point{200, 1320}, 4, 560, image.Point{560, 1680}},
+		{image.Point{200, 2000}, 4, 560, image.Point{560, 2240}},
+		{image.Point{200, 10000}, 4, 560, image.Point{560, 2240}},
+		{image.Point{10000, 10000}, 4, 560, image.Point{1120, 1120}},
+	}
+
+	for _, tt := range tests {
+		got := getOptimalTiledCanvas(tt.imageSize, tt.maxTiles, tt.tileSize)
+		if got == tt.want {
+			continue
+		}
+		t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", got, tt.want)
+	}
+}
+
+// TestSplitToTiles checks the number of tiles returned by splitToTiles and the
+// bounds of each tile for 1x1, 2x1 and 2x2 layouts.
+func TestSplitToTiles(t *testing.T) {
+	type splitCase struct {
+		TestImage    image.Image
+		NumTilesSize image.Point
+		Expected     []image.Image
+	}
+
+	cases := []splitCase{
+		{
+			TestImage:    image.NewRGBA(image.Rect(0, 0, 1024, 768)),
+			NumTilesSize: image.Point{1, 1},
+			Expected:     []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
+		},
+		{
+			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 500)),
+			NumTilesSize: image.Point{2, 1},
+			Expected: []image.Image{
+				image.NewRGBA(image.Rect(0, 0, 500, 500)),
+				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
+			},
+		},
+		{
+			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
+			NumTilesSize: image.Point{2, 2},
+			Expected: []image.Image{
+				image.NewRGBA(image.Rect(0, 0, 500, 500)),
+				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
+				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
+				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
+			},
+		},
+	}
+
+	for _, c := range cases {
+		actual := splitToTiles(c.TestImage, c.NumTilesSize)
+
+		if len(actual) != len(c.Expected) {
+			t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
+			// the comparison below indexes both slices in lockstep, so skip it
+			// rather than risk an index out of range panic
+			continue
+		}
+
+		for i := range actual {
+			if actual[i].Bounds() != c.Expected[i].Bounds() {
+				t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
+			}
+		}
+	}
+}
+
+// TestResize checks the bounds of the resized image and the tile layout
+// reported by resizeImage. Every input is treated as PNG.
+func TestResize(t *testing.T) {
+	tests := []struct {
+		input           image.Image
+		outputSize      image.Point
+		maxTiles        int
+		wantBounds      image.Rectangle
+		wantAspectRatio image.Point
+	}{
+		{image.NewRGBA(image.Rect(0, 0, 200, 200)), image.Point{100, 100}, 1, image.Rect(0, 0, 100, 100), image.Point{1, 1}},
+		{image.NewRGBA(image.Rect(0, 0, 200, 200)), image.Point{100, 100}, 2, image.Rect(0, 0, 100, 100), image.Point{1, 1}},
+		{image.NewRGBA(image.Rect(0, 0, 10, 10)), image.Point{560, 560}, 4, image.Rect(0, 0, 560, 560), image.Point{1, 1}},
+		{image.NewRGBA(image.Rect(0, 0, 2560, 1920)), image.Point{560, 560}, 4, image.Rect(0, 0, 1120, 840), image.Point{2, 2}},
+		{image.NewRGBA(image.Rect(0, 0, 1024, 768)), image.Point{560, 560}, 4, image.Rect(0, 0, 1024, 768), image.Point{2, 2}},
+	}
+
+	for _, tt := range tests {
+		gotImage, gotAspectRatio := resizeImage(tt.input, "png", tt.outputSize, tt.maxTiles)
+
+		if gotBounds := gotImage.Bounds(); gotBounds != tt.wantBounds {
+			t.Errorf("image size incorrect: '%#v': expected: '%#v'", gotBounds, tt.wantBounds)
+		}
+
+		if gotAspectRatio != tt.wantAspectRatio {
+			t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", gotAspectRatio, tt.wantAspectRatio)
+		}
+	}
+}
+
+// TestPad checks that padImage returns a canvas covering the full tile grid.
+func TestPad(t *testing.T) {
+	tests := []struct {
+		input       image.Image
+		outputSize  image.Point
+		aspectRatio image.Point
+		wantBounds  image.Rectangle
+	}{
+		{image.NewRGBA(image.Rect(0, 0, 1000, 667)), image.Point{560, 560}, image.Point{2, 2}, image.Rect(0, 0, 1120, 1120)},
+	}
+
+	for _, tt := range tests {
+		gotBounds := padImage(tt.input, tt.outputSize, tt.aspectRatio).Bounds()
+		if gotBounds == tt.wantBounds {
+			continue
+		}
+		t.Errorf("image size incorrect: '%#v': expected: '%#v'", gotBounds, tt.wantBounds)
+	}
+}
+
+// TestPackImages checks the number of values packImages returns for several
+// canvas sizes and tile layouts.
+func TestPackImages(t *testing.T) {
+	tests := []struct {
+		input       image.Image
+		aspectRatio image.Point
+		wantLen     int
+	}{
+		{image.NewRGBA(image.Rect(0, 0, 1120, 1120)), image.Point{2, 2}, 2 * 2 * 3 * 560 * 560},
+		{image.NewRGBA(image.Rect(0, 0, 560, 560)), image.Point{1, 1}, 1 * 1 * 3 * 560 * 560},
+		{image.NewRGBA(image.Rect(0, 0, 1120, 560)), image.Point{1, 2}, 1 * 2 * 3 * 560 * 560},
+	}
+
+	for _, tt := range tests {
+		gotLen := len(packImages(tt.input, tt.aspectRatio))
+		if gotLen == tt.wantLen {
+			continue
+		}
+		t.Errorf("packed image size incorrect: '%d': expected: '%d'", gotLen, tt.wantLen)
+	}
+}
+
+// TestPreprocess encodes synthetic images as PNG, runs them through
+// Preprocess and checks that pixel data is returned and that the reported
+// aspect ratio index matches the expected tile layout.
+func TestPreprocess(t *testing.T) {
+	type preprocessCase struct {
+		TestImage             image.Image
+		ExpectedAspectRatioID int
+	}
+
+	cases := []preprocessCase{
+		{
+			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
+			ExpectedAspectRatioID: 1,
+		},
+		{
+			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
+			ExpectedAspectRatioID: 6,
+		},
+	}
+
+	for _, c := range cases {
+		var buf bytes.Buffer
+		err := png.Encode(&buf, c.TestImage)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		imgData, opts, err := Preprocess(&buf)
+		if err != nil {
+			t.Fatalf("error processing: %q", err)
+		}
+
+		if len(imgData) == 0 {
+			t.Errorf("no image data returned")
+		}
+
+		ar, ok := opts["aspectRatioIndex"]
+		if !ok {
+			t.Fatalf("no aspect ratio found")
+		}
+
+		// fail the test instead of panicking if the option has the wrong type
+		aspectRatioID, ok := ar.(int)
+		if !ok {
+			t.Fatalf("aspect ratio index has unexpected type %T", ar)
+		}
+
+		if aspectRatioID != c.ExpectedAspectRatioID {
+			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
+		}
+	}
+}
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@ -2,7 +2,11 @@ package mllama

 import (
 	"bytes"
+	"encoding/binary"
+	"fmt"
+	"hash/fnv"
 	"image"
+	"slices"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@ -30,6 +34,10 @@ const (
 )

 func New(c fs.Config) (model.Model, error) {
+	// Verify unified config
+	if c.Uint("vision.block_count") == 0 {
+		return nil, fmt.Errorf("non-unified vision model not supported")
+	}
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@ -41,9 +49,6 @@ func New(c fs.Config) (model.Model, error) {
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@ -58,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@ -68,42 +73,67 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	f32s, ratio, err := m.ImageProcessor.ProcessImage(image)
+	f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image)
 	if err != nil {
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.numChannels,
+		m.ImageProcessor.maxNumTiles,
+	)
 	if err != nil {
 		return nil, err
 	}

-	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
-
-	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
+	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1)
 	if err != nil {
 		return nil, err
 	}

 	positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
 	crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
-	return m.Projector.Forward(ctx, crossAttentionStates), nil
+	return []input.Multimodal{{Tensor: m.Projector.Forward(ctx, crossAttentionStates)}}, nil
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
+	var images []input.Input
+	fnvHash := fnv.New64a()
+
 	for i := range inputs {
-		if inputs[i].Multimodal != nil {
-			inputs[i].Token = 128256 // <|image|>
+		if len(inputs[i].Multimodal) == 0 {
+			if len(images) > 0 {
+				inputs[i].Multimodal = images[0].Multimodal
+				inputs[i].MultimodalHash = images[0].MultimodalHash
+				for j := 1; j < len(images); j++ {
+					inputs[i].Multimodal = append(inputs[i].Multimodal, images[j].Multimodal...)
+					fnvHash.Reset()
+					binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash)
+					binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash)
+					inputs[i].MultimodalHash = fnvHash.Sum64()
+				}
+				images = nil
+			}
+		} else {
+			images = append(images, inputs[i])
+			inputs[i].Token = -1
 		}
 	}

+	inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 })
+
 	return inputs, nil
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	var crossAttentionStates ml.Tensor
 	if len(batch.Multimodal) > 0 {
-		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
+		images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal
+		if len(images) > 0 {
+			crossAttentionStates = images[len(images)-1].Tensor
+		}
 	}

 	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
@ -117,7 +147,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	}

 	// TODO: attention mask, cross attention mask
-	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
+	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
 }

 func init() {
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@ -18,7 +18,7 @@ type TextSelfAttention struct {
 	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
 }

-func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
 	ropeType := uint32(0)
@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct {
 	MLP     *TextMLP
 }

-func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
+	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)

 	// In the final layer (outputs != nil), optimize by pruning to just the token positions
 	// we need logits for.
@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct {
 	MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
 }

-func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
 }

 type TextDecoderLayer interface {
-	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
+	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
 }

 type TextDecoder struct {
 	Layers []TextDecoderLayer
 }

-func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	for i, layer := range d.Layers {
 		layerType := selfAttentionLayer
 		if slices.Contains(opts.crossAttentionLayers, int32(i)) {
@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
 				lastLayerOutputs = outputs
 			}

-			hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts)
+			hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts)
 		}
 	}

@ -214,9 +214,9 @@ type TextModel struct {
 	*TextModelOptions
 }

-func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
+func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
-	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
+	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
 	return m.Output.Forward(ctx, hiddenState)
 }
--- a/model/models/mllama/model_vision.go
+++ b/model/models/mllama/model_vision.go
@ -15,7 +15,7 @@ type VisionSelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_output"`
+	Output *nn.Linear `gguf:"attn_out"`

 	Gate ml.Tensor `gguf:"attn_gate"`
 }
@ -45,29 +45,36 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)

 	hiddenState = sa.Output.Forward(ctx, attention)
+	if sa.Gate != nil {
+		hiddenState = hiddenState.Mul(ctx, sa.Gate)
+	}
+
 	return hiddenState
 }

 type VisionMLP struct {
-	Up   *nn.Linear `gguf:"ffn_up"`
 	Down *nn.Linear `gguf:"ffn_down"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+
+	Gate ml.Tensor `gguf:"ffn_gate"`
 }

+// Forward runs the vision MLP: the Down projection followed by GELU, then the
+// Up projection, and finally an element-wise multiply by Gate when one is
+// present.
 func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
-	hiddenState = mlp.Down.Forward(ctx, hiddenState)
+	// NOTE(review): Down is applied before Up here, the reverse of the removed
+	// lines — presumably this matches how the tensors are named in the model
+	// files this version loads; confirm.
+	hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
+	hiddenState = mlp.Up.Forward(ctx, hiddenState)
+	if mlp.Gate != nil {
+		hiddenState = hiddenState.Mul(ctx, mlp.Gate)
+	}

 	return hiddenState
 }

 type VisionEncoderLayer struct {
-	AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
+	AttentionNorm *nn.LayerNorm `gguf:"ln1"`
 	SelfAttention *VisionSelfAttention
-	AttentionGate ml.Tensor `gguf:"attn_gate"`

-	MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
+	MLPNorm *nn.LayerNorm `gguf:"ln2"`
 	MLP     *VisionMLP
-	MLPGate ml.Tensor `gguf:"ffn_gate"`
 }

 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@ -76,22 +83,13 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
-
-	if e.AttentionGate != nil {
-		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
-	}
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState

 	// feed forward
 	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
-	hiddenState = hiddenState.Add(ctx, residual)
-	if e.MLPGate != nil {
-		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
-	}
-
-	return hiddenState
+	return hiddenState.Add(ctx, residual)
 }

 type VisionEncoder struct {
@ -116,9 +114,9 @@ type PrecomputedAspectRatioEmbedding struct {
 	Gate      ml.Tensor `gguf:"gate"`
 }

-func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
+func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
-	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
+	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
 	if e.Gate != nil {
 		embeddings = embeddings.Mul(ctx, e.Gate)
 	}
@ -134,7 +132,7 @@ type PrecomputedPositionEmbedding struct {
 	TilePositionEmbeddingGate ml.Tensor     `gguf:"tile_position_embd.gate"`
 }

-func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
+func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor {
 	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
 	if e.PositionEmbeddingGate != nil {
 		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
@ -143,7 +141,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
 	hiddenState = hiddenState.Add(ctx, positionEmbedding)

 	tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
-	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
+	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
 	if e.TilePositionEmbeddingGate != nil {
 		tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
 	}
@ -152,9 +150,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
 }

 type VisionModelOptions struct {
-	hiddenSize, numHeads int
-	imageSize, patchSize int
-	eps                  float32
+	hiddenSize, numHeads, numTiles int
+	imageSize, patchSize           int
+	eps                            float32

 	intermediateLayersIndices []int32
 }
@ -183,16 +181,14 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
 		numPositions++
 	}

-	numTiles := pixelValues.Dim(3)
-
 	hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
-	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
+	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
 	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

-	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
-	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)
+	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
+	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1)

-	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
+	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
 	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)

 	numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
@ -203,18 +199,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa

 	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
 	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)

 	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
-	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)
+	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
 	return hiddenState.Concat(ctx, hiddenStates, 0)
 }

@ -226,6 +222,7 @@ func newVisionModel(c fs.Config) *VisionModel {
 		VisionModelOptions: &VisionModelOptions{
 			hiddenSize: int(c.Uint("vision.embedding_length")),
 			numHeads:   int(c.Uint("vision.attention.head_count")),
+			numTiles:   int(c.Uint("vision.max_num_tiles")),

 			imageSize: int(c.Uint("vision.image_size")),
 			patchSize: int(c.Uint("vision.patch_size")),
--- a/model/models/mllama/process_image.go
+++ b/model/models/mllama/process_image.go
@ -2,31 +2,17 @@ package mllama

 import (
 	"image"
+	"image/color"
 	"math"
 	"slices"

 	"golang.org/x/image/draw"

 	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/model/imageproc"
 )

-type supportedAspectRatio struct {
-	rank, width, height int
-}
-
-func (a supportedAspectRatio) Point() image.Point {
-	return image.Point{a.width, a.height}
-}
-
-func (a supportedAspectRatio) numTiles() int {
-	return a.width * a.height
-}
-
 type ImageProcessor struct {
 	imageSize, numChannels, maxNumTiles int
-
-	mean, std [3]float32
 }

 func newImageProcessor(c fs.Config) ImageProcessor {
@ -34,49 +20,71 @@ func newImageProcessor(c fs.Config) ImageProcessor {
 		imageSize:   int(c.Uint("vision.image_size")),
 		numChannels: int(c.Uint("vision.num_channels")),
 		maxNumTiles: int(c.Uint("vision.max_num_tiles")),
-
-		mean: imageproc.ClipDefaultMean,
-		std:  imageproc.ClipDefaultSTD,
 	}
 }

-func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
-	for w := 1; w <= p.maxNumTiles; w++ {
-		for h := 1; h <= p.maxNumTiles/w; h++ {
-			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
+// supportedAspectRatios returns every tile arrangement (X tiles wide, Y tiles
+// high, both at least 1) whose tile count X*Y does not exceed maxTiles. The
+// result is ordered by increasing width, then increasing height, and is a
+// non-nil empty slice when no arrangement fits.
+func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
+	ratios := []image.Point{}
+
+	for w := range maxTiles {
+		for h := range maxTiles {
+			// w and h are zero-based; the arrangement itself is (w+1) x (h+1).
+			if (w+1)*(h+1) <= maxTiles {
+				ratios = append(ratios, image.Point{w + 1, h + 1})
+			}
 		}
 	}
+
 	return ratios
 }

-func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
-	tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
-	th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
+// clip limits a to the range [a_min, a_max]: values below a_min return a_min,
+// values above a_max return a_max, and anything else is returned unchanged.
+// The lower bound is checked first.
+func (p *ImageProcessor) clip(a, a_min, a_max int) int {
+	if a < a_min {
+		return a_min
+	} else if a > a_max {
+		return a_max
+	}

-	r := math.Min(
-		float64(tw)/float64(imageSize.X),
-		float64(th)/float64(imageSize.Y),
-	)
+	return a
+}

-	w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
-	h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
+// fitToCanvas computes the size an image of imageSize should be resized to so
+// that it fits canvasSize. Each target dimension is the image dimension
+// clipped to [tileSize, canvas dimension]; the smaller of the two resulting
+// scale factors is applied to both axes, and the scaled dimension is floored
+// and capped at its target.
+func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
+	targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
+	targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
+
+	scaleWidth := float64(targetWidth) / float64(imageSize.X)
+	scaleHeight := float64(targetHeight) / float64(imageSize.Y)
+
+	var w, h int
+
+	// The axis with the smaller scale factor is the limiting one: it takes its
+	// target size exactly and the other axis is scaled by the same factor.
+	if scaleWidth < scaleHeight {
+		w = targetWidth
+		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
+	} else {
+		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
+		h = targetHeight
+	}

 	return image.Point{w, h}
 }

-func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
-	possibleTileArrangements := p.supportedAspectRatios()
-	possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
-	for i, pta := range possibleTileArrangements {
-		possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
+func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
+	possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
+	possibleCanvasSizes := []image.Point{}
+	for _, pta := range possibleTileArrangements {
+		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
 	}

-	scales := make([]float64, len(possibleCanvasSizes))
-	for i, pcs := range possibleCanvasSizes {
-		scales[i] = min(
-			float64(pcs.Y)/float64(imageSize.Y),
-			float64(pcs.X)/float64(imageSize.X),
-		)
+	scales := []float64{}
+
+	for _, pcs := range possibleCanvasSizes {
+		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
+		scaleWidth := float64(pcs.X) / float64(imageSize.X)
+
+		if scaleWidth > scaleHeight {
+			scales = append(scales, scaleHeight)
+		} else {
+			scales = append(scales, scaleWidth)
+		}
 	}

 	var minUpscale float64
@ -115,41 +123,47 @@ func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
 	return selectedCanvas
 }

-func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
+// splitToTiles cuts img into numTilesSize.X by numTilesSize.Y equally sized
+// tiles and returns them row by row (left to right, then top to bottom).
+// Tiles are views into img when it provides a SubImage method; otherwise each
+// tile is copied into a new RGBA image so that any image.Image is accepted
+// instead of panicking on a failed type assertion.
+//
+// Tile rectangles are computed from the origin (0, 0), and both components of
+// numTilesSize must be non-zero because they are used as divisors.
+func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
 	b := img.Bounds()
 	width := b.Max.X - b.Min.X
 	height := b.Max.Y - b.Min.Y
 	tileHeight := height / numTilesSize.Y
 	tileWidth := width / numTilesSize.X

 	images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)

 	for h := range numTilesSize.Y {
 		for w := range numTilesSize.X {
 			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
 			if subImg, ok := img.(interface {
 				SubImage(image.Rectangle) image.Image
 			}); ok {
 				images = append(images, subImg.SubImage(rect))
 			} else {
 				// Handle the case where img does not implement SubImage
 				// This is a fallback and may not be efficient
 				newImg := image.NewRGBA(rect)
 				draw.Draw(newImg, rect, img, rect.Min, draw.Src)
 				images = append(images, newImg)
 			}
 		}
 	}

 	return images
 }

-func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
-	b := img.Bounds()
+// compositeImage removes the alpha channel from img by drawing it over an
+// opaque white background. It returns a new RGBA image with the same bounds
+// as img; img itself is not modified.
+//
+//nolint:unused
+func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
+	dst := image.NewRGBA(img.Bounds())

-	canvasSize := p.optimalTiledCanvas(b.Max)
-	aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
-	newSize := p.fitToCanvas(b.Max, canvasSize)
+	white := color.RGBA{255, 255, 255, 255}
+	// Fill with white first (Src), then blend the source on top (Over).
+	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
+	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
+
+	return dst
+}
+
+func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
+	b := img.Bounds()
+	tileSize := outputSize.Y
+
+	canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
+	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
+	newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)

 	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))

@ -163,10 +177,10 @@ func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
 	return dst, aspectRatio
 }

-func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
+func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
 	paddedSize := image.Point{
-		X: p.imageSize * aspectRatio.X,
-		Y: p.imageSize * aspectRatio.Y,
+		X: outputSize.X * aspectRatio.X,
+		Y: outputSize.Y * aspectRatio.Y,
 	}

 	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
@ -175,7 +189,7 @@ func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Imag
 	return dst
 }

-func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
+func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
 	subImages := p.splitToTiles(img, aspectRatio)

 	var pixelVals []float32
@ -191,9 +205,9 @@ func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32
 				gVal := float32(g>>8) / 255.0
 				bVal := float32(b>>8) / 255.0

-				rVal = (rVal - p.mean[0]) / p.std[0]
-				gVal = (gVal - p.mean[1]) / p.std[1]
-				bVal = (bVal - p.mean[2]) / p.std[2]
+				rVal = (rVal - mean[0]) / std[0]
+				gVal = (gVal - mean[1]) / std[1]
+				bVal = (bVal - mean[2]) / std[2]

 				rVals = append(rVals, rVal)
 				gVals = append(gVals, gVal)
@ -208,15 +222,17 @@ func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32
 	return pixelVals
 }

-func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
-	newImage, newImageRatio := p.resize(img)
-	newImage = p.pad(newImage, newImageRatio)
-	pixelValues := p.pack(newImage, newImageRatio)
+// ProcessImage resizes img onto a tiled canvas, pads it to the full canvas
+// size, and packs the tiles into normalized float32 pixel values. It returns
+// the packed values and the 1-based position of the chosen tile arrangement
+// in supportedAspectRatios(p.maxNumTiles). The returned error is always nil.
+//
+// NOTE(review): if the arrangement is not found in the list, slices.Index
+// returns -1 and the reported ID is 0 — confirm callers treat 0 as intended.
+func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
+	outputSize := image.Point{p.imageSize, p.imageSize}

-	supportedAspectRatios := p.supportedAspectRatios()
-	aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
-		return i.width == newImageRatio.X && i.height == newImageRatio.Y
-	})
+	// per-channel normalization mean and std (the removed code took these
+	// from imageproc.ClipDefaultMean and imageproc.ClipDefaultSTD)
+	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
+	std := [3]float32{0.26862954, 0.26130258, 0.27577711}

-	return pixelValues, supportedAspectRatios[aspectRatioID], nil
+	newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
+	newImage = p.pad(newImage, outputSize, aspectRatio)
+
+	data := p.pack(newImage, aspectRatio, mean, std)
+	aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
+	return data, aspectRatioIndex, nil
 }
--- a/model/models/mllama/process_image_test.go
+++ b/model/models/mllama/process_image_test.go
@ -1,387 +0,0 @@
-package mllama
-
-import (
-	"image"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-func TestSupportedAspectRatios(t *testing.T) {
-	cases := []struct {
-		p    ImageProcessor
-		want []supportedAspectRatio
-	}{
-		{
-			p: ImageProcessor{maxNumTiles: 1},
-			want: []supportedAspectRatio{
-				{1, 1, 1},
-			},
-		},
-		{
-			p: ImageProcessor{maxNumTiles: 2},
-			want: []supportedAspectRatio{
-				{1, 1, 1},
-				{2, 1, 2},
-				{3, 2, 1},
-			},
-		},
-		{
-			p: ImageProcessor{maxNumTiles: 3},
-			want: []supportedAspectRatio{
-				{1, 1, 1},
-				{2, 1, 2},
-				{3, 1, 3},
-				{4, 2, 1},
-				{5, 3, 1},
-			},
-		},
-		{
-			p: ImageProcessor{maxNumTiles: 4},
-			want: []supportedAspectRatio{
-				{1, 1, 1},
-				{2, 1, 2},
-				{3, 1, 3},
-				{4, 1, 4},
-				{5, 2, 1},
-				{6, 2, 2},
-				{7, 3, 1},
-				{8, 4, 1},
-			},
-		},
-	}
-
-	for _, tt := range cases {
-		actual := tt.p.supportedAspectRatios()
-		if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	}
-}
-
-func TestFitToCanvas(t *testing.T) {
-	cases := []struct {
-		p      ImageProcessor
-		image  image.Point
-		canvas image.Point
-		expect image.Point
-	}{
-		{
-			p:      ImageProcessor{imageSize: 200},
-			image:  image.Point{400, 400},
-			canvas: image.Point{640, 480},
-			expect: image.Point{400, 400},
-		},
-		{
-			p:      ImageProcessor{imageSize: 200},
-			image:  image.Point{1024, 768},
-			canvas: image.Point{640, 480},
-			expect: image.Point{640, 480},
-		},
-		{
-			p:      ImageProcessor{imageSize: 750},
-			image:  image.Point{500, 500},
-			canvas: image.Point{1000, 1000},
-			expect: image.Point{750, 750},
-		},
-		{
-			p:      ImageProcessor{imageSize: 2000},
-			image:  image.Point{500, 1000},
-			canvas: image.Point{2000, 2000},
-			expect: image.Point{1000, 2000},
-		},
-		{
-			p:      ImageProcessor{imageSize: 1000},
-			image:  image.Point{4000, 3000},
-			canvas: image.Point{2000, 1000},
-			expect: image.Point{1333, 1000},
-		},
-		{
-			p:      ImageProcessor{imageSize: 560},
-			image:  image.Point{667, 1000},
-			canvas: image.Point{1000, 1000},
-			expect: image.Point{667, 1000},
-		},
-	}
-
-	for _, tt := range cases {
-		actual := tt.p.fitToCanvas(tt.image, tt.canvas)
-		if diff := cmp.Diff(actual, tt.expect); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	}
-}
-
-func TestOptimalTiledCanvas(t *testing.T) {
-	cases := []struct {
-		p      ImageProcessor
-		image  image.Point
-		expect image.Point
-	}{
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 1000},
-			image:  image.Point{1024, 768},
-			expect: image.Point{2000, 1000},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{1024, 768},
-			expect: image.Point{1120, 1120},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{800, 600},
-			expect: image.Point{1120, 1120},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{640, 480},
-			expect: image.Point{1120, 560},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{320, 200},
-			expect: image.Point{560, 560},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{1320, 200},
-			expect: image.Point{1680, 560},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{2000, 200},
-			expect: image.Point{2240, 560},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{10000, 200},
-			expect: image.Point{2240, 560},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{480, 640},
-			expect: image.Point{560, 1120},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{200, 320},
-			expect: image.Point{560, 560},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{200, 1320},
-			expect: image.Point{560, 1680},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{200, 2000},
-			expect: image.Point{560, 2240},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{200, 10000},
-			expect: image.Point{560, 2240},
-		},
-		{
-			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			image:  image.Point{10000, 10000},
-			expect: image.Point{1120, 1120},
-		},
-	}
-
-	for _, tt := range cases {
-		actual := tt.p.optimalTiledCanvas(tt.image)
-		if diff := cmp.Diff(actual, tt.expect); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	}
-}
-
-func TestSplitToTiles(t *testing.T) {
-	cases := []struct {
-		imageMax image.Point
-		numTiles image.Point
-		expect   []image.Image
-	}{
-		{
-			imageMax: image.Point{1024, 768},
-			numTiles: image.Point{1, 1},
-			expect:   []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
-		},
-		{
-			imageMax: image.Point{1000, 500},
-			numTiles: image.Point{2, 1},
-			expect: []image.Image{
-				image.NewRGBA(image.Rect(0, 0, 500, 500)),
-				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
-			},
-		},
-		{
-			imageMax: image.Point{1000, 1000},
-			numTiles: image.Point{2, 2},
-			expect: []image.Image{
-				image.NewRGBA(image.Rect(0, 0, 500, 500)),
-				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
-				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
-				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
-			},
-		},
-	}
-
-	var p ImageProcessor
-
-	for _, tt := range cases {
-		actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)
-
-		if len(actual) != len(tt.expect) {
-			t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
-		}
-
-		for i := range actual {
-			if actual[i].Bounds() != tt.expect[i].Bounds() {
-				t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
-			}
-		}
-	}
-}
-
-func TestResize(t *testing.T) {
-	cases := []struct {
-		p                 ImageProcessor
-		imageMax          image.Point
-		expectImage       image.Image
-		expectAspectRatio image.Point
-	}{
-		{
-			p:                 ImageProcessor{maxNumTiles: 1, imageSize: 100},
-			imageMax:          image.Point{200, 200},
-			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
-			expectAspectRatio: image.Point{1, 1},
-		},
-		{
-			p:                 ImageProcessor{maxNumTiles: 2, imageSize: 100},
-			imageMax:          image.Point{200, 200},
-			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
-			expectAspectRatio: image.Point{1, 1},
-		},
-		{
-			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			imageMax:          image.Point{10, 10},
-			expectImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
-			expectAspectRatio: image.Point{1, 1},
-		},
-		{
-			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			imageMax:          image.Point{2560, 1920},
-			expectImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
-			expectAspectRatio: image.Point{2, 2},
-		},
-		{
-			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			imageMax:          image.Point{1024, 768},
-			expectImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
-			expectAspectRatio: image.Point{2, 2},
-		},
-	}
-
-	for _, tt := range cases {
-		actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})
-
-		if actualImage.Bounds() != tt.expectImage.Bounds() {
-			t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
-		}
-
-		if actualAspectRatio != tt.expectAspectRatio {
-			t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
-		}
-	}
-}
-
-func TestPad(t *testing.T) {
-	cases := []struct {
-		p           ImageProcessor
-		imageMax    image.Point
-		aspectRatio image.Point
-		expect      image.Image
-	}{
-		{
-			p:           ImageProcessor{maxNumTiles: 4, imageSize: 560},
-			imageMax:    image.Point{1000, 667},
-			aspectRatio: image.Point{2, 2},
-			expect:      image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
-		},
-	}
-
-	for _, tt := range cases {
-		actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)
-
-		if actual.Bounds() != tt.expect.Bounds() {
-			t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
-		}
-	}
-}
-
-func TestPackImages(t *testing.T) {
-	cases := []struct {
-		imageMax    image.Point
-		aspectRatio image.Point
-		expectVals  int
-	}{
-		{
-			imageMax:    image.Point{1120, 1120},
-			aspectRatio: image.Point{2, 2},
-			expectVals:  2 * 2 * 3 * 560 * 560,
-		},
-		{
-			imageMax:    image.Point{560, 560},
-			aspectRatio: image.Point{1, 1},
-			expectVals:  1 * 1 * 3 * 560 * 560,
-		},
-		{
-			imageMax:    image.Point{1120, 560},
-			aspectRatio: image.Point{1, 2},
-			expectVals:  1 * 2 * 3 * 560 * 560,
-		},
-	}
-
-	for _, tt := range cases {
-		var p ImageProcessor
-		actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
-		if len(actualVals) != tt.expectVals {
-			t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
-		}
-	}
-}
-
-func TestPreprocess(t *testing.T) {
-	cases := []struct {
-		imageMax            image.Point
-		expectAspectRatioID int
-	}{
-		{
-			imageMax:            image.Point{10, 10},
-			expectAspectRatioID: 1,
-		},
-		{
-			imageMax:            image.Point{1024, 768},
-			expectAspectRatioID: 6,
-		},
-	}
-
-	p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
-	for _, tt := range cases {
-		img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
-		if err != nil {
-			t.Fatalf("error processing: %q", err)
-		}
-
-		if len(img) == 0 {
-			t.Errorf("no image data returned")
-		}
-
-		if aspectRatio.rank != tt.expectAspectRatioID {
-			t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID)
-		}
-	}
-}
--- a/model/models/models.go
+++ b/model/models/models.go
@ -7,5 +7,4 @@ import (
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
-	_ "github.com/ollama/ollama/model/models/qwen25vl"
 )
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@ -1,187 +0,0 @@
-package qwen25vl
-
-import (
-	"bytes"
-	"fmt"
-	"image"
-	"slices"
-	"sync"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/model/input"
-)
-
-type Model struct {
-	model.Base
-	model.BytePairEncoding
-
-	*TextModel
-	*VisionModel `gguf:"v,vision"`
-
-	ImageProcessor
-}
-
-// Implement MultimodalProcessor interface
-var _ model.MultimodalProcessor = (*Model)(nil)
-
-func New(c fs.Config) (model.Model, error) {
-	m := &Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
-			},
-		),
-		TextModel:      NewTextModel(c),
-		VisionModel:    newVisionModel(c),
-		ImageProcessor: newImageProcessor(c),
-	}
-
-	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
-
-	return m, nil
-}
-
-func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
-	image, _, err := image.Decode(bytes.NewReader(multimodalData))
-	if err != nil {
-		return nil, nil, err
-	}
-
-	f32s, grid, err := m.ImageProcessor.ProcessImage(image)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	// Calculate tensor dimensions
-	patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
-		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
-	numPatches := grid.Temporal * grid.Height * grid.Width
-
-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
-	if err != nil {
-		return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err)
-	}
-
-	return pixelValues, grid, nil
-}
-
-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
-	if len(m.VisionModel.Layers) == 0 {
-		return nil, model.ErrNoVisionModel
-	}
-
-	pixels, grid, err := m.PixelValues(ctx, multimodalData)
-	if err != nil {
-		return nil, err
-	}
-
-	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
-	return &chunks{Model: m, Tensor: visionOutputs}, nil
-}
-
-type chunks struct {
-	*Model
-	ml.Tensor
-
-	dataOnce sync.Once
-	data     []float32
-}
-
-type chunk struct {
-	*chunks
-	s, n int
-}
-
-func (r *chunk) floats() []float32 {
-	r.dataOnce.Do(func() {
-		temp := r.Backend().NewContext()
-		defer temp.Close()
-		temp.Forward(r.Tensor).Compute(r.Tensor)
-		r.data = r.Floats()
-	})
-
-	return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
-}
-
-// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
-func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
-	var result []input.Input
-
-	var (
-		imageToken       int32 = 151655
-		visionStartToken int32 = 151652
-		visionEndToken   int32 = 151653
-	)
-
-	nImg := 0
-	for _, inp := range inputs {
-		if inp.Multimodal == nil {
-			// If not a multimodal input, add it to the result unchanged
-			result = append(result, inp)
-		} else {
-			// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
-			// the image tokens with a prompt, so we add a prefix here
-			nImg++
-			pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
-			if err != nil {
-				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
-			}
-			for i := range pre {
-				result = append(result, input.Input{Token: pre[i]})
-			}
-
-			// This is an image token with multimodal data
-			chunksData := inp.Multimodal.(*chunks)
-			patchesPerChunk := chunksData.Dim(1)
-
-			// First add the vision start token
-			result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2})
-
-			// Add the image token with the multimodal tensor data at the first position
-			// Create a chunk with proper s and n values
-			result = append(result, input.Input{
-				Token:          imageToken,
-				Multimodal:     &chunk{chunks: chunksData, s: 0, n: patchesPerChunk},
-				MultimodalHash: inp.MultimodalHash,
-				SameBatch:      patchesPerChunk,
-			})
-
-			// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
-			result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
-
-			result = append(result, input.Input{Token: visionEndToken})
-		}
-	}
-
-	return result, nil
-}
-
-func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
-
-	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
-}
-
-func init() {
-	model.Register("qwen25vl", New)
-}
--- a/model/models/qwen25vl/model_text.go
+++ b/model/models/qwen25vl/model_text.go
@ -1,155 +0,0 @@
-package qwen25vl
-
-import (
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/model/input"
-)
-
-type TextOptions struct {
-	ctxLen, hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale                 float32
-	ropeDim, defaultContextLen               uint32
-}
-
-type TextModel struct {
-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
-	Layers         []Layer       `gguf:"blk"`
-	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
-	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
-
-	*TextOptions
-}
-
-func NewTextModel(c fs.Config) *TextModel {
-	m := TextModel{
-		Layers: make([]Layer, c.Uint("block_count")),
-		TextOptions: &TextOptions{
-			ctxLen:            int(c.Uint("context_length")),
-			hiddenSize:        int(c.Uint("embedding_length")),
-			numHeads:          int(c.Uint("attention.head_count")),
-			numKVHeads:        int(c.Uint("attention.head_count_kv")),
-			eps:               c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:          c.Float("rope.freq_base"),
-			ropeScale:         c.Float("rope.freq_scale", 1),
-			ropeDim:           c.Uint("rope.dimension_count", 128),
-			defaultContextLen: c.Uint("context_length", 128000),
-		},
-	}
-
-	return &m
-}
-
-// SelfAttention implements the multi-head self-attention mechanism
-// with separate projections for query, key, value and output transformations
-type SelfAttention struct {
-	Query  *nn.Linear `gguf:"attn_q"`
-	Key    *nn.Linear `gguf:"attn_k"`
-	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_output"`
-}
-
-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
-	batchSize := hiddenState.Dim(1)
-	headDim := opts.hiddenSize / opts.numHeads
-
-	q := sa.Query.Forward(ctx, hiddenState)
-	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
-
-	k := sa.Key.Forward(ctx, hiddenState)
-	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
-
-	v := sa.Value.Forward(ctx, hiddenState)
-	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-
-	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
-	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
-	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
-
-	return sa.Output.Forward(ctx, kqv)
-}
-
-// Shift applies rotary position embeddings to the key tensor for causal attention caching
-func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil
-}
-
-// MLP implements the feed-forward network component with SwiGLU activation
-type MLP struct {
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-	Gate *nn.Linear `gguf:"ffn_gate"`
-}
-
-func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
-	// Apply SwiGLU activation gating
-	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
-	// Project back to hidden dimension
-	return mlp.Down.Forward(ctx, hiddenState)
-}
-
-// Layer represents a single transformer layer combining self-attention and feed-forward components
-type Layer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	SelfAttention *SelfAttention
-	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP           *MLP
-}
-
-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
-	// Self-attention branch with residual connection
-	residual := hiddenState
-
-	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
-
-	// In the final layer (outputs != nil), optimize by pruning to just the token positions
-	// we need logits for.
-	if outputs != nil {
-		hiddenState = hiddenState.Rows(ctx, outputs)
-		residual = residual.Rows(ctx, outputs)
-	}
-
-	hiddenState = hiddenState.Add(ctx, residual)
-	// Feed-forward branch with residual connection
-	residual = hiddenState
-	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
-	return hiddenState.Add(ctx, residual)
-}
-
-func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) {
-	// Initial token embedding
-	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
-
-	for _, mi := range batch.Multimodal {
-		f32s := mi.Multimodal.(*chunk).floats()
-		img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
-		if err != nil {
-			panic(err)
-		}
-
-		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
-	}
-
-	// Process through transformer layers
-	for i, layer := range m.Layers {
-		cache.SetLayer(i)
-
-		var lastLayerOutputs ml.Tensor
-		if i == len(m.Layers)-1 {
-			lastLayerOutputs = outputs
-		}
-
-		hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, m.TextOptions)
-	}
-
-	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
-	return m.Output.Forward(ctx, hiddenStates), nil
-}
--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@ -1,391 +0,0 @@
-package qwen25vl
-
-import (
-	"fmt"
-	"math"
-	"slices"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-)
-
-// We only support batch size of 1
-var batchSize int = 1
-
-func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
-	x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
-	x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
-	return x2.Neg(ctx).Concat(ctx, x1, 0)
-}
-
-func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
-	return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin))
-}
-
-func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int) ml.Tensor {
-	// Create a flat slice for the mask (all -inf initially to block all attention)
-	flat := make([]float32, seqLength*seqLength)
-	for i := range flat {
-		flat[i] = float32(math.Inf(-1)) // Negative infinity to block attention
-	}
-
-	// Fill in the mask with zeros for tokens that CAN attend to each other
-	for i := 1; i < len(bounds); i++ {
-		start := bounds[i-1]
-		end := bounds[i]
-
-		// Enable attention within this sequence block by setting values to 0
-		for row := start; row < end; row++ {
-			for col := start; col < end; col++ {
-				idx := row*seqLength + col
-				flat[idx] = 0.0 // 0 allows attention, -inf blocks it
-			}
-		}
-	}
-
-	mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
-	if err != nil {
-		panic(err)
-	}
-	// Reshape to match [seqLength, seqLength, 1] for broadcasting
-	mask = mask.Reshape(ctx, seqLength, seqLength, 1)
-
-	return mask
-}
-
-type VisionSelfAttention struct {
-	Query  *nn.Linear `gguf:"attn_q"`
-	Key    *nn.Linear `gguf:"attn_k"`
-	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_out"`
-}
-
-func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	query := sa.Query.Forward(ctx, hiddenStates)
-	key := sa.Key.Forward(ctx, hiddenStates)
-	value := sa.Value.Forward(ctx, hiddenStates)
-
-	query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1), batchSize)
-	key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1), batchSize)
-	value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1), batchSize)
-
-	query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
-	key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
-
-	// Scale factor for scaled dot-product attention
-	scale := 1.0 / math.Sqrt(float64(opts.headDim))
-
-	// Scaled dot-product attention
-	query = query.Permute(ctx, 0, 2, 1, 3)
-	key = key.Permute(ctx, 0, 2, 1, 3)
-	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
-	kq := key.MulmatFullPrec(ctx, query)
-	kq = kq.Scale(ctx, scale)
-	if mask != nil {
-		kq = kq.Add(ctx, mask)
-	}
-	kq = kq.Softmax(ctx)
-	kqv := value.Mulmat(ctx, kq)
-	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
-
-	return sa.Output.Forward(ctx, attention)
-}
-
-// VisionMLP implements the multi-layer perceptron
-type VisionMLP struct {
-	Gate *nn.Linear `gguf:"ffn_gate"`
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-}
-
-func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	// Using activation as specified in config (likely GELU or SiLU/Swish)
-	gateOutput := mlp.Gate.Forward(ctx, hiddenStates)
-	upOutput := mlp.Up.Forward(ctx, hiddenStates)
-	hiddenStates = gateOutput.SILU(ctx).Mul(ctx, upOutput)
-
-	return mlp.Down.Forward(ctx, hiddenStates)
-}
-
-type VisionEncoderLayer struct {
-	Norm1         *nn.RMSNorm `gguf:"ln1"`
-	SelfAttention *VisionSelfAttention
-	Norm2         *nn.RMSNorm `gguf:"ln2"`
-	MLP           *VisionMLP
-}
-
-func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	residual := hiddenStates
-	hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, cos, sin, mask, opts)
-	hiddenStates = hiddenStates.Add(ctx, residual)
-
-	residual = hiddenStates
-	hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
-	return hiddenStates.Add(ctx, residual)
-}
-
-// VisionModelOptions contains configuration options
-type VisionModelOptions struct {
-	hiddenSize        int
-	numHeads          int
-	headDim           int
-	patchSize         int
-	numChannels       int
-	eps               float32
-	ropeTheta         float32
-	spatialMergeSize  int
-	windowSize        int
-	fullAttnBlocks    []int32
-	temporalPatchSize int
-}
-
-type PatchEmbedding struct {
-	PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
-	PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
-}
-
-func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	numPatches := pixelValues.Shape()[1]
-
-	// Reshape the input tensor to match the expected dimensions
-	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
-
-	// Permute the tensor to bring the temporal dimension to the front
-	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
-
-	// Split the tensor into parts for the temporal convolutions
-	in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
-	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
-	in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
-	in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
-
-	s0, s1 := opts.patchSize, opts.patchSize // Use full stride
-	p0, p1 := 0, 0                           // padding
-	d0, d1 := 1, 1                           // dilation
-	out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
-	out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
-
-	// Add the outputs from the two temporal convolutions
-	out := out0.Add(ctx, out1)
-
-	// Reshape the output tensor to match the expected dimensions
-	return out.Reshape(ctx, opts.hiddenSize, numPatches)
-}
-
-// VisionPatchMerger implements patch merging for the Qwen vision model
-type VisionPatchMerger struct {
-	LNQ  *nn.RMSNorm `gguf:"ln_q"`
-	MLP0 *nn.Linear  `gguf:"mlp.0"`
-	MLP2 *nn.Linear  `gguf:"mlp.2"`
-}
-
-// Forward computes patch merging for the vision model
-func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	normalized := pm.LNQ.Forward(ctx, visionOutputs, opts.eps)
-
-	hiddenSize := visionOutputs.Dim(0) * (opts.spatialMergeSize * opts.spatialMergeSize)
-
-	// Reshape the normalized output to view the hidden size dimension
-	reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(opts.spatialMergeSize*opts.spatialMergeSize), batchSize)
-	hidden := pm.MLP0.Forward(ctx, reshaped)
-	activated := hidden.GELU(ctx)
-
-	output := pm.MLP2.Forward(ctx, activated)
-
-	return output
-}
-
-// VisionModel implements the Qwen vision model
-type VisionModel struct {
-	PatchEmbedding *PatchEmbedding
-	Layers         []VisionEncoderLayer `gguf:"blk"`
-	PatchMerger    *VisionPatchMerger   `gguf:"merger"`
-
-	*VisionModelOptions
-}
-
-// Forward computes the vision model for an input tensor
-func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
-	// Extract patch embeddings
-	hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions)
-
-	positionEmbedding := m.PositionalEmbedding(ctx, grid)
-
-	windowIndex, bounds := m.WindowIndex(ctx, grid)
-
-	spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize
-
-	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit)
-	hiddenStates = hiddenStates.Rows(ctx, windowIndex)
-	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit)
-
-	positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)*spatialMergeUnit, positionEmbedding.Dim(1)/spatialMergeUnit)
-	positionEmbedding = positionEmbedding.Rows(ctx, windowIndex)
-	positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)/spatialMergeUnit, positionEmbedding.Dim(1)*spatialMergeUnit)
-	positionEmbedding = positionEmbedding.Concat(ctx, positionEmbedding, 0)
-
-	cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
-	cos = cos.Reshape(ctx, cos.Dim(0), 1, cos.Dim(1))
-	sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1))
-
-	mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads)
-	// Apply encoder layers
-	for i, layer := range m.Layers {
-		if slices.Contains(m.fullAttnBlocks, int32(i)) {
-			hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions)
-		} else {
-			hiddenStates = layer.Forward(
-				ctx,
-				hiddenStates,
-				cos,
-				sin,
-				mask,
-				m.VisionModelOptions,
-			)
-		}
-	}
-
-	hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions)
-	reverseWindowIndex := windowIndex.Argsort(ctx)
-	return hiddenStates.Rows(ctx, reverseWindowIndex)
-}
-
-// WindowIndex divides the grid into windows and returns:
-//  1. A tensor containing flattened indices of all grid points organized by windows
-//  2. A slice of boundaries that mark where each window's data begins and ends
-//     in the flattened representation, scaled by spatialMergeSize squared
-//
-// The boundaries slice always starts with 0 and contains cumulative ending
-// positions for each window, allowing downstream processing to identify
-// window boundaries in the tensor data.
-func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int) {
-	vitMergerWindowSize := m.windowSize / m.spatialMergeSize / m.patchSize
-
-	llmGridH := grid.Height / m.spatialMergeSize
-	llmGridW := grid.Width / m.spatialMergeSize
-
-	// Calculate window parameters
-	numWindowsH := int(math.Ceil(float64(llmGridH) / float64(vitMergerWindowSize)))
-	numWindowsW := int(math.Ceil(float64(llmGridW) / float64(vitMergerWindowSize)))
-
-	// Initialize index_new slice
-	var index []int32
-
-	// Initialize bounds with the first element as 0
-	bounds := []int{0}
-	totalSeqLen := 0
-
-	// Process each window without padding
-	for wh := range numWindowsH {
-		for ww := range numWindowsW {
-			// Calculate window boundaries
-			hStart := wh * vitMergerWindowSize
-			wStart := ww * vitMergerWindowSize
-			hEnd := min(hStart+vitMergerWindowSize, llmGridH)
-			wEnd := min(wStart+vitMergerWindowSize, llmGridW)
-
-			// Calculate sequence length for this window
-			seqLen := (hEnd - hStart) * (wEnd - wStart)
-
-			// Collect indices for this window
-			for h := hStart; h < hEnd; h++ {
-				for w := wStart; w < wEnd; w++ {
-					index = append(index, int32(h*llmGridW+w))
-				}
-			}
-
-			totalSeqLen += seqLen
-			bounds = append(bounds, totalSeqLen*(m.spatialMergeSize*m.spatialMergeSize)+bounds[0])
-		}
-	}
-
-	t, err := ctx.Input().FromIntSlice(index, len(index))
-	if err != nil {
-		panic(err)
-	}
-
-	return t, bounds
-}
-
-// PositionalEmbedding generates rotary position embeddings for attention mechanisms
-func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor {
-	dim := m.headDim / 2
-	freq := dim / 2
-	theta := float64(m.ropeTheta)
-	merge := m.spatialMergeSize
-
-	// Create frequency patterns for position encoding
-	maxGridSize := max(grid.Height, grid.Width)
-	freqVals := make([]float32, freq*maxGridSize)
-	for i := range maxGridSize {
-		for j := range freq {
-			freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
-		}
-	}
-	freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
-	if err != nil {
-		panic(fmt.Errorf("failed to create tensor from frequencies: %w", err))
-	}
-
-	// Create position coordinates (y,x pairs) for the grid
-	// In PyTorch: Equivalent to generating position ids with torch.arange()
-	coords := make([]int32, 0, grid.Height*grid.Width*2)
-	for y := range grid.Height {
-		for x := range grid.Width {
-			coords = append(coords, int32(y), int32(x))
-		}
-	}
-	pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
-	if err != nil {
-		panic(fmt.Errorf("failed to create tensor from positions: %w", err))
-	}
-
-	// Reshape and permute positions to match spatial merging pattern
-	pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
-	pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	pos = pos.Reshape(ctx, 2, merge, merge, grid.Width/merge*grid.Height/merge)
-	pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	pos = pos.Reshape(ctx, 2*merge*merge*grid.Width/merge*grid.Height/merge)
-
-	// Use position indices to look up corresponding frequency values
-	positionalEmbedding := freqs.Rows(ctx, pos)
-	positionalEmbedding = positionalEmbedding.Reshape(ctx, positionalEmbedding.Dim(0)*2, positionalEmbedding.Dim(1)/2)
-	return positionalEmbedding
-}
-
-// newVisionModel creates a new instance of the Qwen vision model
-func newVisionModel(c fs.Config) *VisionModel {
-	patchSize := int(c.Uint("vision.patch_size", 14))
-	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
-	numHeads := int(c.Uint("vision.attention.head_count", 16))
-	numChannels := int(c.Uint("vision.num_channels", 3))
-	eps := c.Float("vision.attention.layer_norm_epsilon", 1e-6)
-	ropeTheta := c.Float("vision.rope.freq_base", 10000.0)
-	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
-	windowSize := int(c.Uint("vision.window_size", 112))
-	fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31})
-	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
-
-	model := &VisionModel{
-		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
-		VisionModelOptions: &VisionModelOptions{
-			hiddenSize:        hiddenSize,
-			numHeads:          numHeads,
-			headDim:           hiddenSize / numHeads,
-			patchSize:         patchSize,
-			numChannels:       numChannels,
-			eps:               eps,
-			ropeTheta:         ropeTheta,
-			spatialMergeSize:  spatialMergeSize,
-			windowSize:        windowSize,
-			temporalPatchSize: temporalPatchSize,
-			fullAttnBlocks:    fullAttnBlocks,
-		},
-	}
-
-	return model
-}
--- a/model/models/qwen25vl/process_image.go
+++ b/model/models/qwen25vl/process_image.go
@ -1,184 +0,0 @@
-package qwen25vl
-
-import (
-	"fmt"
-	"image"
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/model/imageproc"
-)
-
-// ImageProcessor contains configuration for the Qwen 2.5 VL image processing
-type ImageProcessor struct {
-	numChannels       int
-	patchSize         int
-	temporalPatchSize int
-	mergeSize         int
-	minPixels         int
-	maxPixels         int
-	factor            int
-	rescaleFactor     float32
-	imageMean         []float32
-	imageStd          []float32
-}
-
-// newImageProcessor creates a new image processor with default values
-func newImageProcessor(c fs.Config) ImageProcessor {
-	patchSize := int(c.Uint("vision.patch_size", 14))
-	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
-
-	return ImageProcessor{
-		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
-		patchSize:         patchSize,
-		temporalPatchSize: 2,
-		mergeSize:         mergeSize,
-		minPixels:         56 * 56,
-		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
-		factor:            patchSize * mergeSize,
-		rescaleFactor:     1.0 / 255.0,
-		imageMean:         imageproc.ClipDefaultMean[:],
-		imageStd:          imageproc.ClipDefaultSTD[:],
-	}
-}
-
-// SmartResize implements the smart resize algorithm
-func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
-	factor := p.factor
-
-	if height < factor || width < factor {
-		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
-	} else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 {
-		panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
-	}
-
-	round := func(x float64) int { return int(math.RoundToEven(x)) }
-
-	hBar := round(float64(height)/float64(factor)) * factor
-	wBar := round(float64(width)/float64(factor)) * factor
-
-	if hBar*wBar > p.maxPixels {
-		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
-
-		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
-		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
-	} else if hBar*wBar < p.minPixels {
-		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
-
-		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
-		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
-	}
-
-	return hBar, wBar
-}
-
-type Grid struct {
-	Height   int
-	Width    int
-	Temporal int
-}
-
-func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
-	origWidth := img.Bounds().Dx()
-	origHeight := img.Bounds().Dy()
-
-	// Calculate smart resize dimensions
-	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
-
-	// Resize image using existing functions
-	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
-
-	normalizedPixels := imageproc.Normalize(
-		resizedImg,
-		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
-		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
-		true, // rescale
-		true, // channelFirst
-	)
-
-	// Calculate grid dimensions
-	grid := &Grid{
-		Height:   resizedHeight / p.patchSize,
-		Width:    resizedWidth / p.patchSize,
-		Temporal: 1, // For single images, temporal dimension is 1
-	}
-
-	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
-	if err != nil {
-		return nil, nil, fmt.Errorf("failed to create patches: %v", err)
-	}
-
-	// Return patches and grid dimensions
-	return patches, grid, nil
-}
-
-func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
-	channels := p.numChannels
-	patchSize := p.patchSize
-	mergeSize := p.mergeSize
-	temporalPatchSize := p.temporalPatchSize
-
-	// Calculate output dimensions
-	numPatches := grid.Temporal * grid.Height * grid.Width
-	patchDim := channels * temporalPatchSize * patchSize * patchSize
-
-	result := make([]float32, numPatches*patchDim)
-	patchIndex := 0
-
-	// Single temporal frame handling (copies to all frames)
-	for range grid.Temporal {
-		for h := 0; h < grid.Height; h += mergeSize {
-			for w := 0; w < grid.Width; w += mergeSize {
-				// Handle the 2x2 merged patches
-				for mh := range mergeSize {
-					for mw := range mergeSize {
-						baseOffset := patchIndex * patchDim
-
-						// Extract patch data for first temporal frame
-						for c := range channels {
-							channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
-
-							for py := range patchSize {
-								for px := range patchSize {
-									// Calculate source pixel coordinates
-									y := (h+mh)*patchSize + py
-									x := (w+mw)*patchSize + px
-
-									// Source index in input tensor (CHW format)
-									srcIdx := c*height*width + y*width + x
-
-									// Destination index in first temporal frame
-									dstIdx := channelOffset + (py * patchSize) + px
-
-									if srcIdx < len(pixels) && dstIdx < len(result) {
-										result[dstIdx] = pixels[srcIdx]
-									}
-								}
-							}
-						}
-
-						// Copy first temporal frame to all other frames
-						if temporalPatchSize > 1 {
-							for c := range channels {
-								channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
-								firstFrameOffset := channelOffset
-								frameSize := patchSize * patchSize
-
-								// Copy first frame to all other frames
-								for tp := 1; tp < temporalPatchSize; tp++ {
-									currentFrameOffset := channelOffset + (tp * frameSize)
-									copy(result[currentFrameOffset:currentFrameOffset+frameSize],
-										result[firstFrameOffset:firstFrameOffset+frameSize])
-								}
-							}
-						}
-
-						patchIndex++
-					}
-				}
-			}
-		}
-	}
-
-	return result, nil
-}
--- a/runner/llamarunner/cache.go
+++ b/runner/llamarunner/cache.go
@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", len(prompt)-numPast)

+	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
-	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
--- a/runner/llamarunner/image.go
+++ b/runner/llamarunner/image.go
@ -5,6 +5,7 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
+	"slices"
 	"sync"
 	"time"

@ -17,7 +18,8 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex

-	clip *llama.ClipContext
+	clip   *llama.ClipContext
+	mllama *llama.MllamaContext

 	// cache of images to embeddings
 	images    []imageCache
@ -33,6 +35,8 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
 		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
+	} else if arch == "mllama" {
+		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@ -54,9 +58,12 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
+	if c.mllama != nil {
+		c.mllama.Free()
+	}
 }

-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
 	if c == nil {
 		return nil, nil
 	}
@ -72,7 +79,12 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]f

 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.clip != nil {
+		if c.mllama != nil {
+			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+			if err != nil {
+				return nil, err
+			}
+		} else if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
@ -93,11 +105,33 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
 		return 0
 	}

+	// Mllama maps an image to 1 embedding token (llava creates many tokens)
+	// and doesn't support more than a single image per request.
+	// The embeddings are large (100 MB), so allocating a big batch can fail
+	// on some systems
+	if c.mllama != nil {
+		return 1
+	}
+
 	return configuredBatchSize
 }

 func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
-	return llamaContext.Model().NEmbd()
+	if c != nil && c.mllama != nil { // a nil receiver is tolerated and falls through to the model default
+		return c.mllama.EmbedSize(llamaContext) // the mllama context supplies its own embedding size
+	} else {
+		return llamaContext.Model().NEmbd() // otherwise use the model's NEmbd()
+	}
+}
+
+func (c *ImageContext) NeedCrossAttention(inputs ...input) bool { // true only with an mllama context and at least one input carrying an embedding
+	if c == nil || c.mllama == nil { // no image context, or not mllama: never needed
+		return false
+	}
+
+	return slices.ContainsFunc(inputs, func(input input) bool { // NOTE(review): the closure parameter shadows the input type name
+		return input.embed != nil
+	})
 }

 type imageCache struct {
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@ -57,6 +57,10 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot

+	// does this sequence require cross-attention layers to be processed? - if we have seen
+	// an image for certain multi-modal models
+	crossAttention bool
+
 	// channel to send responses over
 	responses chan string

@ -201,7 +205,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
 				return nil, fmt.Errorf("invalid image index: %d", n)
 			}

-			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
+			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
 			if err != nil {
 				return nil, err
 			}
@ -364,6 +368,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()

 	var batch *llama.Batch
+	crossAttention := false

 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@ -411,8 +416,9 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 					batch = tokenBatch
 				} else {
 					batch = embedBatch
+					seq.crossAttention = s.image.NeedCrossAttention(input)
 				}
-			} else if embedding != batch.IsEmbedding() {
+			} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
 				s.nextSeq = seqIdx
 				break
 			}
@ -421,6 +427,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				break
 			}

+			crossAttention = seq.crossAttention
 			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
 			seq.pendingInputs = append(seq.pendingInputs, input)
 			seq.iBatch = batch.NumTokens() - 1
@ -433,11 +440,20 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}

+	s.lc.SetCrossAttention(crossAttention)
+
 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}

+	if crossAttention {
+		// synchronize state to ensure the cross attention batch is complete.
+		// needed specifically for multi-GPU systems otherwise an inflight
+		// task may be incorrectly invalidated causing a crash
+		s.lc.Synchronize()
+	}
+
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@ -606,6 +622,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 				return
 			}

+			seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
+
 			s.seqs[i] = seq
 			s.cond.Signal()
 			found = true
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", int32(len(prompt))-numPast)

+	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
-	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@ -3,7 +3,6 @@ package ollamarunner
 import (
 	"errors"
 	"fmt"
-	"image"
 	"testing"
 	"time"

@ -12,10 +11,6 @@ import (
 )

 func TestCountCommon(t *testing.T) {
-	imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
-	imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
-	imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
-
 	tests := []struct {
 		name     string
 		t1       []input.Input
@ -36,20 +31,20 @@ func TestCountCommon(t *testing.T) {
 		},
 		{
 			name:     "Image Prefix",
-			t1:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
+			t1:       []input.Input{{MultimodalHash: 1}},
+			t2:       []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}},
 			expected: 1,
 		},
 		{
 			name:     "Mixed",
-			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
+			t1:       []input.Input{{Token: 1}, {MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}},
 			expected: 2,
 		},
 		{
 			name:     "Mixed, Same Length",
-			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
+			t1:       []input.Input{{Token: 1}, {MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {MultimodalHash: 2}},
 			expected: 1,
 		},
 		{
--- a/runner/ollamarunner/multimodal.go
+++ b/runner/ollamarunner/multimodal.go
@ -0,0 +1,116 @@
+package ollamarunner
+
+import (
+	"errors"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/input"
+)
+
+// Tensors can't be used across multiple compute graphs. This is a problem
+// if a single embedding is split across batches using views since all of
+// the views will have the same source tensor. We also don't want to
+// recompute the entire embedding for each batch.
+//
+// To avoid this, we compute all of the tensors for the embedding on the
+// first use and then store the result in system memory. When we need
+// additional tensors, we recreate them from the stored data.
+
+// multimodalEntry represents the embeddings of a single object (such
+// as an image). One entry is shared by every tensor of that object.
+type multimodalEntry struct {
+	// mm is the original set of tensors created by EncodeMultimodal
+	mm []input.Multimodal
+
+	// data is the computed result of mm, indexed in parallel with mm. Nil if not yet computed
+	data [][]float32 // data[i] stays nil where mm[i] has no tensor, and for every i when only a reservation was made
+}
+
+// multimodalStore maps from an individual tensor (of which there
+// may be many in a single multimodal object) to its parent embedding.
+type multimodalStore map[ml.Tensor]*multimodalEntry // several keys may point at the same entry
+
+func newMultimodalStore() multimodalStore { // returns an empty store that is ready for addMultimodal
+	return make(multimodalStore)
+}
+
+// addMultimodal stores an embedding for later use in a compute graph, keyed by each of its non-nil tensors.
+func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
+	entry := &multimodalEntry{mm: embedding} // keeps a reference to the caller's slice, not a copy
+
+	for _, e := range embedding {
+		if e.Tensor != nil { // elements without a tensor get no key
+			m[e.Tensor] = entry // every tensor of this embedding maps to the same entry
+		}
+	}
+}
+
+// getMultimodal takes a source set of tensors (which may contain a whole or
+// parts of one or more images) and returns the equivalent that can be used in
+// the current context. reserve is passed through to getTensor unchanged.
+func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
+	out := make([]input.Multimodal, len(in)) // same length and order as in
+	for i := range out {
+		if in[i].Tensor != nil { // elements without a tensor keep a nil Tensor in the result
+			var err error
+			out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
+			if err != nil {
+				return nil, err // the first failure aborts; no partial result is returned
+			}
+		}
+
+		out[i].Data = in[i].Data // Data is carried over as-is; only tensors are rebuilt
+	}
+
+	return out, nil
+}
+
+// getTensor returns a tensor usable in ctx that is equivalent to in. The
+// tensor in must be one previously registered through addMultimodal.
+//
+// The first time an entry is used, every tensor in that entry is evaluated
+// in a temporary context created from backend and the float results are
+// cached in entry.data; later calls rebuild tensors from the cache instead
+// of recomputing them.
+//
+// If reserve is true, nothing is computed: the temporary graph is only
+// reserved and an empty tensor with the same dtype and shape as in is
+// returned.
+//
+// An error is returned when in is not known to the store. A nil tensor with
+// a nil error is returned when the entry contains no tensors.
+func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
+	entry := m[in]
+	if entry == nil {
+		// A lookup miss yields a nil entry; report it like a tensor that is
+		// missing from its entry rather than dereferencing nil.
+		return nil, errors.New("multimodal tensor not found")
+	}
+
+	if entry.data == nil {
+		computeCtx := backend.NewContext()
+		defer computeCtx.Close()
+
+		var tensors []ml.Tensor
+		for _, t := range entry.mm {
+			if t.Tensor != nil {
+				tensors = append(tensors, t.Tensor)
+			}
+		}
+
+		if len(tensors) == 0 {
+			return nil, nil
+		}
+
+		computeCtx.Forward(tensors...)
+
+		// Build the cache locally and publish it only on success so that a
+		// failed reservation does not leave the entry marked as populated.
+		data := make([][]float32, len(entry.mm))
+
+		if reserve {
+			if err := computeCtx.Reserve(); err != nil {
+				return nil, err
+			}
+		} else {
+			computeCtx.Compute(tensors...)
+
+			for i, t := range entry.mm {
+				if t.Tensor != nil {
+					data[i] = t.Tensor.Floats()
+				}
+			}
+		}
+
+		entry.data = data
+	}
+
+	for i, t := range entry.mm {
+		if in != t.Tensor {
+			continue
+		}
+
+		if reserve {
+			return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
+		}
+
+		return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...)
+	}
+
+	return nil, errors.New("multimodal tensor not found")
+}
--- a/Show More
+++ b/Show More