Compare commits

57 commits (main...jmorganca/)

Commits (SHA1):
5c76074f66, 18d52686de, 2d2eb5903d, 533f4c41bd, 31b2c06393, 4ae23deb50, 5d3da85a16,
8b64b456c1, 684f0d9291, 3308bff137, bf1929a3bc, 1a2c413225, 57279f89a2, 9ceee25d8b,
661bf04696, 2521a55ae6, 32948ec952, 9876c8453a, 919b3d6e21, 16b13e0cfc, 75441c56f3,
45f96e898d, 7c555d394c, 39ee6d2bd0, 47705b5168, 698a92aa4a, 150c499cae, f1257a7de4,
b68af0370f, ca981c8a49, b3da8a319e, 359e1d5b19, bde6b46ce9, ff1f74534b, 104f802df1,
eed0ac2948, fcfad744ff, fb3c16f2a2, ee869f35e4, ff5d1a3dc0, 88b231f903, 7e920c8d75,
dd8c619fba, 2af76d0e7a, 8d901825f0, 04936b719f, 0f0136d419, 80498f76de, f8b48aa784,
5ff0d538b0, eedc969c35, 963531215e, 3fe090f447, 1704072746, c1f9bcb4dd, 198b1e6db9,
51ad65f831
.github/workflows/release.yaml (vendored) — 6 changes

@@ -103,11 +103,6 @@ jobs:
   arch: [amd64]
   preset: ['CPU']
   include:
-  - os: windows
-    arch: amd64
-    preset: 'CUDA 11'
-    install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
-    cuda-version: '11.3'
   - os: windows
     arch: amd64
     preset: 'CUDA 12'

@@ -324,7 +319,6 @@ jobs:
   case "$COMPONENT" in
     bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
     lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-    lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
     lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
     lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
     lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
.github/workflows/test.yaml (vendored) — 6 changes

@@ -46,7 +46,7 @@ jobs:
   include:
   - preset: CPU
   - preset: CUDA
-    container: nvidia/cuda:11.8.0-devel-ubuntu22.04
+    container: nvidia/cuda:12.8.1-devel-ubuntu22.04
     flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
   - preset: ROCm
     container: rocm/dev-ubuntu-22.04:6.1.2

@@ -78,7 +78,7 @@ jobs:
   include:
   - preset: CPU
   - preset: CUDA
-    install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+    install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
     flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
   - preset: ROCm
     install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe

@@ -102,7 +102,7 @@ jobs:
   $ErrorActionPreference = "Stop"
   if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
     Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-    Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
+    Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
   }

   $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
CMake presets (file path not shown)

@@ -17,14 +17,6 @@
     "name": "CUDA",
     "inherits": [ "Default" ]
   },
-  {
-    "name": "CUDA 11",
-    "inherits": [ "CUDA" ],
-    "cacheVariables": {
-      "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
-      "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
-    }
-  },
   {
     "name": "CUDA 12",
     "inherits": [ "CUDA" ],

@@ -78,11 +70,6 @@
     "configurePreset": "CUDA",
     "targets": [ "ggml-cuda" ]
   },
-  {
-    "name": "CUDA 11",
-    "inherits": [ "CUDA" ],
-    "configurePreset": "CUDA 11"
-  },
   {
     "name": "CUDA 12",
     "inherits": [ "CUDA" ],
Dockerfile — 17 changes

@@ -7,14 +7,10 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2

-# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
-    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
-    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
-    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
+    && dnf install -y ccache \
     && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache

@@ -38,15 +34,6 @@ RUN --mount=type=cache,target=/root/.ccache \
     && cmake --build --parallel --preset 'CPU' \
     && cmake --install build --component CPU --strip --parallel 8

-FROM base AS cuda-11
-ARG CUDA11VERSION=11.3
-RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
-ENV PATH=/usr/local/cuda-11/bin:$PATH
-RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 11' \
-    && cmake --build --parallel --preset 'CUDA 11' \
-    && cmake --install build --component CUDA --strip --parallel 8
-
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}

@@ -98,11 +85,9 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
     go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12

 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
Sync Makefile (file path not shown)

@@ -15,13 +15,11 @@ help:
     @echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"

 .PHONY: sync
-sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml

-llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
-    sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
-ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
-    go generate ./$(@D)
+.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in
+    sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@

 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/

@@ -32,13 +30,12 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
     rsync -arvzc -f "merge $@/.rsync-filter" $< $@

 PATCHES=$(wildcard llama/patches/*.patch)
-PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))

 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(PATCHED)
+apply-patches: $(addsuffix ed, $(PATCHES))

-llama/patches/.%.patched: llama/patches/%.patch
+%.patched: %.patch
     @if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi

 .PHONY: checkout

@@ -60,4 +57,4 @@ format-patches: llama/patches

 .PHONE: clean
 clean: checkout
-    $(RM) llama/patches/.*.patched
+    $(RM) $(addsuffix ed, $(PATCHES))
Interactive CLI image handling (Go; file path not shown)

@@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
     fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")

     if opts.MultiModal {
-        fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
+        fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
     }

     fmt.Fprintln(os.Stderr, "")

@@ -511,7 +511,7 @@ func extractFileNames(input string) []string {
     // Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
     // and followed by more characters and a file extension
     // This will capture non filename strings, but we'll check for file existence to remove mismatches
-    regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
+    regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
     re := regexp.MustCompile(regexPattern)

     return re.FindAllString(input, -1)

@@ -553,7 +553,7 @@ func getImageData(filePath string) ([]byte, error) {
     }

     contentType := http.DetectContentType(buf)
-    allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
+    allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
     if !slices.Contains(allowedTypes, contentType) {
         return nil, fmt.Errorf("invalid image type: %s", contentType)
     }
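The narrowed pattern above drops `webp` from both the path regex and the allowed MIME types. Below is a minimal, standalone sketch of how the new regex behaves; the `main` wrapper and the sample input are illustrative, not part of the diff.

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Pattern from the new version of extractFileNames: webp/WEBP no longer matches.
	pattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
	re := regexp.MustCompile(pattern)

	input := `look at ./photos/cat.png and /tmp/dog.webp please`
	fmt.Println(re.FindAllString(input, -1)) // [./photos/cat.png] — the .webp path is ignored
}
```

Paths ending in `.webp` simply stop matching, which is why the accompanying test expectations shrink from 7 and 13 matches to 5 and 10.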
Tests for extractFileNames (file path not shown)

@@ -12,17 +12,14 @@ func TestExtractFilenames(t *testing.T) {
     // Unix style paths
     input := ` some preamble
 ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
-/unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
     res := extractFileNames(input)
-    assert.Len(t, res, 7)
+    assert.Len(t, res, 5)
     assert.Contains(t, res[0], "one.png")
     assert.Contains(t, res[1], "two.jpg")
     assert.Contains(t, res[2], "three.jpeg")
     assert.Contains(t, res[3], "four.png")
     assert.Contains(t, res[4], "five.JPG")
-    assert.Contains(t, res[5], "six.webp")
-    assert.Contains(t, res[6], "seven.WEBP")
     assert.NotContains(t, res[4], '"')
     assert.NotContains(t, res, "inbetween1")
     assert.NotContains(t, res, "./1.svg")

@@ -33,12 +30,10 @@ func TestExtractFilenames(t *testing.T) {
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
 d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
-d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
-c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
-d:\path with\spaces\thirteen.WEBP some ending
+d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
 `
     res = extractFileNames(input)
-    assert.Len(t, res, 13)
+    assert.Len(t, res, 10)
     assert.NotContains(t, res, "inbetween2")
     assert.Contains(t, res[0], "one.png")
     assert.Contains(t, res[0], "c:")

@@ -56,12 +51,6 @@ d:\path with\spaces\thirteen.WEBP some ending
     assert.Contains(t, res[8], "d:")
     assert.Contains(t, res[9], "ten.PNG")
     assert.Contains(t, res[9], "E:")
-    assert.Contains(t, res[10], "eleven.webp")
-    assert.Contains(t, res[10], "c:")
-    assert.Contains(t, res[11], "twelve.WebP")
-    assert.Contains(t, res[11], "c:")
-    assert.Contains(t, res[12], "thirteen.WEBP")
-    assert.Contains(t, res[12], "d:")
 }

 // Ensure that file paths wrapped in single quotes are removed with the quotes.
Model converter (package convert; file path not shown)

@@ -1,7 +1,6 @@
 package convert

 import (
-    "cmp"
     "encoding/json"
     "errors"
     "fmt"

@@ -15,12 +14,13 @@ import (
 )

 type ModelParameters struct {
     Architectures []string `json:"architectures"`
     VocabSize     uint32   `json:"vocab_size"`
+    TextModel     TextParameters `json:"text_config"`
+}

-    TextModel struct {
+type TextParameters struct {
     VocabSize uint32 `json:"vocab_size"`
-    } `json:"text_config"`
 }

 type AdapterParameters struct {

@@ -173,8 +173,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
     switch p.Architectures[0] {
     case "LlamaForCausalLM":
         conv = &llamaModel{}
-    case "MllamaForConditionalGeneration":
-        conv = &mllamaModel{}
     case "Llama4ForConditionalGeneration":
         conv = &llama4Model{}
     case "Mistral3ForConditionalGeneration":

@@ -216,22 +214,24 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
         return err
     }

-    vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
+    vocabSize := int(p.VocabSize)
+    if vocabSize == 0 {
+        tVocabSize := int(p.TextModel.VocabSize)
+        vocabSize = tVocabSize
+    }

     switch {
     case vocabSize == 0:
-        slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+        slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
     case vocabSize > len(t.Vocabulary.Tokens):
-        slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+        slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
         for i := range vocabSize - len(t.Vocabulary.Tokens) {
             t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
             t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
             t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
         }
     case vocabSize < len(t.Vocabulary.Tokens):
-        slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
-        p.VocabSize = uint32(len(t.Vocabulary.Tokens))
-        p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
+        return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
     default:
         slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
     }
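The vocabulary-size fallback is the same on both sides of the hunk above; one side spells it with the standard library's `cmp.Or`, the other with an explicit zero check. A small standalone sketch of that equivalence (the variable names here are illustrative):

```go
package main

import (
	"cmp"
	"fmt"
)

func main() {
	var topLevel, textConfig uint32 = 0, 32000

	// cmp.Or returns the first argument that is not the zero value, so a
	// missing top-level vocab_size falls back to text_config.vocab_size.
	fmt.Println(cmp.Or(topLevel, textConfig)) // 32000

	// The expanded form on the other side of the diff behaves the same way.
	vocabSize := int(topLevel)
	if vocabSize == 0 {
		vocabSize = int(textConfig)
	}
	fmt.Println(vocabSize) // 32000
}
```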
Deleted file: mllama converter in package convert (file path not shown)

@@ -1,160 +0,0 @@ (entire file removed)
package convert

import (
    "strings"

    "github.com/ollama/ollama/fs/ggml"
    "github.com/pdevine/tensor"
    "github.com/pdevine/tensor/native"
)

type mllamaModel struct {
    ModelParameters
    TextModel struct {
        llamaModel

        CrossAttentionLayers []int32 `json:"cross_attention_layers"`
    } `json:"text_config"`
    VisionModel struct {
        NumHiddenLayers uint32 `json:"num_hidden_layers"`
        NumGlobalLayers uint32 `json:"num_global_layers"`
        IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`

        HiddenSize uint32 `json:"hidden_size"`
        IntermediateSize uint32 `json:"intermediate_size"`

        AttentionHeads uint32 `json:"attention_heads"`

        ImageSize uint32 `json:"image_size"`
        PatchSize uint32 `json:"patch_size"`
        NumChannels uint32 `json:"num_channels"`
        MaxNumTiles uint32 `json:"max_num_tiles"`
        NormEpsilon float32 `json:"norm_eps"`
        RopeTheta float32 `json:"rope.freq_base"`
    } `json:"vision_config"`
}

func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
    kv := m.ModelParameters.KV(t)
    kv["general.architecture"] = "mllama"

    for k, v := range m.TextModel.KV(t) {
        if strings.HasPrefix(k, "llama.") {
            kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
        }
    }

    kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers

    kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
    kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
    kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices

    kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
    kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize

    kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
    kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon

    kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
    kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize
    kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
    kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels

    return kv
}

func (m *mllamaModel) Replacements() []string {
    return append(
        m.TextModel.Replacements(),
        "language_model.", "",
        "gate_attn", "attn_gate",
        "gate_ffn", "ffn_gate",
        "cross_attn.", "cross_attn_",
        "vision_model", "v",
        "class_embedding", "class_embd",
        "patch_embedding", "patch_embd",
        "gated_positional_embedding.tile_embedding", "tile_position_embd",
        "gated_positional_embedding.embedding", "position_embd.weight",
        "gated_positional_embedding", "position_embd",
        "embedding.weight", "weight",
        "pre_tile_positional_embedding", "pre_tile_position_embd",
        "post_tile_positional_embedding", "post_tile_position_embd",
        "layernorm_pre", "pre_ln",
        "layernorm_post", "post_ln",
        "global_transformer.layers", "global.blk",
        "transformer.layers", "blk",
        "mlp.fc1", "ffn_up",
        "mlp.fc2", "ffn_down",
        "multi_modal_projector", "mm.0",
    )
}

func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
    var out []*ggml.Tensor
    var text []Tensor
    for _, t := range ts {
        if t.Name() == "v.position_embd.gate" {
            for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
                tt := t.Clone()
                tt.SetRepacker(m.repack(name))
                out = append(out, &ggml.Tensor{
                    Name: name,
                    Kind: t.Kind(),
                    Shape: t.Shape(),
                    WriterTo: tt,
                })
            }
        } else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
            t.SetRepacker(m.repack(t.Name()))
            out = append(out, &ggml.Tensor{
                Name: t.Name(),
                Kind: t.Kind(),
                Shape: t.Shape(),
                WriterTo: t,
            })
        } else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
            out = append(out, &ggml.Tensor{
                Name: t.Name(),
                Kind: t.Kind(),
                Shape: t.Shape(),
                WriterTo: t,
            })
        } else {
            text = append(text, t)
        }
    }

    return append(out, m.TextModel.Tensors(text)...)
}

func (m *mllamaModel) repack(name string) Repacker {
    return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
        dims := make([]int, len(shape))
        for i, dim := range shape {
            dims[i] = int(dim)
        }

        var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

        t, err = tensor.Tanh(t)
        if err != nil {
            return nil, err
        }

        if name == "v.position_embd.gate" {
            t, err = tensor.Sub(float32(1), t)
            if err != nil {
                return nil, err
            }
        }

        t = tensor.Materialize(t)
        // flatten tensor so it can be return as a vector
        if err := t.Reshape(t.Shape().TotalSize()); err != nil {
            return nil, err
        }

        return native.VectorF32(t.(*tensor.Dense))
    }
}
Tensor kind selection (file path not shown)

@@ -38,10 +38,7 @@ const (
 func (t tensorBase) Kind() uint32 {
     if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
         t.name == "token_types.weight" ||
-        t.name == "v.positional_embedding_vlm" ||
-        t.name == "v.tile_position_embd.weight" ||
-        t.name == "v.pre_tile_position_embd.weight" ||
-        t.name == "v.post_tile_position_embd.weight" {
+        t.name == "v.positional_embedding_vlm" {
         // these tensors are always F32
         return 0
     }
GPU discovery (package discover; file path not shown)

@@ -3,6 +3,7 @@
 package discover

 import (
+    "fmt"
     "log/slog"
     "os"
     "regexp"

@@ -59,6 +60,8 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {

     // driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
     if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
+        // The detected driver is older than Feb 2023
+        slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
         return "v11"
     }
     return "v12"
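The added warning fires on the same condition that already routed old drivers to the v11 runtime. A self-contained sketch of that gate follows; the function name and `main` harness are illustrative, and the real code reads the version fields from `CudaGPUInfo`.

```go
package main

import (
	"fmt"
	"log/slog"
)

// pickCudaVariant is an illustrative stand-in for discover.cudaVariant's driver
// check: drivers older than 12.1 (i.e. < 12, or exactly 12.0) fall back to v11.
func pickCudaVariant(driverMajor, driverMinor int) string {
	if driverMajor < 12 || (driverMajor == 12 && driverMinor == 0) {
		slog.Warn("old CUDA driver detected - please upgrade to a newer driver",
			"version", fmt.Sprintf("%d.%d", driverMajor, driverMinor))
		return "v11"
	}
	return "v12"
}

func main() {
	fmt.Println(pickCudaVariant(11, 4)) // v11
	fmt.Println(pickCudaVariant(12, 0)) // v11
	fmt.Println(pickCudaVariant(12, 4)) // v12
}
```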
Library path comment (file path not shown)

@@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v11', 'cuda_v12', 'rocm', etc.
+// 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
     exe, err := os.Executable()
     if err != nil {
docs/api.md — 67 changes

@@ -19,7 +19,7 @@
 ### Model names

-Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations

@@ -952,8 +952,19 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo

 | Type | Recommended |
 | --- | :-: |
+| q2_K | |
+| q3_K_L | |
+| q3_K_M | |
+| q3_K_S | |
+| q4_0 | |
+| q4_1 | |
 | q4_K_M | * |
 | q4_K_S | |
+| q5_0 | |
+| q5_1 | |
+| q5_K_M | |
+| q5_K_S | |
+| q6_K | |
 | q8_0 | * |

 ### Examples

@@ -998,8 +1009,8 @@ Quantize a non-quantized model.

 ```shell
 curl http://localhost:11434/api/create -d '{
-  "model": "llama3.2:quantized",
-  "from": "llama3.2:3b-instruct-fp16",
+  "model": "llama3.1:quantized",
+  "from": "llama3.1:8b-instruct-fp16",
   "quantize": "q4_K_M"
 }'
 ```

@@ -1009,14 +1020,12 @@ curl http://localhost:11434/api/create -d '{
 A stream of JSON objects is returned:

 ```json
-{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
-{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
-{"status":"verifying conversion"}
-{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
-{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
-{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
-{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
+{"status":"quantizing F16 model to Q4_K_M"}
+{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
+{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
+{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
 {"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
+{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
 {"status":"writing manifest"}
 {"status":"success"}
 ```

@@ -1154,37 +1163,29 @@ A single JSON object will be returned.
 {
   "models": [
     {
-      "name": "deepseek-r1:latest",
-      "model": "deepseek-r1:latest",
-      "modified_at": "2025-05-10T08:06:48.639712648-07:00",
-      "size": 4683075271,
-      "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
+      "name": "codellama:13b",
+      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
+      "size": 7365960935,
+      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
       "details": {
-        "parent_model": "",
         "format": "gguf",
-        "family": "qwen2",
-        "families": [
-          "qwen2"
-        ],
-        "parameter_size": "7.6B",
-        "quantization_level": "Q4_K_M"
+        "family": "llama",
+        "families": null,
+        "parameter_size": "13B",
+        "quantization_level": "Q4_0"
       }
     },
     {
-      "name": "llama3.2:latest",
-      "model": "llama3.2:latest",
-      "modified_at": "2025-05-04T17:37:44.706015396-07:00",
-      "size": 2019393189,
-      "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
+      "name": "llama3:latest",
+      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
+      "size": 3825819519,
+      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
       "details": {
-        "parent_model": "",
         "format": "gguf",
         "family": "llama",
-        "families": [
-          "llama"
-        ],
-        "parameter_size": "3.2B",
-        "quantization_level": "Q4_K_M"
+        "families": null,
+        "parameter_size": "7B",
+        "quantization_level": "Q4_0"
      }
    }
  ]
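For reference, the documented quantize request can also be issued from Go. This is a rough sketch equivalent to the curl example above, using only the endpoint and fields shown in the docs; the model names are the ones from the example, and the plain `net/http` call is an assumption rather than an official client.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Same payload as the documented curl example for POST /api/create.
	body, _ := json.Marshal(map[string]string{
		"model":    "llama3.1:quantized",
		"from":     "llama3.1:8b-instruct-fp16",
		"quantize": "q4_K_M",
	})
	resp, err := http.Post("http://localhost:11434/api/create", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The server streams JSON status objects, one per line, as shown above.
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```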
GPU documentation (file path not shown)

@@ -1,6 +1,6 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+.
+Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.

 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)

@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
 In the server log, you will see a message that looks something like this (varies from release to release):

 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
 ```

 **Experimental LLM Library Override**
|
@ -6,7 +6,6 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"math"
|
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -126,7 +125,6 @@ func (kv KV) OllamaEngineRequired() bool {
|
|||||||
"gemma3",
|
"gemma3",
|
||||||
"mistral3",
|
"mistral3",
|
||||||
"llama4",
|
"llama4",
|
||||||
"mllama",
|
|
||||||
"qwen25vl",
|
"qwen25vl",
|
||||||
}, kv.Architecture())
|
}, kv.Architecture())
|
||||||
}
|
}
|
||||||
@ -651,29 +649,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
|||||||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||||
embeddingLength*patchSize +
|
embeddingLength*patchSize +
|
||||||
numPatches*numPatches*headCount)
|
numPatches*numPatches*headCount)
|
||||||
case "qwen25vl":
|
|
||||||
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
|
||||||
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
|
|
||||||
temporalPatchSize := uint64(2)
|
|
||||||
|
|
||||||
// Calculate max possible patches based on max_pixels
|
|
||||||
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
|
|
||||||
maxWidth := maxPixels / maxHeight
|
|
||||||
maxGridHeight := maxHeight / patchSize
|
|
||||||
maxGridWidth := maxWidth / patchSize
|
|
||||||
// Account for merged patches (2x2 grid)
|
|
||||||
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
|
|
||||||
|
|
||||||
// Calculate graph size based on typical operations in ProcessImage and createPatches
|
|
||||||
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
|
||||||
// Normalized pixels
|
|
||||||
maxPixels*numChannels +
|
|
||||||
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
|
|
||||||
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
|
|
||||||
// Self-attention calculations (similar to other architectures)
|
|
||||||
numPatches*numPatches*headCount +
|
|
||||||
// Additional buffer for processing
|
|
||||||
embeddingLength*numPatches)
|
|
||||||
case "llama4":
|
case "llama4":
|
||||||
// vision graph is computed independently in the same schedule
|
// vision graph is computed independently in the same schedule
|
||||||
// and is negligible compared to the worst case text graph
|
// and is negligible compared to the worst case text graph
|
||||||
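With `mllama` dropped from the list, the architecture routing reduces to a plain membership test. A sketch of that check follows, under the assumption that the entries visible in the hunk are the whole list; any entries outside the hunk are not reproduced here.

```go
package main

import (
	"fmt"
	"slices"
)

// ollamaEngineRequired mirrors the membership test in KV.OllamaEngineRequired
// after the change; only the architectures visible in the hunk are listed.
func ollamaEngineRequired(architecture string) bool {
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
		"llama4",
		"qwen25vl",
	}, architecture)
}

func main() {
	fmt.Println(ollamaEngineRequired("llama4")) // true
	fmt.Println(ollamaEngineRequired("mllama")) // false after the removal
}
```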
|
125
fs/ggml/type.go
125
fs/ggml/type.go
@ -12,42 +12,42 @@ type FileType uint32
|
|||||||
const (
|
const (
|
||||||
FileTypeF32 FileType = iota
|
FileTypeF32 FileType = iota
|
||||||
FileTypeF16
|
FileTypeF16
|
||||||
fileTypeQ4_0
|
FileTypeQ4_0
|
||||||
fileTypeQ4_1
|
FileTypeQ4_1
|
||||||
fileTypeQ4_1_F16 // unused by GGML
|
fileTypeQ4_1_F16 // unused by GGML
|
||||||
fileTypeQ4_2 // unused by GGML
|
fileTypeQ4_2 // unused by GGML
|
||||||
fileTypeQ4_3 // unused by GGML
|
fileTypeQ4_3 // unused by GGML
|
||||||
FileTypeQ8_0
|
FileTypeQ8_0
|
||||||
fileTypeQ5_0
|
FileTypeQ5_0
|
||||||
fileTypeQ5_1
|
FileTypeQ5_1
|
||||||
fileTypeQ2_K
|
FileTypeQ2_K
|
||||||
fileTypeQ3_K_S
|
FileTypeQ3_K_S
|
||||||
fileTypeQ3_K_M
|
FileTypeQ3_K_M
|
||||||
fileTypeQ3_K_L
|
FileTypeQ3_K_L
|
||||||
FileTypeQ4_K_S
|
FileTypeQ4_K_S
|
||||||
FileTypeQ4_K_M
|
FileTypeQ4_K_M
|
||||||
fileTypeQ5_K_S
|
FileTypeQ5_K_S
|
||||||
fileTypeQ5_K_M
|
FileTypeQ5_K_M
|
||||||
fileTypeQ6_K
|
FileTypeQ6_K
|
||||||
fileTypeIQ2_XXS
|
fileTypeIQ2_XXS // not supported by ollama
|
||||||
fileTypeIQ2_XS
|
fileTypeIQ2_XS // not supported by ollama
|
||||||
fileTypeQ2_K_S
|
FileTypeQ2_K_S
|
||||||
fileTypeIQ3_XS
|
fileTypeIQ3_XS // not supported by ollama
|
||||||
fileTypeIQ3_XXS
|
fileTypeIQ3_XXS // not supported by ollama
|
||||||
fileTypeIQ1_S
|
fileTypeIQ1_S // not supported by ollama
|
||||||
fileTypeIQ4_NL
|
fileTypeIQ4_NL // not supported by ollama
|
||||||
fileTypeIQ3_S
|
fileTypeIQ3_S // not supported by ollama
|
||||||
fileTypeIQ3_M
|
fileTypeIQ3_M // not supported by ollama
|
||||||
fileTypeIQ2_S
|
fileTypeIQ2_S // not supported by ollama
|
||||||
fileTypeIQ2_M
|
fileTypeIQ2_M // not supported by ollama
|
||||||
fileTypeIQ4_XS
|
fileTypeIQ4_XS // not supported by ollama
|
||||||
fileTypeIQ1_M
|
fileTypeIQ1_M // not supported by ollama
|
||||||
FileTypeBF16
|
FileTypeBF16
|
||||||
fileTypeQ4_0_4_4 // unused by GGML
|
fileTypeQ4_0_4_4 // unused by GGML
|
||||||
fileTypeQ4_0_4_8 // unused by GGML
|
fileTypeQ4_0_4_8 // unused by GGML
|
||||||
fileTypeQ4_0_8_8 // unused by GGML
|
fileTypeQ4_0_8_8 // unused by GGML
|
||||||
fileTypeTQ1_0
|
fileTypeTQ1_0 // not supported by ollama
|
||||||
fileTypeTQ2_0
|
fileTypeTQ2_0 // not supported by ollama
|
||||||
|
|
||||||
FileTypeUnknown = 1024
|
FileTypeUnknown = 1024
|
||||||
)
|
)
|
||||||
@ -60,12 +60,36 @@ func ParseFileType(s string) (FileType, error) {
|
|||||||
return FileTypeF32, nil
|
return FileTypeF32, nil
|
||||||
case "F16":
|
case "F16":
|
||||||
return FileTypeF16, nil
|
return FileTypeF16, nil
|
||||||
|
case "Q4_0":
|
||||||
|
return FileTypeQ4_0, nil
|
||||||
|
case "Q4_1":
|
||||||
|
return FileTypeQ4_1, nil
|
||||||
case "Q8_0":
|
case "Q8_0":
|
||||||
return FileTypeQ8_0, nil
|
return FileTypeQ8_0, nil
|
||||||
|
case "Q5_0":
|
||||||
|
return FileTypeQ5_0, nil
|
||||||
|
case "Q5_1":
|
||||||
|
return FileTypeQ5_1, nil
|
||||||
|
case "Q2_K":
|
||||||
|
return FileTypeQ2_K, nil
|
||||||
|
case "Q3_K_S":
|
||||||
|
return FileTypeQ3_K_S, nil
|
||||||
|
case "Q3_K_M":
|
||||||
|
return FileTypeQ3_K_M, nil
|
||||||
|
case "Q3_K_L":
|
||||||
|
return FileTypeQ3_K_L, nil
|
||||||
case "Q4_K_S":
|
case "Q4_K_S":
|
||||||
return FileTypeQ4_K_S, nil
|
return FileTypeQ4_K_S, nil
|
||||||
case "Q4_K_M", "Q4_K":
|
case "Q4_K_M", "Q4_K":
|
||||||
return FileTypeQ4_K_M, nil
|
return FileTypeQ4_K_M, nil
|
||||||
|
case "Q5_K_S":
|
||||||
|
return FileTypeQ5_K_S, nil
|
||||||
|
case "Q5_K_M", "Q5_K":
|
||||||
|
return FileTypeQ5_K_M, nil
|
||||||
|
case "Q6_K":
|
||||||
|
return FileTypeQ6_K, nil
|
||||||
|
case "Q2_K_S":
|
||||||
|
return FileTypeQ2_K_S, nil
|
||||||
case "BF16":
|
case "BF16":
|
||||||
return FileTypeBF16, nil
|
return FileTypeBF16, nil
|
||||||
default:
|
default:
|
||||||
@ -87,41 +111,40 @@ func ParseFileType(s string) (FileType, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (t FileType) String() string {
|
func (t FileType) String() string {
|
||||||
// Note: this routine will return a broader set of file types for existing models
|
|
||||||
switch t {
|
switch t {
|
||||||
case FileTypeF32:
|
case FileTypeF32:
|
||||||
return "F32"
|
return "F32"
|
||||||
case FileTypeF16:
|
case FileTypeF16:
|
||||||
return "F16"
|
return "F16"
|
||||||
case fileTypeQ4_0:
|
case FileTypeQ4_0:
|
||||||
return "Q4_0"
|
return "Q4_0"
|
||||||
case fileTypeQ4_1:
|
case FileTypeQ4_1:
|
||||||
return "Q4_1"
|
return "Q4_1"
|
||||||
case FileTypeQ8_0:
|
case FileTypeQ8_0:
|
||||||
return "Q8_0"
|
return "Q8_0"
|
||||||
case fileTypeQ5_0:
|
case FileTypeQ5_0:
|
||||||
return "Q5_0"
|
return "Q5_0"
|
||||||
case fileTypeQ5_1:
|
case FileTypeQ5_1:
|
||||||
return "Q5_1"
|
return "Q5_1"
|
||||||
case fileTypeQ2_K:
|
case FileTypeQ2_K:
|
||||||
return "Q2_K"
|
return "Q2_K"
|
||||||
case fileTypeQ3_K_S:
|
case FileTypeQ3_K_S:
|
||||||
return "Q3_K_S"
|
return "Q3_K_S"
|
||||||
case fileTypeQ3_K_M:
|
case FileTypeQ3_K_M:
|
||||||
return "Q3_K_M"
|
return "Q3_K_M"
|
||||||
case fileTypeQ3_K_L:
|
case FileTypeQ3_K_L:
|
||||||
return "Q3_K_L"
|
return "Q3_K_L"
|
||||||
case FileTypeQ4_K_S:
|
case FileTypeQ4_K_S:
|
||||||
return "Q4_K_S"
|
return "Q4_K_S"
|
||||||
case FileTypeQ4_K_M:
|
case FileTypeQ4_K_M:
|
||||||
return "Q4_K_M"
|
return "Q4_K_M"
|
||||||
case fileTypeQ5_K_S:
|
case FileTypeQ5_K_S:
|
||||||
return "Q5_K_S"
|
return "Q5_K_S"
|
||||||
case fileTypeQ5_K_M:
|
case FileTypeQ5_K_M:
|
||||||
return "Q5_K_M"
|
return "Q5_K_M"
|
||||||
case fileTypeQ6_K:
|
case FileTypeQ6_K:
|
||||||
return "Q6_K"
|
return "Q6_K"
|
||||||
case fileTypeQ2_K_S:
|
case FileTypeQ2_K_S:
|
||||||
return "Q2_K_S"
|
return "Q2_K_S"
|
||||||
case FileTypeBF16:
|
case FileTypeBF16:
|
||||||
return "BF16"
|
return "BF16"
|
||||||
@ -140,35 +163,35 @@ func (ftype FileType) ToTensorType() TensorType {
|
|||||||
return TensorTypeF32
|
return TensorTypeF32
|
||||||
case FileTypeF16:
|
case FileTypeF16:
|
||||||
return TensorTypeF16
|
return TensorTypeF16
|
||||||
case fileTypeQ4_0:
|
case FileTypeQ4_0:
|
||||||
return TensorTypeQ4_0
|
return TensorTypeQ4_0
|
||||||
case fileTypeQ4_1:
|
case FileTypeQ4_1:
|
||||||
return TensorTypeQ4_1
|
return TensorTypeQ4_1
|
||||||
case FileTypeQ8_0:
|
case FileTypeQ8_0:
|
||||||
return TensorTypeQ8_0
|
return TensorTypeQ8_0
|
||||||
case fileTypeQ5_0:
|
case FileTypeQ5_0:
|
||||||
return TensorTypeQ5_0
|
return TensorTypeQ5_0
|
||||||
case fileTypeQ5_1:
|
case FileTypeQ5_1:
|
||||||
return TensorTypeQ5_1
|
return TensorTypeQ5_1
|
||||||
case fileTypeQ2_K:
|
case FileTypeQ2_K:
|
||||||
return TensorTypeQ2_K
|
return TensorTypeQ2_K
|
||||||
case fileTypeQ3_K_S:
|
case FileTypeQ3_K_S:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case fileTypeQ3_K_M:
|
case FileTypeQ3_K_M:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case fileTypeQ3_K_L:
|
case FileTypeQ3_K_L:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case FileTypeQ4_K_S:
|
case FileTypeQ4_K_S:
|
||||||
return TensorTypeQ4_K
|
return TensorTypeQ4_K
|
||||||
case FileTypeQ4_K_M:
|
case FileTypeQ4_K_M:
|
||||||
return TensorTypeQ4_K
|
return TensorTypeQ4_K
|
||||||
case fileTypeQ5_K_S:
|
case FileTypeQ5_K_S:
|
||||||
return TensorTypeQ5_K
|
return TensorTypeQ5_K
|
||||||
case fileTypeQ5_K_M:
|
case FileTypeQ5_K_M:
|
||||||
return TensorTypeQ5_K
|
return TensorTypeQ5_K
|
||||||
case fileTypeQ6_K:
|
case FileTypeQ6_K:
|
||||||
return TensorTypeQ6_K
|
return TensorTypeQ6_K
|
||||||
case fileTypeQ2_K_S:
|
case FileTypeQ2_K_S:
|
||||||
return TensorTypeQ2_K
|
return TensorTypeQ2_K
|
||||||
case FileTypeBF16:
|
case FileTypeBF16:
|
||||||
return TensorTypeBF16
|
return TensorTypeBF16
|
||||||
|
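With the constants exported, the new `ParseFileType` cases round-trip through `String` and `ToTensorType`. A usage sketch, assuming the package is imported as `github.com/ollama/ollama/fs/ggml` (that import path appears elsewhere in this comparison); the loop and printed format are illustrative.

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	// Quantization names newly accepted by ParseFileType after this change.
	for _, name := range []string{"Q4_0", "Q5_K_M", "Q6_K", "Q2_K_S"} {
		ft, err := ggml.ParseFileType(name)
		if err != nil {
			fmt.Println(name, "not accepted:", err)
			continue
		}
		fmt.Printf("%s -> %v (tensor type %v)\n", name, ft, ft.ToTensorType())
	}
}
```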
llama/llama.cpp/include/llama.h (vendored) — 6 changes

@@ -258,6 +258,7 @@ extern "C" {
     llama_token * token;
     float * embd;
+    int32_t n_embd;
     llama_pos * pos;
     int32_t * n_seq_id;
     llama_seq_id ** seq_id;

@@ -365,6 +366,7 @@ extern "C" {
     bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
     bool no_perf; // whether to measure performance timings
     bool op_offload; // whether to offload host tensor operations to device
+    bool cross_attn; // whether to use cross attention
 };

 // model quantization parameters

@@ -464,6 +466,10 @@ extern "C" {
     struct llama_context_params params),
     "use llama_init_from_model instead");

+    // TODO (jmorganca): this should most likely be passed in as part of a batch
+    // and not set on the context for all batches.
+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
llama/llama.cpp/src/llama-arch.cpp (vendored) — 44 changes

@@ -6,6 +6,7 @@
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_MLLAMA, "mllama" },
     { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI, "deci" },
     { LLM_ARCH_FALCON, "falcon" },

@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
     },
 },
+{
+    LLM_ARCH_MLLAMA,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+        { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+        { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" },
+        { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" },
+        { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" },
+        { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" },
+        { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" },
+        { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" },
+        { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
+        { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" },
+    },
+},
 {
     LLM_ARCH_DECI,
     {

@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
     {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
llama/llama.cpp/src/llama-arch.h (vendored) — 10 changes

@@ -11,6 +11,7 @@
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
+    LLM_ARCH_MLLAMA,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,

@@ -148,6 +149,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

@@ -349,6 +351,14 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_BSKCN_TV,
+    LLM_TENSOR_CROSS_ATTN_K_NORM,
+    LLM_TENSOR_CROSS_ATTN_K_PROJ,
+    LLM_TENSOR_CROSS_ATTN_O_PROJ,
+    LLM_TENSOR_CROSS_ATTN_Q_NORM,
+    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+    LLM_TENSOR_CROSS_ATTN_V_PROJ,
+    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
     LLM_TENSOR_CONV1D,
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
llama/llama.cpp/src/llama-batch.cpp (vendored, 3 changes)
@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one(
         /*n_tokens =*/ n_tokens,
         /*tokens   =*/ tokens,
         /*embd     =*/ nullptr,
+        /*n_embd   =*/ 0,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id   =*/ nullptr,
@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_tokens =*/ 0,
         /*tokens   =*/ nullptr,
         /*embd     =*/ nullptr,
+        /*n_embd   =*/ 0,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id   =*/ nullptr,
@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+        batch.n_embd = embd;
     } else {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }
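[Note] The new n_embd field records the width of an embedding-only batch so downstream code no longer has to reach back into the model hparams. A minimal sketch of how a caller might fill such a batch against this branch; the helper name make_embd_batch is hypothetical:

    // Sketch only; assumes the llama_batch from this branch, which carries n_embd.
    #include "llama.h"
    #include <algorithm>
    #include <vector>

    // Hypothetical helper: wrap n_tokens embeddings of width n_embd in a batch.
    static llama_batch make_embd_batch(const std::vector<float> & embd, int32_t n_tokens, int32_t n_embd) {
        llama_batch batch = llama_batch_init(n_tokens, n_embd, /*n_seq_max=*/ 1);
        // llama_batch_init() now records batch.n_embd = embd when embd != 0;
        // the caller only copies the data and fills positions/seq ids as usual.
        std::copy(embd.begin(), embd.end(), batch.embd);
        batch.n_tokens = n_tokens;
        return batch;
    }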
llama/llama.cpp/src/llama-context.cpp (vendored, 41 changes)
@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
             throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
         }

-        return logits + j*model.vocab.n_tokens();
+        return logits + j*model.hparams.n_vocab;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) {
     cparams.warmup = value;
 }

+void llama_context::set_cross_attn(bool value) {
+    cparams.cross_attn = value;
+}
+
 void llama_context::set_adapter_lora(
             llama_adapter_lora * adapter,
             float scale) {
@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) {

     const int64_t n_embd = hparams.n_embd;

-    llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+    llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);

     const llama_ubatch ubatch = sbatch.split_simple(n_tokens);

@@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) {

     const llama_batch & batch = batch_allocr.batch;

-    const auto & vocab   = model.vocab;
     const auto & hparams = model.hparams;

-    const int32_t n_vocab = vocab.n_tokens();
+    const int32_t n_vocab = hparams.n_vocab;

     const int64_t n_tokens_all = batch.n_tokens;
     const int64_t n_embd       = hparams.n_embd;
@@ -947,12 +950,9 @@ int llama_context::decode(llama_batch & inp_batch) {

         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            kv_self->defrag_sched(-1.0f);
-            kv_self->update(*this);
-            if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-                return 1;
-            }
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+
+            return 1;
         }

         ggml_backend_sched_reset(sched.get());
@@ -1090,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     // make the outputs have the same order they had in the user-provided batch
     // note: this is mostly relevant for recurrent models atm
     if (!sorted_output) {
-        const uint32_t n_vocab = model.vocab.n_tokens();
+        const uint32_t n_vocab = model.hparams.n_vocab;
         const uint32_t n_embd  = model.hparams.n_embd;

         GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -1145,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) {

 int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto & hparams = model.hparams;
-    const auto & vocab   = model.vocab;

     const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());

     const auto n_batch = cparams.n_batch;
-    const auto n_vocab = vocab.n_tokens();
+    const auto n_vocab = hparams.n_vocab;
     const auto n_embd  = hparams.n_embd;

     // TODO: use a per-batch flag for logits presence instead
@@ -1685,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
     {
         LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);

-        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
+        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);

         io.write(&logits_size, sizeof(logits_size));

@@ -1968,12 +1967,9 @@ void llama_context::opt_epoch_iter(

         // TODO: not sure if this is needed
         if (!kv_self->find_slot(ubatch)) {
-            kv_self->defrag_sched(-1.0f);
-            kv_self->update(*this);
-            if (!kv_self->find_slot(ubatch)) {
-                GGML_ABORT("TODO: handle this error");
-            }
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+
+            GGML_ABORT("TODO: handle this error");
         }

         auto * gf = graph_init();
@@ -2097,6 +2093,7 @@ llama_context_params llama_context_default_params() {
         /*.flash_attn   =*/ false,
         /*.no_perf      =*/ true,
         /*.op_offload   =*/ true,
+        /*.cross_attn   =*/ false,
     };

     return result;
@@ -2222,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
     ctx->set_warmup(warmup);
 }

+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+    ctx->set_cross_attn(cross_attention);
+}
+
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }
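[Note] Taken together, set_cross_attn and llama_set_cross_attention expose a per-context switch for the new cross-attention path. A rough usage sketch, assuming the public entry point added above and batches prepared elsewhere:

    // Sketch: decode an image-embedding batch with cross attention enabled,
    // then continue with text. img_batch/txt_batch are assumed to exist.
    llama_set_cross_attention(ctx, true);
    llama_decode(ctx, img_batch);   // fills the cross-attention K/V tensors
    llama_decode(ctx, txt_batch);   // text tokens may now attend to the image
    llama_set_cross_attention(ctx, false);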
llama/llama.cpp/src/llama-context.h (vendored, 1 change)
@@ -72,6 +72,7 @@ struct llama_context {
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
     void set_warmup(bool value);
+    void set_cross_attn(bool value);

     void set_adapter_lora(
             llama_adapter_lora * adapter,
llama/llama.cpp/src/llama-cparams.h (vendored, 1 change)
@@ -31,6 +31,7 @@ struct llama_cparams {
     bool no_perf;
     bool warmup;
     bool op_offload;
+    bool cross_attn;

     enum llama_pooling_type pooling_type;

llama/llama.cpp/src/llama-graph.cpp (vendored, 25 changes)
@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
         }
     }
 }

+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
+    if (ubatch->embd) {
+        ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
+    }
+}
+
 //
 // llm_graph_context
 //
@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }

+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
+    const int64_t n_embd = hparams.n_embd;
+
+    auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
+
+    ggml_tensor * cur = nullptr;
+
+    inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
+    ggml_set_input(inp->cross_attn_state);
+
+    cur = inp->cross_attn_state;
+
+    cb(cur, "inp_cross_attn_state", -1);
+
+    res->add_input(std::move(inp));
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_cross * inp,
         ggml_cgraph * gf,
llama/llama.cpp/src/llama-graph.h (vendored, 12 changes)
@@ -87,6 +87,7 @@ public:

     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
 };

 class llm_graph_input_pos : public llm_graph_input_i {
@@ -284,6 +285,16 @@ public:
     const llama_cross * cross = nullptr;
 };

+class llm_graph_input_cross_attn_state : public llm_graph_input_i {
+public:
+    llm_graph_input_cross_attn_state()          = default;
+    virtual ~llm_graph_input_cross_attn_state() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
+};
+
 //
 // llm_graph_result
 //
@@ -495,6 +506,7 @@ struct llm_graph_context {
     ggml_tensor * build_inp_cls() const;
     ggml_tensor * build_inp_s_copy() const;
     ggml_tensor * build_inp_s_mask() const;
+    ggml_tensor * build_inp_cross_attn_state() const;

     ggml_tensor * build_inp_cross_embd() const;
     ggml_tensor * build_inp_pos_bucket_enc() const;
llama/llama.cpp/src/llama-hparams.cpp (vendored, 4 changes)
@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {

     GGML_ABORT("fatal error");
 }
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
llama/llama.cpp/src/llama-hparams.h (vendored, 7 changes)
@@ -2,6 +2,8 @@

 #include "llama.h"

+#include <algorithm>
+
 #include <array>

 // bump if necessary
@@ -42,6 +44,7 @@ struct llama_hparams {
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
+    uint32_t n_vocab = 0;

     // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
     uint32_t n_embd_head_k_mla = 0;
@@ -56,6 +59,7 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

     std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+    std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;

     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
@@ -159,6 +163,9 @@ struct llama_hparams {
     // Block skip connection
     bool n_bskcn(uint32_t n, uint32_t il) const;

+    // cross attention layers
+    bool cross_attention_layers(uint32_t il) const;
+
     bool is_swa(uint32_t il) const;
 };

llama/llama.cpp/src/llama-kv-cache.cpp (vendored, 14 changes)
@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified(
             throw std::runtime_error("failed to create ggml context for kv cache");
         }

-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_tensor * k, *v;
+
+        // for cross attention layers
+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+            k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+            v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+        } else {
+            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        }
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         k_l.push_back(k);
@@ -451,7 +459,7 @@ void llama_kv_cache_unified::set_full() {
 llama_sbatch llama_kv_cache_unified::sbatch_init(
         const llama_batch & batch,
         bool logits_all) {
-    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+    return llama_sbatch(batch, batch.n_embd, true, logits_all);
 }

 llama_ubatch llama_kv_cache_unified::ubatch_next(
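[Note] The fixed cross-attention K/V length of 6404 appears to correspond to the cross-attention state built in llama-graph.cpp: 1601 positions per tile x 4 tiles = 6404 rows of keys and values per cross-attention layer.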
llama/llama.cpp/src/llama-model-loader.cpp (vendored, 2 changes)
@@ -315,6 +315,8 @@ namespace GGUFMeta {
         return true;
     }

+    template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
+
     template<typename T, size_t N_MAX>
     bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
         const int kid = gguf_find_key(meta.get(), key.c_str());
llama/llama.cpp/src/llama-model.cpp (vendored, 309 changes)
@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE,        hparams.n_vocab,       false);

     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
+    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

-    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
         if (hparams.n_rot != hparams.n_embd_head_k) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
         }
@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.use_kq_norm = false;
                 }
             } break;
+        case LLM_ARCH_MLLAMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_11B; break;
+                    case 100: type = LLM_TYPE_90B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DECI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_ff          = hparams.n_ff();
     const int64_t n_embd_gqa    = n_embd_v_gqa;
-    const int64_t n_vocab       = vocab.n_tokens();
+    const int64_t n_vocab       = hparams.n_vocab;
     const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 }
             } break;
+        case LLM_ARCH_MLLAMA:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+                // output
+                {
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    if (hparams.cross_attention_layers(i)) {
+                        layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
+                        layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
+                        layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
+                        layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
+                        layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
+                        layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
+                        layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
+                        layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    } else {
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+                }
+            } break;
         case LLM_ARCH_DECI:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context {
     }
 };

+struct llm_build_mllama: public llm_graph_context {
+    llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+        ggml_tensor * inpCAS;
+
+        inpL = build_inp_embd(model.tok_embd);
+        inpCAS = build_inp_cross_attn_state();
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            if (hparams.cross_attention_layers(il)) {
+                if (!ubatch.embd && !cparams.cross_attn) {
+                    continue;
+                }
+
+                // cross attention layer
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
+                cb(Qcur, "Qcur", il);
+
+                Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur, * Vcur;
+                if (ubatch.embd) {
+                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Kcur, "Kcur", il);
+
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
+
+                    Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
+                    cb(Vcur, "Vcur", il);
+
+                    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
+                    cb(Vcur, "Vcur", il);
+
+                    Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
+                    cb(Vcur, "Vcur", il);
+
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
+                } else {
+                    Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
+                    cb(Kcur, "Kcur (view)", il);
+
+                    Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
+                    cb(Vcur, "Vcur (view)", il);
+                }
+
+                struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
+                cb(kq, "kq", il);
+
+                // TODO: apply causal masks
+                struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+                cb(kq_soft_max, "kq_soft_max", il);
+
+                Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
+                cb(Vcur, "Vcur", il);
+
+                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
+                cb(kqv, "kqv", il);
+
+                struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
+
+                cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
+
+                cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
+                cb(cur, "cur", il);
+
+                // TODO: do this in place once?
+                cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+
+                // feed-forward network
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+                // TODO: do this inplace once?
+                cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
+                cb(cur, "ffn_out", il);
+
+                cur = build_cvec(cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            } else {
+                // self attention layer
+
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+                if (il == n_layer - 1) {
+                    // skip computing output for unused tokens
+                    struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                    n_tokens = n_outputs;
+                    cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                    inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                }
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+
+                // feed-forward network
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_inp);
+                cb(cur, "ffn_out", il);
+
+                cur = build_cvec(cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            }
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_deci : public llm_graph_context {
     llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph(
         {
             llm = std::make_unique<llm_build_llama>(*this, params, gf);
         } break;
+    case LLM_ARCH_MLLAMA:
+        {
+            llm = std::make_unique<llm_build_mllama>(*this, params, gf);
+        } break;
     case LLM_ARCH_DECI:
         {
             llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
+        case LLM_ARCH_MLLAMA:
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
llama/llama.cpp/src/llama-model.h (vendored, 12 changes)
@@ -11,6 +11,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <stdexcept>

 struct llama_cparams;
 struct llama_ubatch;
@@ -74,6 +75,7 @@ enum llm_type {
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
+    LLM_TYPE_90B,
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
@@ -318,6 +320,16 @@ struct llama_layer {

     struct ggml_tensor * bskcn_tv = nullptr;

+    // cross attention
+    struct ggml_tensor * cross_attn_k_norm = nullptr;
+    struct ggml_tensor * cross_attn_k_proj = nullptr;
+    struct ggml_tensor * cross_attn_o_proj = nullptr;
+    struct ggml_tensor * cross_attn_q_norm = nullptr;
+    struct ggml_tensor * cross_attn_q_proj = nullptr;
+    struct ggml_tensor * cross_attn_v_proj = nullptr;
+    struct ggml_tensor * cross_attn_attn_gate = nullptr;
+    struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
     struct llama_layer_posnet posnet;

     struct llama_layer_convnext convnext;
llama/llama.cpp/src/llama-quant.cpp (vendored, 4 changes)
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (llama_model_has_encoder(&model)) {
             n_attn_layer *= 3;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        if (qs.n_attention_wv != n_attn_layer) {
+            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+        }
     }

     size_t total_size_org = 0;
llama/llama.cpp/src/llama-vocab.cpp (vendored, 2 changes)
@@ -1469,6 +1469,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
     if (precompiled_charsmap_keyidx != -1) {
         const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+        GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
         const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
         const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
         precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
llama/llama.cpp/tools/mtmd/llava.cpp (vendored, 5 changes)
@@ -462,7 +462,7 @@ struct llava_embd_batch {
     std::vector<llama_seq_id *> seq_ids;
     std::vector<int8_t>         logits;
     llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
         pos     .resize(n_tokens);
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
@@ -474,6 +474,7 @@ struct llava_embd_batch {
             /*n_tokens =*/ n_tokens,
             /*tokens   =*/ nullptr,
             /*embd     =*/ embd,
+            /*n_embd   =*/ n_embd,
             /*pos      =*/ pos.data(),
             /*n_seq_id =*/ n_seq_id.data(),
             /*seq_id   =*/ seq_ids.data(),
@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
             n_eval = n_batch;
         }
         float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
         if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
@@ -17,6 +17,7 @@ package llama
 #include "llava.h"
 #include "gguf.h"

+#include "mllama.h"
 #include "sampling_ext.h"

 extern bool llamaProgressCallback(float progress, void *user_data);
@@ -509,6 +510,63 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
     return embed, nil
 }

+type MllamaContext struct {
+    c *C.struct_mllama_ctx
+}
+
+func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
+    mp := C.CString(modelPath)
+    defer C.free(unsafe.Pointer(mp))
+    c := C.mllama_model_load(mp, 1)
+    if c == nil {
+        return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
+    }
+
+    projEmbedSize := int(C.mllama_n_embd(c))
+    modelEmbedSize := llamaContext.Model().NEmbd()
+    if projEmbedSize != modelEmbedSize {
+        return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
+    }
+
+    return &MllamaContext{c: c}, nil
+}
+
+func (m *MllamaContext) Free() {
+    C.mllama_free(m.c)
+}
+
+func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
+    img := C.mllama_image_init()
+    defer C.mllama_image_free(img)
+
+    ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
+    if !ok {
+        return nil, errors.New("unable to load mllama image data")
+    }
+
+    rows := make([]float32, m.EmbedSize(llamaContext))
+    ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
+    if !ok {
+        return nil, errors.New("unable to make mllama embedding from image")
+    }
+
+    embed := make([][]float32, 1)
+    embed[0] = rows
+
+    return embed, nil
+}
+
+func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
+    numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
+    numEmbed := llamaContext.Model().NEmbd()
+
+    return numTokens * numEmbed
+}
+
+func (c *Context) SetCrossAttention(state bool) {
+    C.llama_set_cross_attention(c.c, C.bool(state))
+}
+
 func (c *Context) Synchronize() {
     C.llama_synchronize(c.c)
 }
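[Note] The Go bindings above are thin wrappers over the C API declared in the new mllama.h. A hedged C++ sketch of the same call sequence; signatures are approximated from the cgo calls above, and the file path, buffer names, and thread count are assumptions:

    // Sketch only; mirrors the calls made by the Go bindings in this branch.
    mllama_ctx * mctx = mllama_model_load("mmproj.gguf", /*verbosity=*/ 1);   // example path

    mllama_image * img = mllama_image_init();
    // data, len, and aspect_ratio_id are assumed to come from the caller
    mllama_image_load_from_data(data, len, 560, 560, 3, 4, aspect_ratio_id, img);

    // one embedding row per (position, tile) pair, each of width mllama_n_embd(mctx)
    std::vector<float> embd(mllama_n_positions(mctx) * mllama_n_tiles(mctx) * mllama_n_embd(mctx));
    mllama_image_encode(mctx, /*n_threads=*/ 4, img, embd.data());

    mllama_image_free(img);
    mllama_free(mctx);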
887
llama/mllama.cpp
vendored
Normal file
887
llama/mllama.cpp
vendored
Normal file
@ -0,0 +1,887 @@
|
|||||||
|
// NOTE: This is modified from clip.cpp for Mllama only
|
||||||
|
#include "mllama.h"
|
||||||
|
|
||||||
|
#include "ggml-alloc.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CANN
|
||||||
|
#include "ggml-cann.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdarg>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#define REQUIRE(x) \
|
||||||
|
do { \
|
||||||
|
if (!(x)) { \
|
||||||
|
throw std::runtime_error("REQUIRE failed: " #x); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
#define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#if __GLIBCXX__
|
||||||
|
#include <cstdio>
|
||||||
|
#include <ext/stdio_filebuf.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct mllama_image {
|
||||||
|
int width;
|
||||||
|
int height;
|
||||||
|
|
||||||
|
int num_channels = 3;
|
||||||
|
int num_tiles = 4;
|
||||||
|
|
||||||
|
int aspect_ratio_id;
|
||||||
|
|
||||||
|
std::vector<float> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::string format(const char *fmt, ...) {
|
||||||
|
va_list args;
|
||||||
|
va_start(args, fmt);
|
||||||
|
std::vector<char> b(128);
|
||||||
|
int n = vsnprintf(b.data(), b.size(), fmt, args);
|
||||||
|
REQUIRE(n >= 0 && n < b.size());
|
||||||
|
va_end(args);
|
||||||
|
return std::string(b.data(), b.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// utilities to get data from a gguf file
|
||||||
|
//
|
||||||
|
|
||||||
|
static int get_key_index(const gguf_context *ctx, const char *key) {
|
||||||
|
int key_index = gguf_find_key(ctx, key);
|
||||||
|
REQUIRE(key_index != -1);
|
||||||
|
return key_index;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
|
||||||
|
const int i = get_key_index(ctx, key.c_str());
|
||||||
|
const int n = gguf_get_arr_n(ctx, i);
|
||||||
|
const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i);
|
||||||
|
|
||||||
|
std::vector<uint32_t> s(n);
|
||||||
|
for (size_t j = 0; j < s.size(); j++) {
|
||||||
|
s[j] = data[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
|
||||||
|
return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
|
||||||
|
}
|
||||||
|
|
||||||
|
static float get_f32(const gguf_context *ctx, const std::string &key) {
|
||||||
|
return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string get_ftype(int ftype) {
|
||||||
|
return ggml_type_name(static_cast<ggml_type>(ftype));
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// mllama layers
|
||||||
|
//
|
||||||
|
|
||||||
|
struct mllama_hparams {
|
||||||
|
uint32_t image_size;
|
||||||
|
uint32_t patch_size;
|
||||||
|
uint32_t hidden_size;
|
||||||
|
uint32_t n_intermediate;
|
||||||
|
uint32_t projection_dim;
|
||||||
|
uint32_t n_head;
|
||||||
|
uint32_t n_layer;
|
||||||
|
uint32_t n_global_layer;
|
||||||
|
uint32_t n_tiles;
|
||||||
|
|
||||||
|
float eps;
|
||||||
|
|
||||||
|
std::vector<bool> intermediate_layers;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mllama_layer {
|
||||||
|
// attention
|
||||||
|
struct ggml_tensor *k_w;
|
||||||
|
struct ggml_tensor *k_b;
|
||||||
|
struct ggml_tensor *q_w;
|
||||||
|
struct ggml_tensor *q_b;
|
||||||
|
struct ggml_tensor *v_w;
|
||||||
|
struct ggml_tensor *v_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *o_w;
|
||||||
|
struct ggml_tensor *o_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *attn_gate;
|
||||||
|
|
||||||
|
// layernorm 1
|
||||||
|
struct ggml_tensor *ln_1_w;
|
||||||
|
struct ggml_tensor *ln_1_b;
|
||||||
|
|
||||||
|
// ff
|
||||||
|
struct ggml_tensor *ff_i_w;
|
||||||
|
struct ggml_tensor *ff_i_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *ff_o_w;
|
||||||
|
struct ggml_tensor *ff_o_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *ff_gate;
|
||||||
|
|
||||||
|
// layernorm 2
|
||||||
|
struct ggml_tensor *ln_2_w;
|
||||||
|
struct ggml_tensor *ln_2_b;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mllama_vision_model {
|
||||||
|
struct mllama_hparams hparams;
|
||||||
|
|
||||||
|
// embeddings
|
||||||
|
struct ggml_tensor *class_embedding;
|
||||||
|
struct ggml_tensor *patch_embeddings;
|
||||||
|
struct ggml_tensor *position_embeddings;
|
||||||
|
struct ggml_tensor *position_embeddings_gate;
|
||||||
|
struct ggml_tensor *tile_position_embeddings;
|
||||||
|
struct ggml_tensor *tile_position_embeddings_gate;
|
||||||
|
struct ggml_tensor *pre_tile_position_embeddings;
|
||||||
|
struct ggml_tensor *pre_tile_position_embeddings_gate;
|
||||||
|
struct ggml_tensor *post_tile_position_embeddings;
|
||||||
|
struct ggml_tensor *post_tile_position_embeddings_gate;
|
||||||
|
|
||||||
|
struct ggml_tensor *pre_ln_w;
|
||||||
|
struct ggml_tensor *pre_ln_b;
|
||||||
|
|
||||||
|
std::vector<mllama_layer> layers;
|
||||||
|
std::vector<mllama_layer> global_layers;
|
||||||
|
|
||||||
|
struct ggml_tensor *post_ln_w;
|
||||||
|
struct ggml_tensor *post_ln_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *mm_0_w;
|
||||||
|
struct ggml_tensor *mm_0_b;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mllama_ctx {
|
||||||
|
struct mllama_vision_model vision_model;
|
||||||
|
|
||||||
|
uint32_t ftype = 1;
|
||||||
|
|
||||||
|
struct gguf_context *ctx_gguf;
|
||||||
|
struct ggml_context *ctx_data;
|
||||||
|
|
||||||
|
std::vector<uint8_t> buf_compute_meta;
|
||||||
|
|
||||||
|
// memory buffers to evaluate the model
|
||||||
|
ggml_backend_buffer_t params_buffer = nullptr;
|
||||||
|
|
||||||
|
ggml_backend_t backend = nullptr;
|
||||||
|
ggml_gallocr_t compute_alloc = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
static ggml_tensor *mllama_image_build_encoder_layer(
|
||||||
|
struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
|
||||||
|
const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
|
||||||
|
struct ggml_tensor *cur = embeddings;
|
||||||
|
|
||||||
|
{
|
||||||
|
// layernorm1
|
||||||
|
cur = ggml_norm(ctx0, cur, eps);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
|
||||||
|
ggml_set_name(cur, format("%d pre layernorm", il).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// self-attention
|
||||||
|
struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||||
|
if (layer.q_b != nullptr) {
|
||||||
|
Q = ggml_add(ctx0, Q, layer.q_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
|
||||||
|
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
||||||
|
ggml_set_name(Q, format("%d query", il).c_str());
|
||||||
|
|
||||||
|
struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||||
|
if (layer.k_b != nullptr) {
|
||||||
|
K = ggml_add(ctx0, K, layer.k_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
|
||||||
|
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||||
|
ggml_set_name(K, format("%d key", il).c_str());
|
||||||
|
|
||||||
|
struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||||
|
if (layer.v_b != nullptr) {
|
||||||
|
V = ggml_add(ctx0, V, layer.v_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
|
||||||
|
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||||
|
ggml_set_name(V, format("%d value", il).c_str());
|
||||||
|
|
||||||
|
struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
|
||||||
|
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
||||||
|
ggml_set_name(KQ, format("%d KQ", il).c_str());
|
||||||
|
|
||||||
|
struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
|
||||||
|
KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
|
||||||
|
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
|
||||||
|
ggml_set_name(KQV, format("%d KQV", il).c_str());
|
||||||
|
|
||||||
|
cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
|
||||||
|
if (layer.o_b != nullptr) {
|
||||||
|
cur = ggml_add(ctx0, cur, layer.o_b);
|
||||||
|
}
|
||||||
|
ggml_set_name(cur, format("%d self attention", il).c_str());
|
||||||
|
|
||||||
|
if (layer.attn_gate != nullptr) {
|
||||||
|
cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
|
||||||
|
ggml_set_name(cur, format("%d self attention gate", il).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur, embeddings);
|
||||||
|
ggml_set_name(cur, format("%d residual", il).c_str());
|
||||||
|
|
||||||
|
embeddings = cur;
|
||||||
|
|
||||||
|
{
|
||||||
|
// layernorm2
|
||||||
|
cur = ggml_norm(ctx0, cur, eps);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
|
||||||
|
ggml_set_name(cur, format("%d post layernorm", il).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// feed forward
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
|
||||||
|
cur = ggml_gelu_inplace(ctx0, cur);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
|
||||||
|
ggml_set_name(cur, format("%d feed forward", il).c_str());
|
||||||
|
|
||||||
|
if (layer.ff_gate != nullptr) {
|
||||||
|
cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
|
||||||
|
ggml_set_name(cur, format("%d feed forward gate", il).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// residual 2
|
||||||
|
cur = ggml_add(ctx0, cur, embeddings);
|
||||||
|
ggml_set_name(cur, format("%d residual", il).c_str());
|
||||||
|
|
||||||
|
embeddings = cur;
|
||||||
|
|
||||||
|
return embeddings;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
    const auto &model = ctx->vision_model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    const int image_size_width = image_size;
    const int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
    const int hidden_size = hparams.hidden_size;
    const int n_head = hparams.n_head;
    const int d_head = hidden_size / n_head;

    const int batch_size = imgs->size;
    REQUIRE(batch_size == 1);

    int num_tiles = 4;
    int num_channels = 3;
    if (imgs->data != nullptr) {
        num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
        num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
    }

    struct ggml_init_params params = {
        ctx->buf_compute_meta.size(), // mem_size
        ctx->buf_compute_meta.data(), // mem_buffer
        true,                         // no_alloc
    };

    struct ggml_context *ctx0 = ggml_init(params);
    struct ggml_cgraph *gf = ggml_new_graph(ctx0);

    struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);

    struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

    struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
    ggml_set_name(aspect_ratios, "aspect_ratios");
    ggml_set_input(aspect_ratios);

    if (model.pre_tile_position_embeddings != nullptr) {
        struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
        ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");

        pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
        if (model.pre_tile_position_embeddings_gate != nullptr) {
            pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
        }

        inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
    }

    struct ggml_tensor *embeddings = inp;

    if (model.class_embedding != nullptr) {
        // concat class_embeddings and patch_embeddings
        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
        ggml_set_name(embeddings, "embeddings");
        ggml_set_input(embeddings);
        for (int i = 0; i < num_tiles; ++i) {
            // repeat class embeddings for each tile
            embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
        }

        embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
    }

    struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
    if (model.position_embeddings_gate != nullptr) {
        position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
    }

    embeddings = ggml_add(ctx0, embeddings, position_embd);

    if (model.tile_position_embeddings != nullptr) {
        struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
        ggml_set_name(tile_position_embeddings, "tile_position_embeddings");

        tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
        if (model.tile_position_embeddings_gate != nullptr) {
            tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
    }

    // pre-layernorm
    if (model.pre_ln_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
        if (model.pre_ln_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
        }

        ggml_set_name(embeddings, "pre layernorm");
    }

    const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8;

    embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);

    std::vector<struct ggml_tensor *> intermediate_embeddings;

    // encoder
    for (size_t il = 0; il < model.layers.size(); il++) {
        if (hparams.intermediate_layers[il]) {
            intermediate_embeddings.push_back(embeddings);
        }

        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
    }

    // post-layernorm
    if (model.post_ln_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
        if (model.post_ln_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
        }

        ggml_set_name(embeddings, "post layernorm");
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);

    if (model.post_tile_position_embeddings != nullptr) {
        struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
        ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");

        post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
        if (model.post_tile_position_embeddings_gate != nullptr) {
            post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);

    // global encoder
    for (size_t il = 0; il < model.global_layers.size(); il++) {
        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.global_layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
    }

    struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
    for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
        stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
    }

    stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
    stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
    embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);

    // mllama projector
    embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
    ggml_set_name(embeddings, "multi modal projector");

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    ggml_free(ctx0);

    return gf;
}

static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
    struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
    REQUIRE(cur != nullptr || optional);
    return cur;
}

static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
    std::vector<struct mllama_layer> layers(n);
    for (size_t i = 0; i < layers.size(); i++) {
        auto &layer = layers[i];
        layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false);
        layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false);
        layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false);
        layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false);

        layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false);
        layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true);
        layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false);
        layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true);
        layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false);
        layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true);
        layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false);
        layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true);

        layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false);
        layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false);
        layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false);
        layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false);

        layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true);
        layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true);
    }

    return layers;
}

// read and create ggml_context containing the tensors and their data
struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
    struct ggml_context *meta = nullptr;

    struct gguf_init_params params = {
        true,  // no_alloc
        &meta, // ctx
    };

    struct gguf_context *ctx = gguf_init_from_file(fname, params);
    REQUIRE(ctx != nullptr);

    if (verbosity >= 1) {
        const int n_tensors = gguf_get_n_tensors(ctx);
        const int n_kv = gguf_get_n_kv(ctx);
        const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
        const int idx_desc = get_key_index(ctx, "general.description");
        const std::string description = gguf_get_val_str(ctx, idx_desc);
        const int idx_name = gguf_find_key(ctx, "general.name");
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
            LOG("model name: %s", name.c_str());
        }
        LOG("description: %s", description.c_str());
        LOG("GGUF version: %d", gguf_get_version(ctx));
        LOG("alignment: %zu", gguf_get_alignment(ctx));
        LOG("n_tensors: %d", n_tensors);
        LOG("n_kv: %d", n_kv);
        LOG("ftype: %s", ftype.c_str());
        LOG("");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);

    mllama_ctx *new_mllama = new mllama_ctx{};

    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == nullptr) {
        LOG("%s: failed to initialize backend\n", __func__);
        mllama_free(new_mllama);
        gguf_free(ctx);
        return nullptr;
    }
    LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
    new_mllama->backend = backend;

    // load tensors
    {
        std::vector<uint8_t> read_buf;
        struct ggml_init_params params = {
            (n_tensors + 1) * ggml_tensor_overhead(), // mem_size
            nullptr,                                  // mem_buffer
            true,                                     // no_alloc
        };

        new_mllama->ctx_data = ggml_init(params);
        if (!new_mllama->ctx_data) {
            LOG("ggml_init() failed");
            mllama_free(new_mllama);
            gguf_free(ctx);
            return nullptr;
        }

#ifdef _WIN32
        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
        if (!wlen) {
            return NULL;
        }
        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
        if (!wlen) {
            free(wbuf);
            return NULL;
        }
#if __GLIBCXX__
        int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
        __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
        std::istream fin(&buffer);
#else // MSVC
        // unused in our current build
        auto fin = std::ifstream(wbuf, std::ios::binary);
#endif
        free(wbuf);
#else
        auto fin = std::ifstream(fname, std::ios::binary);
#endif
        if (!fin) {
            LOG("cannot open model file for loading tensors\n");
            mllama_free(new_mllama);
            gguf_free(ctx);
            return nullptr;
        }

        // add tensors to context
        for (int i = 0; i < n_tensors; ++i) {
            const char *name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor *t = ggml_get_tensor(meta, name);
            struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
            ggml_set_name(cur, name);
        }

        // alloc memory and offload data
        new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
        for (int i = 0; i < n_tensors; ++i) {
            const char *name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
                LOG("failed to seek for tensor %s\n", name);
                mllama_free(new_mllama);
                gguf_free(ctx);
                return nullptr;
            }
            int num_bytes = ggml_nbytes(cur);
            if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
                // for the CPU and Metal backend, we can read directly into the tensor
                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
            } else {
                // read into a temporary buffer first, then copy to device memory
                read_buf.resize(num_bytes);
                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
            }
        }

#if defined(_WIN32) && defined(__GLIBCXX__)
        close(fd);
#else
        fin.close();
#endif
    }

    // vision model
    // load vision model
    auto &vision_model = new_mllama->vision_model;
    auto &hparams = vision_model.hparams;
    hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
    hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
    hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
    hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
    hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
    hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
    hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
    hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
    hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
    hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");

    std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
    hparams.intermediate_layers.resize(hparams.n_layer);
    for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
        hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
    }

    if (verbosity >= 2) {
        LOG("");
        LOG("vision model hparams");
        LOG("image_size %d", hparams.image_size);
        LOG("patch_size %d", hparams.patch_size);
        LOG("v_hidden_size %d", hparams.hidden_size);
        LOG("v_n_intermediate %d", hparams.n_intermediate);
        LOG("v_projection_dim %d", hparams.projection_dim);
        LOG("v_n_head %d", hparams.n_head);
        LOG("v_n_layer %d", hparams.n_layer);
        LOG("v_n_global_layer %d", hparams.n_global_layer);
        LOG("v_eps %f", hparams.eps);
    }

    vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
    vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);

    vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
    vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);

    vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
    vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
    vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
    vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);

    vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
    vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);

    vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
    vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);

    vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
    vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);

    vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
    vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);

    vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
    vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);

    ggml_free(meta);

    new_mllama->ctx_gguf = ctx;

    {
        // measure mem requirement and allocate
        new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
        new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
        struct mllama_image_batch batch;
        batch.size = 1;
        ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
        ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
        LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
    }

    return new_mllama;
}

struct mllama_image *mllama_image_init() {
    return new mllama_image();
}

void mllama_image_free(struct mllama_image *img) { delete img; }
void mllama_image_batch_free(struct mllama_image_batch *batch) {
    if (batch->size > 0) {
        delete[] batch->data;
        batch->size = 0;
    }
}

bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
    img->width = width;
    img->height = height;
    img->num_channels = num_channels;
    img->num_tiles = num_tiles;
    img->aspect_ratio_id = aspect_ratio_id;
    img->data.resize(n);

    memcpy(img->data.data(), data, n);
    return true;
}

inline int mllama(int x, int lower, int upper) {
    return std::max(lower, std::min(x, upper));
}

void mllama_free(mllama_ctx *ctx) {
    ggml_free(ctx->ctx_data);
    gguf_free(ctx->ctx_gguf);

    ggml_backend_buffer_free(ctx->params_buffer);
    ggml_backend_free(ctx->backend);
    ggml_gallocr_free(ctx->compute_alloc);
    delete ctx;
}

bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
    mllama_image_batch imgs{};
    imgs.size = 1;
    imgs.data = img;
    return mllama_image_batch_encode(ctx, n_threads, &imgs, vec);
}

bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
    int batch_size = imgs->size;
    REQUIRE(batch_size == 1);

    // build the inference graph
    ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);

    // set inputs
    const auto &model = ctx->vision_model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    int image_size_width = image_size;
    int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);

    {
        struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
        ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
    }

    {
        struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
        if (embeddings != nullptr) {
            void *zeros = malloc(ggml_nbytes(embeddings));
            memset(zeros, 0, ggml_nbytes(embeddings));
            ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
            free(zeros);
        }
    }

    {
        struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
        if (positions != nullptr) {
            int *positions_data = (int *)malloc(ggml_nbytes(positions));
            for (int i = 0; i < num_positions; i++) {
                positions_data[i] = i;
            }
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
        }
    }

    {
        struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
        if (aspect_ratios != nullptr) {
            int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
            aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
            ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
            free(aspect_ratios_data);
        }
    }

    if (ggml_backend_is_cpu(ctx->backend)) {
        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
    }

    ggml_backend_graph_compute(ctx->backend, gf);

    // the last node is the embedding tensor
    struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);

    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

    return true;
}

int32_t mllama_image_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.image_size;
}

int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.patch_size;
}

int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.hidden_size;
}

int mllama_n_patches(const struct mllama_ctx *ctx) {
    const auto &hparams = ctx->vision_model.hparams;
    return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
}

int mllama_n_positions(const struct mllama_ctx *ctx) {
    return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1);
}

int mllama_n_tiles(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.n_tiles;
}

int mllama_n_embd(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.projection_dim;
}

size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
    return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float);
}
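
For orientation, the following is a minimal sketch of how these entry points fit together: load a model, wrap preprocessed pixel data in an mllama_image, and encode it into a buffer sized with mllama_n_embd_bytes(). The GGUF path, thread count, tile count, and aspect ratio id below are placeholders rather than values taken from this change, and the pixel buffer is assumed to already be tiled and normalized by the caller.

#include <vector>
#include "mllama.h"

// Hypothetical usage sketch; the file name and image parameters are assumptions,
// and the pixel buffer is expected to already hold [width, height, channels, tiles]
// float data produced by an external preprocessing step.
int main() {
    struct mllama_ctx *ctx = mllama_model_load("mllama-vision.gguf", 1);
    if (ctx == nullptr) {
        return 1;
    }

    const int image_size   = mllama_image_size(ctx);
    const int num_channels = 3;
    const int num_tiles    = 4;

    std::vector<float> pixels(image_size * image_size * num_channels * num_tiles, 0.0f);

    struct mllama_image *img = mllama_image_init();
    mllama_image_load_from_data(pixels.data(), pixels.size() * sizeof(float),
                                image_size, image_size, num_channels, num_tiles,
                                /*aspect_ratio_id=*/1, img);

    // positions * projection_dim * tiles floats, as reported by mllama_n_embd_bytes()
    std::vector<float> embd(mllama_n_embd_bytes(ctx) / sizeof(float));
    mllama_image_encode(ctx, /*n_threads=*/4, img, embd.data());

    mllama_image_free(img);
    mllama_free(ctx);
    return 0;
}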
61
llama/mllama.h
vendored
Normal file
@ -0,0 +1,61 @@
#ifndef MLLAMA_H
#define MLLAMA_H

#include <stddef.h>
#include <stdint.h>

#ifdef LLAMA_SHARED
#if defined(_WIN32) && !defined(__MINGW32__)
#ifdef LLAMA_BUILD
#define MLLAMA_API __declspec(dllexport)
#else
#define MLLAMA_API __declspec(dllimport)
#endif
#else
#define MLLAMA_API __attribute__((visibility("default")))
#endif
#else
#define MLLAMA_API
#endif

#ifdef __cplusplus
extern "C" {
#endif

struct mllama_ctx;

struct mllama_image_batch {
    struct mllama_image *data;
    size_t size;
};

MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity);
MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity);

MLLAMA_API void mllama_free(struct mllama_ctx *ctx);

MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx);

MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx);
MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx);

MLLAMA_API struct mllama_image *mllama_image_init();

MLLAMA_API void mllama_image_free(struct mllama_image *img);
MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch);

MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img);

MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec);
MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec);

#ifdef __cplusplus
}
#endif

#endif // MLLAMA_H
@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
1027
llama/patches/0006-add-mllama-support.patch
Normal file
File diff suppressed because it is too large
419
llama/patches/0007-add-unpad-operator.patch
Normal file
@ -0,0 +1,419 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: jmorganca <jmorganca@gmail.com>
|
||||||
|
Date: Sun, 13 Apr 2025 22:10:06 -0400
|
||||||
|
Subject: [PATCH] add unpad operator
|
||||||
|
|
||||||
|
adds the unpad operator to GGML
|
||||||
|
---
|
||||||
|
ggml/include/ggml.h | 10 +++++
|
||||||
|
ggml/src/ggml-cpu/ggml-cpu.c | 5 +++
|
||||||
|
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++
|
||||||
|
ggml/src/ggml-cpu/ops.h | 1 +
|
||||||
|
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
|
||||||
|
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++
|
||||||
|
ggml/src/ggml-cuda/pad.cuh | 1 +
|
||||||
|
ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++
|
||||||
|
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
|
||||||
|
ggml/src/ggml.c | 25 ++++++++++++-
|
||||||
|
10 files changed, 223 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||||
|
index e91dedf1..8dc107ba 100644
|
||||||
|
--- a/ggml/include/ggml.h
|
||||||
|
+++ b/ggml/include/ggml.h
|
||||||
|
@@ -489,6 +489,7 @@ extern "C" {
|
||||||
|
GGML_OP_UPSCALE, // nearest interpolate
|
||||||
|
GGML_OP_PAD,
|
||||||
|
GGML_OP_PAD_REFLECT_1D,
|
||||||
|
+ GGML_OP_UNPAD,
|
||||||
|
GGML_OP_ARANGE,
|
||||||
|
GGML_OP_TIMESTEP_EMBEDDING,
|
||||||
|
GGML_OP_ARGSORT,
|
||||||
|
@@ -1781,6 +1782,15 @@ extern "C" {
|
||||||
|
int p0,
|
||||||
|
int p1);
|
||||||
|
|
||||||
|
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
|
||||||
|
+ GGML_API struct ggml_tensor * ggml_unpad(
|
||||||
|
+ struct ggml_context * ctx,
|
||||||
|
+ struct ggml_tensor * a,
|
||||||
|
+ int p0,
|
||||||
|
+ int p1,
|
||||||
|
+ int p2,
|
||||||
|
+ int p3);
|
||||||
|
+
|
||||||
|
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
||||||
|
// timesteps: [N,]
|
||||||
|
// return: [N, dim]
|
||||||
|
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
|
index a30e67f2..835e6495 100644
|
||||||
|
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
|
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
|
@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
|
{
|
||||||
|
ggml_compute_forward_pad_reflect_1d(params, tensor);
|
||||||
|
} break;
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
+ {
|
||||||
|
+ ggml_compute_forward_unpad(params, tensor);
|
||||||
|
+ } break;
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_arange(params, tensor);
|
||||||
|
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
|
case GGML_OP_UPSCALE:
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
case GGML_OP_PAD_REFLECT_1D:
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_ARGSORT:
|
||||||
|
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||||
|
index 955fec59..1868a10c 100644
|
||||||
|
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||||
|
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||||
|
@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
+// ggml_compute_forward_unpad
|
||||||
|
+
|
||||||
|
+static void ggml_compute_forward_unpad_f32(
|
||||||
|
+ const struct ggml_compute_params *params,
|
||||||
|
+ struct ggml_tensor *dst) {
|
||||||
|
+
|
||||||
|
+ const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
+
|
||||||
|
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||||
|
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
|
||||||
|
+
|
||||||
|
+ const int ith = params->ith;
|
||||||
|
+ const int nth = params->nth;
|
||||||
|
+
|
||||||
|
+ GGML_TENSOR_UNARY_OP_LOCALS
|
||||||
|
+
|
||||||
|
+ float * dst_ptr = (float *) dst->data;
|
||||||
|
+
|
||||||
|
+ // TODO: optimize
|
||||||
|
+
|
||||||
|
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
||||||
|
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
|
||||||
|
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
||||||
|
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
||||||
|
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
|
||||||
|
+
|
||||||
|
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
|
+
|
||||||
|
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
+ dst_ptr[dst_idx] = *src_ptr;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void ggml_compute_forward_unpad(
|
||||||
|
+ const struct ggml_compute_params * params,
|
||||||
|
+ struct ggml_tensor * dst) {
|
||||||
|
+
|
||||||
|
+ const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
+
|
||||||
|
+ switch (src0->type) {
|
||||||
|
+ case GGML_TYPE_F32:
|
||||||
|
+ {
|
||||||
|
+ ggml_compute_forward_unpad_f32(params, dst);
|
||||||
|
+ } break;
|
||||||
|
+ default:
|
||||||
|
+ {
|
||||||
|
+ GGML_ABORT("fatal error");
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
// ggml_compute_forward_arange
|
||||||
|
|
||||||
|
static void ggml_compute_forward_arange_f32(
|
||||||
|
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
|
||||||
|
index dc081b9e..a7125555 100644
|
||||||
|
--- a/ggml/src/ggml-cpu/ops.h
|
||||||
|
+++ b/ggml/src/ggml-cpu/ops.h
|
||||||
|
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
|
||||||
|
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
|
index cb0d8528..6fe86674 100644
|
||||||
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
|
@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
ggml_cuda_op_pad(ctx, dst);
|
||||||
|
break;
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
+ ggml_cuda_op_unpad(ctx, dst);
|
||||||
|
+ break;
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
ggml_cuda_op_arange(ctx, dst);
|
||||||
|
break;
|
||||||
|
@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||||
|
case GGML_OP_UPSCALE:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_LEAKY_RELU:
|
||||||
|
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
|
||||||
|
index 77432b04..7d45a7e1 100644
|
||||||
|
--- a/ggml/src/ggml-cuda/pad.cu
|
||||||
|
+++ b/ggml/src/ggml-cuda/pad.cu
|
||||||
|
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||||
|
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
|
||||||
|
+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
|
||||||
|
+ // blockIdx.y: idx of ne1
|
||||||
|
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
|
||||||
|
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
|
+ if (nidx >= ne0) {
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // operation
|
||||||
|
+ int offset_dst =
|
||||||
|
+ nidx +
|
||||||
|
+ blockIdx.y * ne0 +
|
||||||
|
+ blockIdx.z * ne0 * gridDim.y;
|
||||||
|
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
|
||||||
|
+ int offset_src =
|
||||||
|
+ nidx +
|
||||||
|
+ blockIdx.y * ne00 +
|
||||||
|
+ blockIdx.z * ne00 * ne01;
|
||||||
|
+ dst[offset_dst] = x[offset_src];
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static void unpad_f32_cuda(const float * x, float * dst,
|
||||||
|
+ const int ne00, const int ne01, const int ne02, const int ne03,
|
||||||
|
+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
|
||||||
|
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
|
||||||
|
+ dim3 gridDim(num_blocks, ne1, ne2*ne3);
|
||||||
|
+ unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
+ const ggml_tensor * src0 = dst->src[0];
|
||||||
|
+ const float * src0_d = (const float *)src0->data;
|
||||||
|
+ float * dst_d = (float *)dst->data;
|
||||||
|
+ cudaStream_t stream = ctx.stream();
|
||||||
|
+
|
||||||
|
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||||
|
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||||
|
+
|
||||||
|
+ unpad_f32_cuda(src0_d, dst_d,
|
||||||
|
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||||
|
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
|
||||||
|
+}
|
||||||
|
\ No newline at end of file
|
||||||
|
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
|
||||||
|
index 8fd386b0..e2ededc3 100644
|
||||||
|
--- a/ggml/src/ggml-cuda/pad.cuh
|
||||||
|
+++ b/ggml/src/ggml-cuda/pad.cuh
|
||||||
|
@@ -3,3 +3,4 @@
|
||||||
|
#define CUDA_PAD_BLOCK_SIZE 256
|
||||||
|
|
||||||
|
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||||
|
index 1b56f858..7641247e 100644
|
||||||
|
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||||
|
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||||
|
@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||||
|
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
|
||||||
|
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||||
|
@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
||||||
|
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||||
|
@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
case GGML_OP_PAD_REFLECT_1D:
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_ARGSORT:
|
||||||
|
case GGML_OP_LEAKY_RELU:
|
||||||
|
@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
|
||||||
|
|
||||||
|
const int nth = MIN(1024, ne0);
|
||||||
|
|
||||||
|
+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
|
+ } break;
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
+ {
|
||||||
|
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
+
|
||||||
|
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
|
||||||
|
+
|
||||||
|
+ [encoder setComputePipelineState:pipeline];
|
||||||
|
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
|
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||||
|
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||||
|
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||||
|
+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||||
|
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||||
|
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||||
|
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||||
|
+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||||
|
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||||
|
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||||
|
+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||||
|
+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||||
|
+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||||
|
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||||
|
+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||||
|
+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||||
|
+
|
||||||
|
+ const int nth = MIN(1024, ne0);
|
||||||
|
+
|
||||||
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
|
} break;
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
index 9cfddf45..080a943b 100644
|
||||||
|
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
+kernel void kernel_unpad_f32(
|
||||||
|
+ device const char * src0,
|
||||||
|
+ device char * dst,
|
||||||
|
+ constant int64_t & ne00,
|
||||||
|
+ constant int64_t & ne01,
|
||||||
|
+ constant int64_t & ne02,
|
||||||
|
+ constant int64_t & ne03,
|
||||||
|
+ constant uint64_t & nb00,
|
||||||
|
+ constant uint64_t & nb01,
|
||||||
|
+ constant uint64_t & nb02,
|
||||||
|
+ constant uint64_t & nb03,
|
||||||
|
+ constant int64_t & ne0,
|
||||||
|
+ constant int64_t & ne1,
|
||||||
|
+ constant int64_t & ne2,
|
||||||
|
+ constant int64_t & ne3,
|
||||||
|
+ constant uint64_t & nb0,
|
||||||
|
+ constant uint64_t & nb1,
|
||||||
|
+ constant uint64_t & nb2,
|
||||||
|
+ constant uint64_t & nb3,
|
||||||
|
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
+ uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
+ uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
+
|
||||||
|
+ const int64_t i3 = tgpig.z;
|
||||||
|
+ const int64_t i2 = tgpig.y;
|
||||||
|
+ const int64_t i1 = tgpig.x;
|
||||||
|
+
|
||||||
|
+ const int64_t i03 = i3;
|
||||||
|
+ const int64_t i02 = i2;
|
||||||
|
+ const int64_t i01 = i1;
|
||||||
|
+
|
||||||
|
+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||||
|
+
|
||||||
|
+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||||
|
+ if (i0 < ne00) {
|
||||||
|
+ dst_ptr[i0] = src0_ptr[i0];
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
kernel void kernel_arange_f32(
|
||||||
|
device char * dst,
|
||||||
|
constant ggml_metal_kargs_arange & args,
|
||||||
|
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||||
|
index 8a654624..6b034d35 100644
|
||||||
|
--- a/ggml/src/ggml.c
|
||||||
|
+++ b/ggml/src/ggml.c
|
||||||
|
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
|
"UPSCALE",
|
||||||
|
"PAD",
|
||||||
|
"PAD_REFLECT_1D",
|
||||||
|
+ "UNPAD",
|
||||||
|
"ARANGE",
|
||||||
|
"TIMESTEP_EMBEDDING",
|
||||||
|
"ARGSORT",
|
||||||
|
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
|
"OPT_STEP_ADAMW",
|
||||||
|
};
|
||||||
|
|
||||||
|
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||||
|
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||||
|
|
||||||
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
|
"none",
|
||||||
|
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
|
"upscale(x)",
|
||||||
|
"pad(x)",
|
||||||
|
"pad_reflect_1d(x)",
|
||||||
|
+ "unpad(x)",
|
||||||
|
"arange(start, stop, step)",
|
||||||
|
"timestep_embedding(timesteps, dim, max_period)",
|
||||||
|
"argsort(x)",
|
||||||
|
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
|
"adamw(x)",
|
||||||
|
};
|
||||||
|
|
||||||
|
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||||
|
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||||
|
|
||||||
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||||
|
|
||||||
|
@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
+// ggml_unpad
|
||||||
|
+
|
||||||
|
+struct ggml_tensor * ggml_unpad(
|
||||||
|
+ struct ggml_context * ctx,
|
||||||
|
+ struct ggml_tensor * a,
|
||||||
|
+ int p0, int p1, int p2, int p3) {
|
||||||
|
+
|
||||||
|
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
|
||||||
|
+ a->ne[0] - p0,
|
||||||
|
+ a->ne[1] - p1,
|
||||||
|
+ a->ne[2] - p2,
|
||||||
|
+ a->ne[3] - p3);
|
||||||
|
+
|
||||||
|
+ result->op = GGML_OP_UNPAD;
|
||||||
|
+ result->src[0] = a;
|
||||||
|
+
|
||||||
|
+ return result;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
// ggml_arange
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_arange(
|
@ -15,50 +15,13 @@ but this can leave a cache that still does not have adequate space
|
|||||||
even after defragmentation is triggered. Instead, we should do
|
even after defragmentation is triggered. Instead, we should do
|
||||||
multiple batches of processing until everything is complete.
|
multiple batches of processing until everything is complete.
|
||||||
---
|
---
|
||||||
src/llama-context.cpp | 18 ++++---
|
|
||||||
src/llama-context.h | 1 +
|
src/llama-context.h | 1 +
|
||||||
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
|
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
|
||||||
src/llama-kv-cache.h | 12 ++++-
|
src/llama-kv-cache.h | 12 ++++-
|
||||||
4 files changed, 59 insertions(+), 79 deletions(-)
|
3 files changed, 47 insertions(+), 73 deletions(-)
|
||||||
|
|
||||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
|
||||||
index c22687e4..c5948e8f 100644
|
|
||||||
--- a/src/llama-context.cpp
|
|
||||||
+++ b/src/llama-context.cpp
|
|
||||||
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|
||||||
|
|
||||||
// find KV slot
|
|
||||||
if (!kv_self->find_slot(ubatch)) {
|
|
||||||
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
|
||||||
-
|
|
||||||
- return 1;
|
|
||||||
+ kv_self->defrag_sched(-1.0f);
|
|
||||||
+ kv_self->update(*this);
|
|
||||||
+ if (!kv_self->find_slot(ubatch)) {
|
|
||||||
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_sched_reset(sched.get());
|
|
||||||
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
|
|
||||||
|
|
||||||
// TODO: not sure if this is needed
|
|
||||||
if (!kv_self->find_slot(ubatch)) {
|
|
||||||
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
|
||||||
-
|
|
||||||
- GGML_ABORT("TODO: handle this error");
|
|
||||||
+ kv_self->defrag_sched(-1.0f);
|
|
||||||
+ kv_self->update(*this);
|
|
||||||
+ if (!kv_self->find_slot(ubatch)) {
|
|
||||||
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
|
||||||
+ GGML_ABORT("TODO: handle this error");
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
|
|
||||||
auto * gf = graph_init();
|
|
||||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||||
index c0ceacb1..0264e937 100644
|
index c4ab242a..9970dfc6 100644
|
||||||
--- a/src/llama-context.h
|
--- a/src/llama-context.h
|
||||||
+++ b/src/llama-context.h
|
+++ b/src/llama-context.h
|
||||||
@@ -5,6 +5,7 @@
|
@@ -5,6 +5,7 @@
|
||||||
@ -70,10 +33,10 @@ index c0ceacb1..0264e937 100644
|
|||||||
#include "ggml-cpp.h"
|
#include "ggml-cpp.h"
|
||||||
#include "ggml-opt.h"
|
#include "ggml-opt.h"
|
||||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 3dcad65b..60e67b03 100644
index a7b0a7eb..1a50c034 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() {
@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() {
}

bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -82,7 +45,7 @@ index 3dcad65b..60e67b03 100644
auto * sched = lctx.get_sched();

if (has_shift) {
@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
res->set_inputs(nullptr);

lctx.graph_compute(gf, false);
@@ -91,7 +54,7 @@ index 3dcad65b..60e67b03 100644
}

{
@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {

if (do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@@ -133,7 +96,7 @@ index 3dcad65b..60e67b03 100644
}

void llama_kv_cache_unified::defrag_sched(float thold) {
@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
@@ -147,7 +110,7 @@ index 3dcad65b..60e67b03 100644
#if 0
// CPU defrag
//
@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
@@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
@@ -185,7 +148,7 @@ index 3dcad65b..60e67b03 100644

ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx, v_l[il],
@@ -225,7 +188,7 @@ index 3dcad65b..60e67b03 100644
}

//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {

assert(n_used <= n_kv);

@@ -244,7 +207,7 @@ index 3dcad65b..60e67b03 100644

// determine which KV cells to move where
//
@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
@@ -256,7 +219,7 @@ index 3dcad65b..60e67b03 100644

for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;

@@ -276,7 +239,7 @@ index 3dcad65b..60e67b03 100644
cont = false;
continue;
}
@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;

if (!cont) {
@@ -288,7 +251,7 @@ index 3dcad65b..60e67b03 100644
}

nf++;
@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
}
}
@@ -11,7 +11,7 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..4e67d243 100644
index 82ae1b5b..1487f322 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {
@@ -9,8 +9,8 @@ such as vocab fields
---
ggml/include/gguf.h | 1 +
ggml/src/gguf.cpp | 7 +++++--
src/llama-vocab.cpp | 4 +---
src/llama-vocab.cpp | 2 +-
3 files changed, 7 insertions(+), 5 deletions(-)
3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 79ee2020..3efb22f0 100644
@@ -53,15 +53,13 @@ index 381a9c7d..e45b453d 100644
}

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 10f34d33..9f5fd57b 100644
index 10f34d33..b098bb25 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
- GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
-
- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a30e67f2..2462d2b8 100644
index 835e6495..3902894b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index a30e67f2..2462d2b8 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

ggml_compute_forward(&params, node);
3 llama/sampling_ext.cpp vendored
@@ -114,9 +114,6 @@ void grammar_free(struct llama_grammar *g) {
if (g->vocab != nullptr) {
delete g->vocab;
}
- if (g->o_vocab != nullptr) {
- delete g->o_vocab;
- }
llama_grammar_free_impl(g);
}
}
@@ -1,12 +1,9 @@
package llm

import (
-"cmp"
"fmt"
"log/slog"
-"maps"
"os"
-"slices"
"strconv"
"strings"

@@ -111,8 +108,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

for _, projector := range projectors {
-weight := projectorMemoryRequirements(projector)
+weight, graph := projectorMemoryRequirements(projector)
projectorWeights += weight
+projectorGraph += graph

// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
@@ -122,10 +120,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}

layers := f.Tensors().GroupLayers()
-// add one layer (chosing the max layer) worth of memory as a buffer
+// add one layer worth of memory as a buffer
-layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
+if blk0, ok := layers["blk.0"]; ok {
-return cmp.Compare(a.Size(), b.Size())
+layerSize = blk0.Size()
-}).Size()
+} else {
+slog.Warn("model missing blk.0 layer size")
+}

var kvct string
if envconfig.FlashAttention() &&
@@ -219,7 +219,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}

// For all the layers, find where they can fit on the GPU(s)
-for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
+for i := range int(f.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size()
@@ -229,7 +229,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin

if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU
-overflow += layerSize
continue
}

@@ -246,13 +245,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
}
}

-if len(gpusWithSpace) == 0 {
-overflow += layerSize
-}
}
if layerCount >= int(f.KV().BlockCount()) {
fullyLoaded = true
+} else {
+for i := layerCount; i < int(f.KV().BlockCount()); i++ {
+overflow += layerSize
+}
}

// Determine if we need to consider output then find where it fits
@@ -408,21 +407,51 @@ func (m MemoryEstimate) LogValue() slog.Value {
return slog.GroupValue(attrs...)
}

-func projectorMemoryRequirements(filename string) (weights uint64) {
+func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
file, err := os.Open(filename)
if err != nil {
-return 0
+return 0, 0
}
defer file.Close()

ggml, _, err := ggml.Decode(file, 1024)
if err != nil {
-return 0
+return 0, 0
}

for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size()
}

-return weights
+switch arch := ggml.KV().Architecture(); arch {
+case "mllama":
+kv := func(n string) uint64 {
+if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
+return uint64(v)
+}

+return 0
+}

+imageSize := kv("image_size")

+maxNumTiles := kv("max_num_tiles")
+embeddingLength := kv("embedding_length")
+headCount := kv("attention.head_count")

+numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
+if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
+numPatches++
+}

+numPaddedPatches := numPatches + 8 - (numPatches%8)%8

+graphSize = 4 * (8 +
+imageSize*imageSize*kv("num_channels")*maxNumTiles +
+embeddingLength*numPatches*maxNumTiles +
+9*embeddingLength*numPaddedPatches*maxNumTiles +
+numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
+}

+return weights, graphSize
}
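The mllama branch above is essentially a closed-form size estimate for the vision projector's compute graph. A minimal standalone sketch of the same arithmetic, with plain integer inputs instead of GGUF metadata lookups; the example values in main are assumptions, not taken from this branch:

```go
package main

import "fmt"

// mllamaVisionGraphSize mirrors the graph-size arithmetic in the hunk above.
// All parameters are passed in directly rather than read from the GGUF KV store.
func mllamaVisionGraphSize(imageSize, patchSize, numChannels, maxNumTiles, embeddingLength, headCount uint64, hasClassEmbd bool) uint64 {
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if hasClassEmbd {
		numPatches++
	}
	// padding expression copied as written in the patch
	numPaddedPatches := numPatches + 8 - (numPatches%8)%8

	return 4 * (8 +
		imageSize*imageSize*numChannels*maxNumTiles +
		embeddingLength*numPatches*maxNumTiles +
		9*embeddingLength*numPaddedPatches*maxNumTiles +
		numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
}

func main() {
	// hypothetical values: 560x560 image, 14x14 patches, 3 channels, 4 tiles, 1280-dim embeddings, 16 heads
	fmt.Println(mllamaVisionGraphSize(560, 14, 3, 4, 1280, 16, true))
}
```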
|
@ -311,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
|||||||
params = append(params, "--mmproj", projectors[0])
|
params = append(params, "--mmproj", projectors[0])
|
||||||
}
|
}
|
||||||
|
|
||||||
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
|
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
|
||||||
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
|
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
|
||||||
// without any LD_LIBRARY_PATH flags
|
// without any LD_LIBRARY_PATH flags
|
||||||
for {
|
for {
|
||||||
@@ -679,8 +679,9 @@ ws ::= ([ \t\n] ws)?
const maxBufferSize = 512 * format.KiloByte

type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
+AspectRatioID int `json:"aspect_ratio_id"`
}

type CompletionRequest struct {
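The new AspectRatioID field carries the tile-layout index chosen during image preprocessing. A hypothetical helper, not part of this change, showing how a caller in the same package might fill it in (the aspect-ratio value is assumed to come from the mllama preprocessor):

```go
// newImageData is illustrative only: it wraps raw image bytes plus a
// preprocessor-supplied aspect-ratio index into an ImageData value.
func newImageData(id int, data []byte, aspectRatioIndex int) ImageData {
	return ImageData{
		ID:            id,
		Data:          data,
		AspectRatioID: aspectRatioIndex,
	}
}
```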
@@ -119,18 +119,22 @@ type Context interface {
Layer(int) Context
}

-// RopeOptions contains optional parameters for RoPE function
+// RopeOpts contains optional parameters for RoPE function
-type RopeOptions struct {
+type RopeOpts struct {
-OriginalContextLen uint32
+DefaultContextLen uint32
+YarnExtFactor float32
+YarnAttnFactor float32
+YarnBetaFast float32
+YarnBetaSlow float32
}

// RopeOption defines a function that modifies RopeOpts
-type RopeOption func(*RopeOptions)
+type RopeOption func(*RopeOpts)

// WithContextLen sets a custom context length
func WithContextLen(len uint32) RopeOption {
-return func(opts *RopeOptions) {
+return func(opts *RopeOpts) {
-opts.OriginalContextLen = len
+opts.DefaultContextLen = len
}
}

@@ -160,6 +164,7 @@ type Tensor interface {
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
+RoPEMulti(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, sections [4]int32, ropeType uint32, base, scale float32) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

Sin(ctx Context) Tensor
@@ -176,6 +181,7 @@ type Tensor interface {
Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor

Pad(ctx Context, shape ...int) Tensor
+Unpad(ctx Context, shape ...int) Tensor

Stack(ctx Context, dim int, s ...Tensor) Tensor
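RoPE keeps its positional arguments but now takes variadic RopeOption values, so per-model overrides stay out of the common signature. A rough usage sketch against the interface above; the base, scale, and context-length numbers are placeholders, not defaults taken from this branch:

```go
package example

import "github.com/ollama/ollama/ml"

// applyRoPE is a sketch only: it rotates q with an overridden context length
// via the functional-options pattern; omit the option to keep the defaults.
func applyRoPE(ctx ml.Context, q, positions, ropeFactors ml.Tensor, dim, ropeType uint32) ml.Tensor {
	return q.RoPE(ctx, positions, ropeFactors, dim, ropeType, 10000.0, 1.0,
		ml.WithContextLen(32768), // hypothetical override of DefaultContextLen
	)
}
```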
@@ -1017,6 +1017,17 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
}
}

+func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
+if len(shape) != 4 {
+panic("expected 4 dimensions")
+}

+return &Tensor{
+b: t.b,
+t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
+}
+}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
switch len(shape) {
case 1:
@@ -1062,8 +1073,12 @@ const (

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
// Default options
-opts := &ml.RopeOptions{
+opts := &ml.RopeOpts{
-OriginalContextLen: 131072,
+DefaultContextLen: 131072,
+YarnExtFactor: 0.0,
+YarnAttnFactor: 1.0,
+YarnBetaFast: 32.0,
+YarnBetaSlow: 1.0,
}

// Apply any provided options
@@ -1089,13 +1104,44 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
ropeFactors.(*Tensor).t,
C.int(ropeDim),
C.int(ropeType),
-C.int(opts.OriginalContextLen),
+C.int(128000),
C.float(ropeBase),
C.float(ropeScale),
-C.float(0.0),
+C.float(opts.YarnExtFactor),
-C.float(1.0),
+C.float(opts.YarnAttnFactor),
-C.float(32.0),
+C.float(opts.YarnBetaFast),
-C.float(1.0),
+C.float(opts.YarnBetaSlow),
+),
+}
+}

+func (t *Tensor) RoPEMulti(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, sections [4]int32, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
+if ropeFactors == nil {
+ropeFactors = &Tensor{b: t.b}
+}

+dequant := t.t
+if C.ggml_is_quantized(t.t._type) {
+dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
+}

+return &Tensor{
+b: t.b,
+t: C.ggml_rope_multi(
+ctx.(*Context).ctx,
+dequant,
+positionIDs.(*Tensor).t,
+ropeFactors.(*Tensor).t,
+C.int(ropeDim),
+(*C.int)(&sections[0]),
+C.int(ropeType),
+C.int(128000), // Default context length
+C.float(ropeBase),
+C.float(ropeScale),
+C.float(0.0), // ext_factor
+C.float(1.0), // attn_factor
+C.float(32.0), // beta_fast
+C.float(1.0), // beta_slow
),
}
}
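RoPEMulti wraps ggml_rope_multi, which applies rotary embeddings per section of the position encoding (for example time/height/width splits in multimodal models). A hedged call-site sketch; the section split and the dim/rope-type constants below are assumptions, not values from this branch:

```go
package example

import "github.com/ollama/ollama/ml"

// applyMRoPE is illustrative only: the [4]int32 section split and the numeric
// constants are placeholders. Passing nil ropeFactors is handled by the backend.
func applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
	sections := [4]int32{16, 24, 24, 0}
	return states.RoPEMulti(ctx, positions, nil, 128, sections, 8, 10000.0, 1.0)
}
```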
10 ml/backend/ggml/ggml/include/ggml.h vendored
@@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1781,6 +1782,15 @@ extern "C" {
int p0,
int p1);

+// unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+GGML_API struct ggml_tensor * ggml_unpad(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int p0,
+int p1,
+int p2,
+int p3);

// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
@@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
-#ifdef GGML_USE_BLAS
+// #ifdef GGML_USE_BLAS
-register_backend(ggml_backend_blas_reg());
+// register_backend(ggml_backend_blas_reg());
-#endif
+// #endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
5 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c vendored
@@ -1953,6 +1953,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+case GGML_OP_UNPAD:
+{
+ggml_compute_forward_unpad(params, tensor);
+} break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2276,6 +2280,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
55 ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp vendored
@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}

+// ggml_compute_forward_unpad

+static void ggml_compute_forward_unpad_f32(
+const struct ggml_compute_params *params,
+struct ggml_tensor *dst) {

+const struct ggml_tensor * src0 = dst->src[0];

+GGML_ASSERT(src0->nb[0] == sizeof(float));
+GGML_ASSERT( dst->nb[0] == sizeof(float));

+const int ith = params->ith;
+const int nth = params->nth;

+GGML_TENSOR_UNARY_OP_LOCALS

+float * dst_ptr = (float *) dst->data;

+// TODO: optimize

+for (int64_t i2 = 0; i2 < ne2; ++i2) {
+for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+for (int64_t i0 = 0; i0 < ne0; ++i0) {
+for (int64_t i3 = 0; i3 < ne3; ++i3) {
+const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;

+const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);

+if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+dst_ptr[dst_idx] = *src_ptr;
+}
+}
+}
+}
+}
+}

+void ggml_compute_forward_unpad(
+const struct ggml_compute_params * params,
+struct ggml_tensor * dst) {

+const struct ggml_tensor * src0 = dst->src[0];

+switch (src0->type) {
+case GGML_TYPE_F32:
+{
+ggml_compute_forward_unpad_f32(params, dst);
+} break;
+default:
+{
+GGML_ABORT("fatal error");
+}
+}
+}

// ggml_compute_forward_arange

static void ggml_compute_forward_arange_f32(
1 ml/backend/ggml/ggml/src/ggml-cpu/ops.h vendored
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
+case GGML_OP_UNPAD:
+ggml_cuda_op_unpad(ctx, dst);
+break;
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
+case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
46 ml/backend/ggml/ggml/src/ggml-cuda/pad.cu vendored
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
}

+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+// blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+// blockIdx.y: idx of ne1
+// blockIDx.x: idx of ne0 / BLOCK_SIZE
+int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+if (nidx >= ne0) {
+return;
+}

+// operation
+int offset_dst =
+nidx +
+blockIdx.y * ne0 +
+blockIdx.z * ne0 * gridDim.y;
+if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+int offset_src =
+nidx +
+blockIdx.y * ne00 +
+blockIdx.z * ne00 * ne01;
+dst[offset_dst] = x[offset_src];
+}
+}

+static void unpad_f32_cuda(const float * x, float * dst,
+const int ne00, const int ne01, const int ne02, const int ne03,
+const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+dim3 gridDim(num_blocks, ne1, ne2*ne3);
+unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}

+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+const ggml_tensor * src0 = dst->src[0];
+const float * src0_d = (const float *)src0->data;
+float * dst_d = (float *)dst->data;
+cudaStream_t stream = ctx.stream();

+GGML_ASSERT(src0->type == GGML_TYPE_F32);
+GGML_ASSERT(dst->type == GGML_TYPE_F32);
+GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

+unpad_f32_cuda(src0_d, dst_d,
+src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
1 ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh vendored
@@ -3,3 +3,4 @@
#define CUDA_PAD_BLOCK_SIZE 256

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
@ -5599,6 +5599,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel void kernel_unpad_f32(
|
||||||
|
device const char * src0,
|
||||||
|
device char * dst,
|
||||||
|
constant int64_t & ne00,
|
||||||
|
constant int64_t & ne01,
|
||||||
|
constant int64_t & ne02,
|
||||||
|
constant int64_t & ne03,
|
||||||
|
constant uint64_t & nb00,
|
||||||
|
constant uint64_t & nb01,
|
||||||
|
constant uint64_t & nb02,
|
||||||
|
constant uint64_t & nb03,
|
||||||
|
constant int64_t & ne0,
|
||||||
|
constant int64_t & ne1,
|
||||||
|
constant int64_t & ne2,
|
||||||
|
constant int64_t & ne3,
|
||||||
|
constant uint64_t & nb0,
|
||||||
|
constant uint64_t & nb1,
|
||||||
|
constant uint64_t & nb2,
|
||||||
|
constant uint64_t & nb3,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
|
||||||
|
const int64_t i3 = tgpig.z;
|
||||||
|
const int64_t i2 = tgpig.y;
|
||||||
|
const int64_t i1 = tgpig.x;
|
||||||
|
|
||||||
|
const int64_t i03 = i3;
|
||||||
|
const int64_t i02 = i2;
|
||||||
|
const int64_t i01 = i1;
|
||||||
|
|
||||||
|
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||||
|
|
||||||
|
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||||
|
if (i0 < ne00) {
|
||||||
|
dst_ptr[i0] = src0_ptr[i0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kernel void kernel_arange_f32(
|
kernel void kernel_arange_f32(
|
||||||
device char * dst,
|
device char * dst,
|
||||||
constant ggml_metal_kargs_arange & args,
|
constant ggml_metal_kargs_arange & args,
|
||||||
|
33
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
vendored
33
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
vendored
@ -347,6 +347,7 @@ enum ggml_metal_kernel_type {
|
|||||||
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
|
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_UNPAD_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||||
@ -1294,6 +1295,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
|||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||||
@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
|||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
case GGML_OP_PAD:
|
case GGML_OP_PAD:
|
||||||
case GGML_OP_PAD_REFLECT_1D:
|
case GGML_OP_PAD_REFLECT_1D:
|
||||||
|
case GGML_OP_UNPAD:
|
||||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
case GGML_OP_ARGSORT:
|
case GGML_OP_ARGSORT:
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
|
|||||||
|
|
||||||
const int nth = MIN(1024, ne0);
|
const int nth = MIN(1024, ne0);
|
||||||
|
|
||||||
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
|
} break;
|
||||||
|
case GGML_OP_UNPAD:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
|
||||||
|
|
||||||
|
[encoder setComputePipelineState:pipeline];
|
||||||
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||||
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||||
|
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||||
|
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||||
|
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||||
|
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||||
|
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||||
|
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||||
|
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||||
|
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||||
|
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||||
|
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||||
|
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||||
|
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||||
|
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||||
|
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||||
|
|
||||||
|
const int nth = MIN(1024, ne0);
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_ARANGE:
|
case GGML_OP_ARANGE:
|
||||||
|
@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel void kernel_unpad_f32(
|
||||||
|
device const char * src0,
|
||||||
|
device char * dst,
|
||||||
|
constant int64_t & ne00,
|
||||||
|
constant int64_t & ne01,
|
||||||
|
constant int64_t & ne02,
|
||||||
|
constant int64_t & ne03,
|
||||||
|
constant uint64_t & nb00,
|
||||||
|
constant uint64_t & nb01,
|
||||||
|
constant uint64_t & nb02,
|
||||||
|
constant uint64_t & nb03,
|
||||||
|
constant int64_t & ne0,
|
||||||
|
constant int64_t & ne1,
|
||||||
|
constant int64_t & ne2,
|
||||||
|
constant int64_t & ne3,
|
||||||
|
constant uint64_t & nb0,
|
||||||
|
constant uint64_t & nb1,
|
||||||
|
constant uint64_t & nb2,
|
||||||
|
constant uint64_t & nb3,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
|
||||||
|
const int64_t i3 = tgpig.z;
|
||||||
|
const int64_t i2 = tgpig.y;
|
||||||
|
const int64_t i1 = tgpig.x;
|
||||||
|
|
||||||
|
const int64_t i03 = i3;
|
||||||
|
const int64_t i02 = i2;
|
||||||
|
const int64_t i01 = i1;
|
||||||
|
|
||||||
|
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||||
|
|
||||||
|
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||||
|
if (i0 < ne00) {
|
||||||
|
dst_ptr[i0] = src0_ptr[i0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kernel void kernel_arange_f32(
|
kernel void kernel_arange_f32(
|
||||||
device char * dst,
|
device char * dst,
|
||||||
constant ggml_metal_kargs_arange & args,
|
constant ggml_metal_kargs_arange & args,
|
||||||
|
25 ml/backend/ggml/ggml/src/ggml.c vendored
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
+"UNPAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
+"unpad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}

+// ggml_unpad

+struct ggml_tensor * ggml_unpad(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int p0, int p1, int p2, int p3) {

+struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+a->ne[0] - p0,
+a->ne[1] - p1,
+a->ne[2] - p2,
+a->ne[3] - p3);

+result->op = GGML_OP_UNPAD;
+result->src[0] = a;

+return result;
+}

// ggml_arange

struct ggml_tensor * ggml_arange(
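As the ggml_unpad constructor above shows, the op trims p0..p3 elements from each dimension, making it the inverse of a matching pad; the vision-model hunk further down swaps a negative Pad for an explicit Unpad. A minimal sketch of that symmetry through the Go ml.Tensor interface from this change set (context and tensor setup are assumed):

```go
package example

import "github.com/ollama/ollama/ml"

// padRoundTrip is a sketch only: it grows dimension 1 by one position and then
// trims it back off, returning a tensor with the original shape.
func padRoundTrip(ctx ml.Context, t ml.Tensor) ml.Tensor {
	padded := t.Pad(ctx, 0, 1, 0, 0)     // ne[1] += 1
	return padded.Unpad(ctx, 0, 1, 0, 0) // ne[1] -= 1
}
```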
@@ -45,8 +45,6 @@ func New(c fs.Config) (model.Model, error) {
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
-// TODO: set EOT to EOS otherwise 0 will stop generation
-EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
},
),
Layers: make([]Layer, c.Uint("block_count")),
@@ -7,6 +7,7 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
+"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)

@@ -19,6 +20,9 @@ type TextConfig struct {
}

type TextModel struct {
+model.Base
+model.SentencePieceModel

TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []TextLayer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
@@ -41,6 +45,15 @@ func newTextModel(c fs.Config) *TextModel {
numBlocks := int(c.Uint("block_count"))

m := TextModel{
+SentencePieceModel: model.NewSentencePieceModel(
+&model.Vocabulary{
+Values: c.Strings("tokenizer.ggml.tokens"),
+Scores: c.Floats("tokenizer.ggml.scores"),
+Types: c.Ints("tokenizer.ggml.token_type"),
+BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
+EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
+},
+),
Layers: make([]TextLayer, numBlocks),
TextConfig: &TextConfig{
hiddenSize: int(c.Uint("embedding_length")),
@@ -47,9 +47,6 @@ func New(c fs.Config) (model.Model, error) {
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-// TODO: set EOT to EOS otherwise 0 will stop generation
-EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
-AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
Layers: make([]Layer, c.Uint("block_count")),
@@ -45,9 +45,6 @@ func New(c fs.Config) (model.Model, error) {
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-// TODO: set EOT to EOS otherwise 0 will stop generation
-EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
-AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
ImageProcessor: newImageProcessor(c),
@@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
}

hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps)
-hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0)
+hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0)
hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions)
return hiddenStates
}
|
@ -16,8 +16,6 @@ import (
|
|||||||
|
|
||||||
type Model struct {
|
type Model struct {
|
||||||
model.Base
|
model.Base
|
||||||
model.BytePairEncoding
|
|
||||||
|
|
||||||
*TextModel
|
*TextModel
|
||||||
*VisionModel `gguf:"v,vision"`
|
*VisionModel `gguf:"v,vision"`
|
||||||
*MultiModalProjector `gguf:"mm"`
|
*MultiModalProjector `gguf:"mm"`
|
||||||
@ -42,21 +40,6 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
VisionModel: newVisionModel(c),
|
VisionModel: newVisionModel(c),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
MultiModalProjector: newMultiModalProjector(c),
|
MultiModalProjector: newMultiModalProjector(c),
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
|
||||||
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
|
||||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
|
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
|
||||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
|
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
|
||||||
// TODO: set EOT to EOS otherwise 0 will stop generation
|
|
||||||
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
|
||||||
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
|
||||||
},
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
||||||
|
@ -21,6 +21,7 @@ type TextOptions struct {
|
|||||||
|
|
||||||
type TextModel struct {
|
type TextModel struct {
|
||||||
model.Base
|
model.Base
|
||||||
|
model.BytePairEncoding
|
||||||
|
|
||||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||||
Layers []Layer `gguf:"blk"`
|
Layers []Layer `gguf:"blk"`
|
||||||
@ -147,6 +148,18 @@ func NewTextModel(c fs.Config) (*TextModel, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
textModel := &TextModel{
|
textModel := &TextModel{
|
||||||
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
|
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||||
|
&model.Vocabulary{
|
||||||
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
|
||||||
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
|
||||||
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
|
},
|
||||||
|
),
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
TextOptions: &TextOptions{
|
TextOptions: &TextOptions{
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
|
201
model/models/mllama/imageproc.go
Normal file
201
model/models/mllama/imageproc.go
Normal file
@ -0,0 +1,201 @@
|
|||||||
|
package mllama
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"image"
|
||||||
|
_ "image/jpeg"
|
||||||
|
_ "image/png"
|
||||||
|
"io"
|
||||||
|
"math"
|
||||||
|
"slices"
|
||||||
|
|
||||||
|
"golang.org/x/image/draw"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/model/imageproc"
|
||||||
|
)
|
||||||
|
|
||||||
|
func getSupportedAspectRatios(maxTiles int) []image.Point {
|
||||||
|
ratios := []image.Point{}
|
||||||
|
|
||||||
|
for w := range maxTiles {
|
||||||
|
for h := range maxTiles {
|
||||||
|
if (w+1)*(h+1) <= maxTiles {
|
||||||
|
ratios = append(ratios, image.Point{w + 1, h + 1})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ratios
|
||||||
|
}
|
||||||
|
|
||||||
|
func clip(a, a_min, a_max int) int {
|
||||||
|
if a < a_min {
|
||||||
|
return a_min
|
||||||
|
} else if a > a_max {
|
||||||
|
return a_max
|
||||||
|
}
|
||||||
|
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
|
||||||
|
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
||||||
|
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
|
||||||
|
possibleCanvasSizes := []image.Point{}
|
||||||
|
for _, pta := range possibleTileArrangements {
|
||||||
|
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
||||||
|
}
|
||||||
|
|
||||||
|
scales := []float64{}
|
||||||
|
|
||||||
|
for _, pcs := range possibleCanvasSizes {
|
||||||
|
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
|
||||||
|
scaleWidth := float64(pcs.X) / float64(imageSize.X)
|
||||||
|
|
||||||
|
if scaleWidth > scaleHeight {
|
||||||
|
scales = append(scales, scaleHeight)
|
||||||
|
} else {
|
||||||
|
scales = append(scales, scaleWidth)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var minUpscale float64
|
||||||
|
var maxDownscale float64
|
||||||
|
var upscale bool
|
||||||
|
|
||||||
|
for _, s := range scales {
|
||||||
|
if s > 1.0 {
|
||||||
|
upscale = true
|
||||||
|
if minUpscale == 0 {
|
||||||
|
minUpscale = s
|
||||||
|
} else {
|
||||||
|
minUpscale = math.Min(minUpscale, s)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
maxDownscale = math.Max(maxDownscale, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedScale := maxDownscale
|
||||||
|
if upscale {
|
||||||
|
selectedScale = minUpscale
|
||||||
|
}
|
||||||
|
|
||||||
|
var selectedCanvas image.Point
|
||||||
|
for n, pcs := range possibleCanvasSizes {
|
||||||
|
if scales[n] == selectedScale {
|
||||||
|
// choose the smallest possible canvas
|
||||||
|
			if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
				selectedCanvas = pcs
			} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
				selectedCanvas = pcs
			}
		}
	}
	return selectedCanvas
}

func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
	targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)

	scaleWidth := float64(targetWidth) / float64(imageSize.X)
	scaleHeight := float64(targetHeight) / float64(imageSize.Y)

	var w, h int

	if scaleWidth < scaleHeight {
		w = targetWidth
		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
	} else {
		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
		h = targetHeight
	}

	return image.Point{w, h}
}

func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
	if format == "png" {
		img = imageproc.Composite(img)
	}

	b := img.Bounds()
	tileSize := outputSize.Y

	canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
	newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)

	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
}

func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
	draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)

	return dst
}

func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	b := img.Bounds()
	width := b.Max.X - b.Min.X
	height := b.Max.Y - b.Min.Y
	tileHeight := height / numTilesSize.Y
	tileWidth := width / numTilesSize.X

	images := []image.Image{}

	for h := range numTilesSize.Y {
		for w := range numTilesSize.X {
			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
			images = append(images, img.(interface {
				SubImage(image.Rectangle) image.Image
			}).SubImage(rect))
		}
	}

	return images
}

func packImages(img image.Image, aspectRatio image.Point) []float32 {
	subImages := splitToTiles(img, aspectRatio)

	var pixelVals []float32

	rescale := true
	channelFirst := true

	for _, subImg := range subImages {
		vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
		pixelVals = append(pixelVals, vals...)
	}

	return pixelVals
}

func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
	outputSize := image.Point{560, 560}
	maxTiles := 4

	img, format, err := image.Decode(imageData)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
	}

	newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
	newImage = padImage(newImage, outputSize, aspectRatio)

	data := packImages(newImage, aspectRatio)
	aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1

	opts := map[string]any{
		"aspectRatioIndex": aspectRatioIndex,
	}

	return data, opts, nil
}
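As a reading aid (not part of the change set): a minimal sketch of how the new Preprocess entry point could be driven from another package. The import path is assumed from the model/models/mllama layout shown in this diff, and "input.png" is a hypothetical file name.

package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/model/models/mllama"
)

func main() {
	// Preprocess only needs an io.Reader of encoded image bytes.
	f, err := os.Open("input.png")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	pixels, opts, err := mllama.Preprocess(f)
	if err != nil {
		panic(err)
	}

	// pixels holds channel-first float32 values for each 560x560 tile;
	// opts carries the 1-based aspect ratio index used by the vision encoder.
	fmt.Println(len(pixels), opts["aspectRatioIndex"])
}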
420 model/models/mllama/imageproc_test.go Normal file
@@ -0,0 +1,420 @@
package mllama

import (
	"bytes"
	"image"
	"image/png"
	"testing"

	"github.com/google/go-cmp/cmp"
)

func TestAspectRatios(t *testing.T) {
	type aspectCase struct {
		MaxTiles int
		Expected []image.Point
	}

	cases := []aspectCase{
		{
			MaxTiles: 1,
			Expected: []image.Point{{1, 1}},
		},
		{
			MaxTiles: 2,
			Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
		},
		{
			MaxTiles: 3,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
		},
		{
			MaxTiles: 4,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
		},
	}

	for _, c := range cases {
		actual := getSupportedAspectRatios(c.MaxTiles)

		if diff := cmp.Diff(actual, c.Expected); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestGetImageSizeFitToCanvas(t *testing.T) {
	type imageSizeCase struct {
		ImageRect  image.Point
		CanvasRect image.Point
		TileSize   int
		Expected   image.Point
	}

	cases := []imageSizeCase{
		{
			ImageRect:  image.Point{400, 400},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{400, 400},
		},
		{
			ImageRect:  image.Point{1024, 768},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{640, 480},
		},
		{
			ImageRect:  image.Point{500, 500},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   750,
			Expected:   image.Point{750, 750},
		},
		{
			ImageRect:  image.Point{500, 1000},
			CanvasRect: image.Point{2000, 2000},
			TileSize:   2000,
			Expected:   image.Point{1000, 2000},
		},
		{
			ImageRect:  image.Point{4000, 3000},
			CanvasRect: image.Point{2000, 1000},
			TileSize:   1000,
			Expected:   image.Point{1333, 1000},
		},
		{
			ImageRect:  image.Point{667, 1000},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   560,
			Expected:   image.Point{667, 1000},
		},
	}

	for _, c := range cases {
		actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}
func TestGetOptimalTiledCanvas(t *testing.T) {
	type tiledCanvasSizeCase struct {
		ImageSize     image.Point
		MaxImageTiles int
		TileSize      int
		Expected      image.Point
	}

	cases := []tiledCanvasSizeCase{
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      1000,
			Expected:      image.Point{2000, 1000},
		},
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{800, 600},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{640, 480},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 560},
		},
		{
			ImageSize:     image.Point{320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{1320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1680, 560},
		},
		{
			ImageSize:     image.Point{2000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			ImageSize:     image.Point{10000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			ImageSize:     image.Point{480, 640},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1120},
		},
		{
			ImageSize:     image.Point{200, 320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{200, 1320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1680},
		},
		{
			ImageSize:     image.Point{200, 2000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			ImageSize:     image.Point{200, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			ImageSize:     image.Point{10000, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
	}

	for _, c := range cases {
		actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}

func TestSplitToTiles(t *testing.T) {
	type splitCase struct {
		TestImage    image.Image
		NumTilesSize image.Point
		Expected     []image.Image
	}

	cases := []splitCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			NumTilesSize: image.Point{1, 1},
			Expected:     []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 500)),
			NumTilesSize: image.Point{2, 1},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
			NumTilesSize: image.Point{2, 2},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	for _, c := range cases {
		actual := splitToTiles(c.TestImage, c.NumTilesSize)

		if len(actual) != len(c.Expected) {
			t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
		}

		for i := range actual {
			if actual[i].Bounds() != c.Expected[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
			}
		}
	}
}
func TestResize(t *testing.T) {
	type resizeCase struct {
		TestImage           image.Image
		OutputSize          image.Point
		MaxImageTiles       int
		ExpectedImage       image.Image
		ExpectedAspectRatio image.Point
	}

	cases := []resizeCase{
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       1,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       2,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 10, 10)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
	}

	for _, c := range cases {
		actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)

		if actualImage.Bounds() != c.ExpectedImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
		}

		if actualAspectRatio != c.ExpectedAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
		}
	}
}

func TestPad(t *testing.T) {
	type padCase struct {
		TestImage   image.Image
		OutputSize  image.Point
		AspectRatio image.Point
		Expected    image.Image
	}

	cases := []padCase{
		{
			TestImage:   image.NewRGBA(image.Rect(0, 0, 1000, 667)),
			OutputSize:  image.Point{560, 560},
			AspectRatio: image.Point{2, 2},
			Expected:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
		},
	}

	for _, c := range cases {
		actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)

		if actual.Bounds() != c.Expected.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
		}
	}
}

func TestPackImages(t *testing.T) {
	type packCase struct {
		TestImage    image.Image
		AspectRatio  image.Point
		ExpectedVals int
	}

	cases := []packCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
			AspectRatio:  image.Point{2, 2},
			ExpectedVals: 2 * 2 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 560, 560)),
			AspectRatio:  image.Point{1, 1},
			ExpectedVals: 1 * 1 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 560)),
			AspectRatio:  image.Point{1, 2},
			ExpectedVals: 1 * 2 * 3 * 560 * 560,
		},
	}

	for _, c := range cases {
		actualVals := packImages(c.TestImage, c.AspectRatio)
		if len(actualVals) != c.ExpectedVals {
			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
		}
	}
}

func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage             image.Image
		ExpectedVals          int
		ExpectedAspectRatioID int
	}

	cases := []preprocessCase{
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 1,
		},
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 6,
		},
	}

	for _, c := range cases {
		var buf bytes.Buffer
		err := png.Encode(&buf, c.TestImage)
		if err != nil {
			t.Fatal(err)
		}

		imgData, opts, err := Preprocess(&buf)
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		if len(imgData) == 0 {
			t.Errorf("no image data returned")
		}

		ar, ok := opts["aspectRatioIndex"]
		if !ok {
			t.Fatalf("no aspect ratio found")
		}

		aspectRatioID := ar.(int)

		if aspectRatioID != c.ExpectedAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
		}
	}
}
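For orientation, the 1-based aspect ratio ID exercised above is simply the position of the chosen tile layout in the supported-ratio list plus one. A small standalone sketch (it duplicates getSupportedAspectRatios from the diff purely for illustration) shows why a 1024x768 image, which lands on a 2x2 tile canvas, gets ID 6 in TestPreprocess:

package main

import (
	"fmt"
	"image"
	"slices"
)

// Same enumeration order as the diff: width grows in the outer loop,
// height in the inner one, keeping only layouts within the tile budget.
func getSupportedAspectRatios(maxTiles int) []image.Point {
	ratios := []image.Point{}
	for w := range maxTiles {
		for h := range maxTiles {
			if (w+1)*(h+1) <= maxTiles {
				ratios = append(ratios, image.Point{w + 1, h + 1})
			}
		}
	}
	return ratios
}

func main() {
	ratios := getSupportedAspectRatios(4)
	fmt.Println(ratios) // [(1,1) (1,2) (1,3) (1,4) (2,1) (2,2) (3,1) (4,1)]

	// The 2x2 layout sits at index 5, so its 1-based ID is 6, the value
	// TestPreprocess expects for a 1024x768 input.
	fmt.Println(slices.Index(ratios, image.Point{2, 2}) + 1) // 6
}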
@@ -2,7 +2,11 @@ package mllama

 import (
 	"bytes"
+	"encoding/binary"
+	"fmt"
+	"hash/fnv"
 	"image"
+	"slices"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -30,6 +34,10 @@ const (
 )

 func New(c fs.Config) (model.Model, error) {
+	// Verify unified config
+	if c.Uint("vision.block_count") == 0 {
+		return nil, fmt.Errorf("non-unified vision model not supported")
+	}
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@@ -41,9 +49,6 @@ func New(c fs.Config) (model.Model, error) {
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -68,19 +73,22 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	f32s, ratio, err := m.ImageProcessor.ProcessImage(image)
+	f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image)
 	if err != nil {
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.numChannels,
+		m.ImageProcessor.maxNumTiles,
+	)
 	if err != nil {
 		return nil, err
 	}

-	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
-
-	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
+	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1)
 	if err != nil {
 		return nil, err
 	}
@@ -91,19 +99,41 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
+	var images []input.Input
+	fnvHash := fnv.New64a()
+
 	for i := range inputs {
-		if inputs[i].Multimodal != nil {
-			inputs[i].Token = 128256 // <|image|>
+		if inputs[i].Multimodal == nil {
+			if len(images) > 0 {
+				inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)}
+				inputs[i].MultimodalHash = images[0].MultimodalHash
+				for j := 1; j < len(images); j++ {
+					inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor))
+					fnvHash.Reset()
+					binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash)
+					binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash)
+					inputs[i].MultimodalHash = fnvHash.Sum64()
+				}
+				images = nil
+			}
+		} else {
+			images = append(images, inputs[i])
+			inputs[i].Token = -1
 		}
 	}

+	inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 })
+
 	return inputs, nil
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	var crossAttentionStates ml.Tensor
 	if len(batch.Multimodal) > 0 {
-		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
+		images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor)
+		if len(images) > 0 {
+			crossAttentionStates = images[len(images)-1]
+		}
 	}

 	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
@@ -117,7 +147,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	}

 	// TODO: attention mask, cross attention mask
-	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
+	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
 }

 func init() {
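A side note on the PostTokenize change above: a token that follows a run of images now carries a hash that folds together the per-image MultimodalHash values with FNV-1a. A standalone sketch of that folding step (the helper name is hypothetical, not code from the diff):

package main

import (
	"encoding/binary"
	"fmt"
	"hash/fnv"
)

// combineHashes chains two 64-bit hashes the same way the PostTokenize
// loop does: reset the FNV-1a state, write both values, take the sum.
func combineHashes(a, b uint64) uint64 {
	h := fnv.New64a()
	binary.Write(h, binary.NativeEndian, a)
	binary.Write(h, binary.NativeEndian, b)
	return h.Sum64()
}

func main() {
	fmt.Printf("%#x\n", combineHashes(0xdeadbeef, 0xfeedface))
}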
@@ -18,7 +18,7 @@ type TextSelfAttention struct {
 	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
 }

-func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
 	ropeType := uint32(0)
@@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct {
 	MLP *TextMLP
 }

-func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
+	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)

 	// In the final layer (outputs != nil), optimize by pruning to just the token positions
 	// we need logits for.
@@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct {
 	MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
 }

-func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
 }

 type TextDecoderLayer interface {
-	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
+	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
 }

 type TextDecoder struct {
 	Layers []TextDecoderLayer
 }

-func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	for i, layer := range d.Layers {
 		layerType := selfAttentionLayer
 		if slices.Contains(opts.crossAttentionLayers, int32(i)) {
@@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
 			lastLayerOutputs = outputs
 		}

-		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts)
+		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts)
 	}
 }
@@ -214,9 +214,9 @@ type TextModel struct {
 	*TextModelOptions
 }

-func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
+func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
-	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
+	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
 	return m.Output.Forward(ctx, hiddenState)
 }
@@ -15,7 +15,7 @@ type VisionSelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_output"`
+	Output *nn.Linear `gguf:"attn_out"`

 	Gate ml.Tensor `gguf:"attn_gate"`
 }
@@ -45,29 +45,36 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)

 	hiddenState = sa.Output.Forward(ctx, attention)
+	if sa.Gate != nil {
+		hiddenState = hiddenState.Mul(ctx, sa.Gate)
+	}
+
 	return hiddenState
 }

 type VisionMLP struct {
-	Up   *nn.Linear `gguf:"ffn_up"`
 	Down *nn.Linear `gguf:"ffn_down"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+
+	Gate ml.Tensor `gguf:"ffn_gate"`
 }

 func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
-	hiddenState = mlp.Down.Forward(ctx, hiddenState)
+	hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
+	hiddenState = mlp.Up.Forward(ctx, hiddenState)
+	if mlp.Gate != nil {
+		hiddenState = hiddenState.Mul(ctx, mlp.Gate)
+	}
+
 	return hiddenState
 }

 type VisionEncoderLayer struct {
-	AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
+	AttentionNorm *nn.LayerNorm `gguf:"ln1"`
 	SelfAttention *VisionSelfAttention
-	AttentionGate ml.Tensor `gguf:"attn_gate"`

-	MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
+	MLPNorm *nn.LayerNorm `gguf:"ln2"`
 	MLP     *VisionMLP
-	MLPGate ml.Tensor `gguf:"ffn_gate"`
 }

 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@@ -76,22 +83,13 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
-
-	if e.AttentionGate != nil {
-		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
-	}
-
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState

 	// feed forward
 	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
-	hiddenState = hiddenState.Add(ctx, residual)
-	if e.MLPGate != nil {
-		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
-	}
-
-	return hiddenState
+	return hiddenState.Add(ctx, residual)
 }

 type VisionEncoder struct {
@@ -116,9 +114,9 @@ type PrecomputedAspectRatioEmbedding struct {
 	Gate ml.Tensor `gguf:"gate"`
 }

-func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
+func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
-	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
+	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
 	if e.Gate != nil {
 		embeddings = embeddings.Mul(ctx, e.Gate)
 	}
@@ -134,7 +132,7 @@ type PrecomputedPositionEmbedding struct {
 	TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"`
 }

-func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
+func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor {
 	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
 	if e.PositionEmbeddingGate != nil {
 		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
@@ -143,7 +141,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
 	hiddenState = hiddenState.Add(ctx, positionEmbedding)

 	tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
-	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
+	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
 	if e.TilePositionEmbeddingGate != nil {
 		tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
 	}
@@ -152,9 +150,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
 }

 type VisionModelOptions struct {
-	hiddenSize, numHeads int
+	hiddenSize, numHeads, numTiles int
 	imageSize, patchSize int

 	eps float32

 	intermediateLayersIndices []int32
 }
@@ -183,16 +181,14 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
 		numPositions++
 	}

-	numTiles := pixelValues.Dim(3)
-
 	hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
-	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
+	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
 	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

-	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
-	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)
+	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
+	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1)

-	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
+	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
 	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)

 	numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
@@ -203,18 +199,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
 	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
 	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)

 	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
-	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)
+	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
 	return hiddenState.Concat(ctx, hiddenStates, 0)
 }
@@ -226,6 +222,7 @@ func newVisionModel(c fs.Config) *VisionModel {
 		VisionModelOptions: &VisionModelOptions{
 			hiddenSize: int(c.Uint("vision.embedding_length")),
 			numHeads:   int(c.Uint("vision.attention.head_count")),
+			numTiles:   int(c.Uint("vision.max_num_tiles")),

 			imageSize: int(c.Uint("vision.image_size")),
 			patchSize: int(c.Uint("vision.patch_size")),
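Before the imageproc.go changes below, a small worked example of the clip-and-fit arithmetic that both versions of the resize path share (a standalone sketch, not code from the diff): for a 1024x768 image on the optimal 1120x1120 canvas with 560-pixel tiles, both scale factors come out as 1, so the image keeps its original size, matching the {1024, 768} expectation in TestResize.

package main

import (
	"fmt"
	"math"
)

// clip bounds v to [lo, hi], as in the image processor.
func clip(v, lo, hi int) int {
	if v < lo {
		return lo
	} else if v > hi {
		return hi
	}
	return v
}

// fit reproduces the fit-to-canvas arithmetic for a single case.
func fit(imageW, imageH, canvasW, canvasH, tileSize int) (int, int) {
	targetW := clip(imageW, tileSize, canvasW)
	targetH := clip(imageH, tileSize, canvasH)

	scaleW := float64(targetW) / float64(imageW)
	scaleH := float64(targetH) / float64(imageH)

	if scaleW < scaleH {
		return targetW, min(int(math.Floor(float64(imageH)*scaleW)), targetH)
	}
	return min(int(math.Floor(float64(imageW)*scaleH)), targetW), targetH
}

func main() {
	// Both scale factors are 1, so the size is unchanged.
	fmt.Println(fit(1024, 768, 1120, 1120, 560)) // 1024 768
}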
@@ -2,31 +2,17 @@ package mllama

 import (
 	"image"
+	"image/color"
 	"math"
 	"slices"

 	"golang.org/x/image/draw"

 	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/model/imageproc"
 )

-type supportedAspectRatio struct {
-	rank, width, height int
-}
-
-func (a supportedAspectRatio) Point() image.Point {
-	return image.Point{a.width, a.height}
-}
-
-func (a supportedAspectRatio) numTiles() int {
-	return a.width * a.height
-}
-
 type ImageProcessor struct {
 	imageSize, numChannels, maxNumTiles int
-
-	mean, std [3]float32
 }

 func newImageProcessor(c fs.Config) ImageProcessor {
@@ -34,49 +20,71 @@ func newImageProcessor(c fs.Config) ImageProcessor {
 		imageSize:   int(c.Uint("vision.image_size")),
 		numChannels: int(c.Uint("vision.num_channels")),
 		maxNumTiles: int(c.Uint("vision.max_num_tiles")),
-
-		mean: imageproc.ClipDefaultMean,
-		std:  imageproc.ClipDefaultSTD,
 	}
 }

-func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
-	for w := 1; w <= p.maxNumTiles; w++ {
-		for h := 1; h <= p.maxNumTiles/w; h++ {
-			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
+func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
+	ratios := []image.Point{}
+
+	for w := range maxTiles {
+		for h := range maxTiles {
+			if (w+1)*(h+1) <= maxTiles {
+				ratios = append(ratios, image.Point{w + 1, h + 1})
+			}
 		}
 	}
+
 	return ratios
 }

-func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
-	tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
-	th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
+func (p *ImageProcessor) clip(a, a_min, a_max int) int {
+	if a < a_min {
+		return a_min
+	} else if a > a_max {
+		return a_max
+	}
+
+	return a
+}

-	r := math.Min(
-		float64(tw)/float64(imageSize.X),
-		float64(th)/float64(imageSize.Y),
-	)
+func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
+	targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
+	targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
+
+	scaleWidth := float64(targetWidth) / float64(imageSize.X)
+	scaleHeight := float64(targetHeight) / float64(imageSize.Y)
+
+	var w, h int

-	w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
-	h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
+	if scaleWidth < scaleHeight {
+		w = targetWidth
+		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
+	} else {
+		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
+		h = targetHeight
+	}

 	return image.Point{w, h}
 }

-func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
-	possibleTileArrangements := p.supportedAspectRatios()
-	possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
-	for i, pta := range possibleTileArrangements {
-		possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
+func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
+	possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
+	possibleCanvasSizes := []image.Point{}
+	for _, pta := range possibleTileArrangements {
+		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
 	}

-	scales := make([]float64, len(possibleCanvasSizes))
-	for i, pcs := range possibleCanvasSizes {
-		scales[i] = min(
-			float64(pcs.Y)/float64(imageSize.Y),
-			float64(pcs.X)/float64(imageSize.X),
-		)
+	scales := []float64{}
+
+	for _, pcs := range possibleCanvasSizes {
+		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
+		scaleWidth := float64(pcs.X) / float64(imageSize.X)
+
+		if scaleWidth > scaleHeight {
+			scales = append(scales, scaleHeight)
+		} else {
+			scales = append(scales, scaleWidth)
+		}
 	}

 	var minUpscale float64
@@ -115,41 +123,47 @@ func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
 	return selectedCanvas
 }

-func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
+func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
 	b := img.Bounds()
 	width := b.Max.X - b.Min.X
 	height := b.Max.Y - b.Min.Y
 	tileHeight := height / numTilesSize.Y
 	tileWidth := width / numTilesSize.X

-	images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)
+	images := []image.Image{}

 	for h := range numTilesSize.Y {
 		for w := range numTilesSize.X {
 			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
-			if subImg, ok := img.(interface {
+			images = append(images, img.(interface {
 				SubImage(image.Rectangle) image.Image
-			}); ok {
-				images = append(images, subImg.SubImage(rect))
-			} else {
-				// Handle the case where img does not implement SubImage
-				// This is a fallback and may not be efficient
-				newImg := image.NewRGBA(rect)
-				draw.Draw(newImg, rect, img, rect.Min, draw.Src)
-				images = append(images, newImg)
-			}
+			}).SubImage(rect))
 		}
 	}

 	return images
 }

-func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
-	b := img.Bounds()
+// remove the "alpha" channel by drawing over a prefilled image
+//
+//nolint:unused
+func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
+	dst := image.NewRGBA(img.Bounds())

-	canvasSize := p.optimalTiledCanvas(b.Max)
-	aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
-	newSize := p.fitToCanvas(b.Max, canvasSize)
+	white := color.RGBA{255, 255, 255, 255}
+	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
+	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
+
+	return dst
+}
+
+func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
+	b := img.Bounds()
+	tileSize := outputSize.Y
+
+	canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
+	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
+	newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)

 	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
@@ -163,10 +177,10 @@ func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
 	return dst, aspectRatio
 }

-func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
+func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
 	paddedSize := image.Point{
-		X: p.imageSize * aspectRatio.X,
-		Y: p.imageSize * aspectRatio.Y,
+		X: outputSize.X * aspectRatio.X,
+		Y: outputSize.Y * aspectRatio.Y,
 	}

 	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
@@ -175,7 +189,7 @@ func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Imag
 	return dst
 }

-func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
+func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
 	subImages := p.splitToTiles(img, aspectRatio)

 	var pixelVals []float32
@@ -191,9 +205,9 @@ func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32
 			gVal := float32(g>>8) / 255.0
 			bVal := float32(b>>8) / 255.0

-			rVal = (rVal - p.mean[0]) / p.std[0]
-			gVal = (gVal - p.mean[1]) / p.std[1]
-			bVal = (bVal - p.mean[2]) / p.std[2]
+			rVal = (rVal - mean[0]) / std[0]
+			gVal = (gVal - mean[1]) / std[1]
+			bVal = (bVal - mean[2]) / std[2]

 			rVals = append(rVals, rVal)
 			gVals = append(gVals, gVal)
@@ -208,15 +222,17 @@ func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32
 	return pixelVals
 }

-func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
-	newImage, newImageRatio := p.resize(img)
-	newImage = p.pad(newImage, newImageRatio)
-	pixelValues := p.pack(newImage, newImageRatio)
+func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
+	outputSize := image.Point{p.imageSize, p.imageSize}

-	supportedAspectRatios := p.supportedAspectRatios()
-	aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
-		return i.width == newImageRatio.X && i.height == newImageRatio.Y
-	})
+	// clip values
+	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
+	std := [3]float32{0.26862954, 0.26130258, 0.27577711}

-	return pixelValues, supportedAspectRatios[aspectRatioID], nil
+	newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
+	newImage = p.pad(newImage, outputSize, aspectRatio)
+
+	data := p.pack(newImage, aspectRatio, mean, std)
+	aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
+	return data, aspectRatioIndex, nil
 }
@@ -1,387 +0,0 @@
package mllama

import (
	"image"
	"testing"

	"github.com/google/go-cmp/cmp"
)

func TestSupportedAspectRatios(t *testing.T) {
	cases := []struct {
		p    ImageProcessor
		want []supportedAspectRatio
	}{
		{
			p: ImageProcessor{maxNumTiles: 1},
			want: []supportedAspectRatio{
				{1, 1, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 2},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 2, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 3},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 1, 3},
				{4, 2, 1},
				{5, 3, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 4},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 1, 3},
				{4, 1, 4},
				{5, 2, 1},
				{6, 2, 2},
				{7, 3, 1},
				{8, 4, 1},
			},
		},
	}

	for _, tt := range cases {
		actual := tt.p.supportedAspectRatios()
		if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestFitToCanvas(t *testing.T) {
	cases := []struct {
		p      ImageProcessor
		image  image.Point
		canvas image.Point
		expect image.Point
	}{
		{
			p:      ImageProcessor{imageSize: 200},
			image:  image.Point{400, 400},
			canvas: image.Point{640, 480},
			expect: image.Point{400, 400},
		},
		{
			p:      ImageProcessor{imageSize: 200},
			image:  image.Point{1024, 768},
			canvas: image.Point{640, 480},
			expect: image.Point{640, 480},
		},
		{
			p:      ImageProcessor{imageSize: 750},
			image:  image.Point{500, 500},
			canvas: image.Point{1000, 1000},
			expect: image.Point{750, 750},
		},
		{
			p:      ImageProcessor{imageSize: 2000},
			image:  image.Point{500, 1000},
			canvas: image.Point{2000, 2000},
			expect: image.Point{1000, 2000},
		},
		{
			p:      ImageProcessor{imageSize: 1000},
			image:  image.Point{4000, 3000},
			canvas: image.Point{2000, 1000},
			expect: image.Point{1333, 1000},
		},
		{
			p:      ImageProcessor{imageSize: 560},
			image:  image.Point{667, 1000},
			canvas: image.Point{1000, 1000},
			expect: image.Point{667, 1000},
		},
	}

	for _, tt := range cases {
		actual := tt.p.fitToCanvas(tt.image, tt.canvas)
		if diff := cmp.Diff(actual, tt.expect); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestOptimalTiledCanvas(t *testing.T) {
	cases := []struct {
		p      ImageProcessor
		image  image.Point
		expect image.Point
	}{
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 1000},
			image:  image.Point{1024, 768},
			expect: image.Point{2000, 1000},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{1024, 768},
			expect: image.Point{1120, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{800, 600},
			expect: image.Point{1120, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{640, 480},
			expect: image.Point{1120, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{320, 200},
			expect: image.Point{560, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{1320, 200},
			expect: image.Point{1680, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{2000, 200},
			expect: image.Point{2240, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{10000, 200},
			expect: image.Point{2240, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{480, 640},
			expect: image.Point{560, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 320},
			expect: image.Point{560, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 1320},
			expect: image.Point{560, 1680},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 2000},
			expect: image.Point{560, 2240},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 10000},
			expect: image.Point{560, 2240},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{10000, 10000},
			expect: image.Point{1120, 1120},
		},
	}

	for _, tt := range cases {
		actual := tt.p.optimalTiledCanvas(tt.image)
		if diff := cmp.Diff(actual, tt.expect); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestSplitToTiles(t *testing.T) {
	cases := []struct {
		imageMax image.Point
		numTiles image.Point
		expect   []image.Image
	}{
		{
			imageMax: image.Point{1024, 768},
			numTiles: image.Point{1, 1},
			expect:   []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			imageMax: image.Point{1000, 500},
			numTiles: image.Point{2, 1},
			expect: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			imageMax: image.Point{1000, 1000},
			numTiles: image.Point{2, 2},
			expect: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	var p ImageProcessor

	for _, tt := range cases {
		actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)

		if len(actual) != len(tt.expect) {
			t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
		}

		for i := range actual {
			if actual[i].Bounds() != tt.expect[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
			}
		}
	}
}

func TestResize(t *testing.T) {
	cases := []struct {
		p                 ImageProcessor
		imageMax          image.Point
		expectImage       image.Image
		expectAspectRatio image.Point
	}{
		{
			p:                 ImageProcessor{maxNumTiles: 1, imageSize: 100},
			imageMax:          image.Point{200, 200},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 2, imageSize: 100},
			imageMax:          image.Point{200, 200},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{10, 10},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{2560, 1920},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			expectAspectRatio: image.Point{2, 2},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{1024, 768},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			expectAspectRatio: image.Point{2, 2},
		},
	}

	for _, tt := range cases {
		actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})

		if actualImage.Bounds() != tt.expectImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
		}

		if actualAspectRatio != tt.expectAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
		}
	}
}

func TestPad(t *testing.T) {
	cases := []struct {
		p           ImageProcessor
		imageMax    image.Point
		aspectRatio image.Point
expect image.Image
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
|
|
||||||
imageMax: image.Point{1000, 667},
|
|
||||||
aspectRatio: image.Point{2, 2},
|
|
||||||
expect: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range cases {
|
|
||||||
actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)
|
|
||||||
|
|
||||||
if actual.Bounds() != tt.expect.Bounds() {
|
|
||||||
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestPackImages(t *testing.T) {
|
|
||||||
cases := []struct {
|
|
||||||
imageMax image.Point
|
|
||||||
aspectRatio image.Point
|
|
||||||
expectVals int
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
imageMax: image.Point{1120, 1120},
|
|
||||||
aspectRatio: image.Point{2, 2},
|
|
||||||
expectVals: 2 * 2 * 3 * 560 * 560,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
imageMax: image.Point{560, 560},
|
|
||||||
aspectRatio: image.Point{1, 1},
|
|
||||||
expectVals: 1 * 1 * 3 * 560 * 560,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
imageMax: image.Point{1120, 560},
|
|
||||||
aspectRatio: image.Point{1, 2},
|
|
||||||
expectVals: 1 * 2 * 3 * 560 * 560,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range cases {
|
|
||||||
var p ImageProcessor
|
|
||||||
actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
|
|
||||||
if len(actualVals) != tt.expectVals {
|
|
||||||
t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestPreprocess(t *testing.T) {
|
|
||||||
cases := []struct {
|
|
||||||
imageMax image.Point
|
|
||||||
expectAspectRatioID int
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
imageMax: image.Point{10, 10},
|
|
||||||
expectAspectRatioID: 1,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
imageMax: image.Point{1024, 768},
|
|
||||||
expectAspectRatioID: 6,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
|
|
||||||
for _, tt := range cases {
|
|
||||||
img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("error processing: %q", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(img) == 0 {
|
|
||||||
t.Errorf("no image data returned")
|
|
||||||
}
|
|
||||||
|
|
||||||
if aspectRatio.rank != tt.expectAspectRatioID {
|
|
||||||
t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
@@ -16,8 +6,6 @@ import (
 
 type Model struct {
 	model.Base
-	model.BytePairEncoding
-
 	*TextModel
 	*VisionModel `gguf:"v,vision"`
 
@@ -29,20 +27,6 @@ var _ model.MultimodalProcessor = (*Model)(nil)
 
 func New(c fs.Config) (model.Model, error) {
 	m := &Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
-			},
-		),
 		TextModel:      NewTextModel(c),
 		VisionModel:    newVisionModel(c),
 		ImageProcessor: newImageProcessor(c),
@@ -88,13 +72,13 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	}
 
 	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
-	return &chunks{Model: m, Tensor: visionOutputs}, nil
+	return &chunks{Model: m, Tensor: visionOutputs, grid: grid}, nil
 }
 
 type chunks struct {
 	*Model
 	ml.Tensor
+	grid *Grid
 	dataOnce sync.Once
 	data     []float32
 }
@@ -134,7 +118,7 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 			// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
 			// the image tokens with a prompt, so we add a prefix here
 			nImg++
-			pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
+			pre, err := m.TextModel.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
 			if err != nil {
 				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
 			}
@@ -169,7 +153,28 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	fmt.Println("Forward")
+	pos := make([]int32, len(batch.Positions)*4)
+	var grid = &Grid{}
+	if len(batch.Multimodal) > 0 {
+		image := batch.Multimodal[0].Multimodal
+		grid = image.(*chunk).chunks.grid
+		for y := 0; y < grid.Height/2; y++ {
+			for x := 0; x < grid.Width/2; x++ {
+				i := y*grid.Width/2 + x
+				pos[i] = batch.Positions[i]
+				pos[i+len(batch.Positions)] = batch.Positions[i] + int32(y)
+				pos[i+len(batch.Positions)*2] = batch.Positions[i] + int32(x)
+				pos[i+len(batch.Positions)*3] = 0
+			}
+		}
+	} else {
+		copy(pos[:len(batch.Positions)], batch.Positions)
+		copy(pos[len(batch.Positions):len(batch.Positions)*2], batch.Positions)
+		copy(pos[len(batch.Positions)*2:len(batch.Positions)*3], batch.Positions)
+	}
+
+	positions, err := ctx.Input().FromIntSlice(pos, len(pos))
 	if err != nil {
 		return nil, err
 	}
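The Forward change above builds one flattened position slice with four sections so the multi-axis rotary embedding can read a separate coordinate per section: the base token position, a row offset, a column offset, and an unused zero pad. The snippet below is a standalone sketch of that layout for an assumed 4x4 grid with token positions 0..3; the program and its values are illustrative only and are not part of the change.

```go
package main

import "fmt"

func main() {
	// Assume a 4x4 image grid and token positions 0..3 for the 2x2 merged patches,
	// mirroring the loop added to Forward above.
	positions := []int32{0, 1, 2, 3}
	gridWidth, gridHeight := 4, 4

	pos := make([]int32, len(positions)*4)
	for y := 0; y < gridHeight/2; y++ {
		for x := 0; x < gridWidth/2; x++ {
			i := y*gridWidth/2 + x
			pos[i] = positions[i]                             // section 0: base position
			pos[i+len(positions)] = positions[i] + int32(y)   // section 1: row offset
			pos[i+len(positions)*2] = positions[i] + int32(x) // section 2: column offset
			pos[i+len(positions)*3] = 0                       // section 3: unused pad
		}
	}
	fmt.Println(pos) // four blocks of four values, one block per section
}
```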
@@ -7,6 +7,7 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )
 
@@ -17,6 +18,9 @@ type TextOptions struct {
 }
 
 type TextModel struct {
+	model.Base
+	model.BytePairEncoding
+
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
@@ -27,6 +31,20 @@ type TextModel struct {
 
 func NewTextModel(c fs.Config) *TextModel {
 	m := TextModel{
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+			},
+		),
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
 			ctxLen: int(c.Uint("context_length")),
@@ -47,23 +65,26 @@ func NewTextModel(c fs.Config) *TextModel {
 // SelfAttention implements the multi-head self-attention mechanism
 // with separate projections for query, key, value and output transformations
 type SelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_output"`
+	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
 }
 
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
 
+	sections := [4]int32{16, 24, 24, 0}
+
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
+	q = q.RoPEMulti(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, sections, 8, opts.ropeBase, opts.ropeScale)
 
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
+	k = k.RoPEMulti(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, sections, 8, opts.ropeBase, opts.ropeScale)
 
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -139,7 +139,7 @@ type VisionModelOptions struct {
 	ropeTheta         float32
 	spatialMergeSize  int
 	windowSize        int
-	fullAttnBlocks    []int32
+	fullAttnBlocks    []int
 	temporalPatchSize int
 }
 
@@ -235,7 +235,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid)
 	mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads)
 	// Apply encoder layers
 	for i, layer := range m.Layers {
-		if slices.Contains(m.fullAttnBlocks, int32(i)) {
+		if slices.Contains(m.fullAttnBlocks, i) {
 			hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions)
 		} else {
 			hiddenStates = layer.Forward(
@@ -383,9 +383,13 @@ func newVisionModel(c fs.Config) *VisionModel {
 			spatialMergeSize:  spatialMergeSize,
 			windowSize:        windowSize,
 			temporalPatchSize: temporalPatchSize,
-			fullAttnBlocks:    fullAttnBlocks,
 		},
 	}
 
+	for i := range fullAttnBlocks {
+		// full attention block indexes have to be converted to int for use with the slices package
+		model.fullAttnBlocks = append(model.fullAttnBlocks, int(fullAttnBlocks[i]))
+	}
+
 	return model
 }
@@ -11,6 +11,7 @@ import (
 
 // ImageProcessor contains configuration for the Qwen 2.5 VL image processing
 type ImageProcessor struct {
+	imageSize         int
 	numChannels       int
 	patchSize         int
 	temporalPatchSize int
@@ -29,16 +30,17 @@ func newImageProcessor(c fs.Config) ImageProcessor {
 	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
 
 	return ImageProcessor{
+		imageSize:         int(c.Uint("vision.image_size", 560)),
 		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
 		patchSize:         patchSize,
 		temporalPatchSize: 2,
 		mergeSize:         mergeSize,
 		minPixels:         56 * 56,
-		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
+		maxPixels:         28 * 28 * 4 * 1280,
 		factor:            patchSize * mergeSize,
 		rescaleFactor:     1.0 / 255.0,
-		imageMean:         imageproc.ClipDefaultMean[:],
-		imageStd:          imageproc.ClipDefaultSTD[:],
+		imageMean:         []float32{0.48145466, 0.4578275, 0.40821073},
+		imageStd:          []float32{0.26862954, 0.26130258, 0.27577711},
 	}
 }
 
model/models/qwen25vl/process_image_test.go (new file, 47 lines)
@@ -0,0 +1,47 @@
package qwen25vl

import (
	"image"
	_ "image/jpeg" // Register JPEG decoder
	"testing"
)

func TestSmartResize(t *testing.T) {
	type smartResizeCase struct {
		TestImage image.Image
		Expected  image.Point
	}

	// Create an image processor with default values
	processor := ImageProcessor{
		imageSize:   560, // Example value
		numChannels: 3,
		factor:      28,
		minPixels:   56 * 56,
		maxPixels:   14 * 14 * 4 * 1280,
	}

	cases := []smartResizeCase{
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
			Expected:  image.Point{980, 980},
		},
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			Expected:  image.Point{1036, 756},
		},
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
			Expected:  image.Point{980, 980},
		},
	}

	for _, c := range cases {
		b := c.TestImage.Bounds().Max
		x, y := processor.SmartResize(b.X, b.Y)
		actual := image.Point{x, y}
		if actual != c.Expected {
			t.Errorf("expected: %v, actual: %v", c.Expected, actual)
		}
	}
}

model/models/qwen2vl/imageproc.go (new file, 74 lines)
@@ -0,0 +1,74 @@
package qwen2vl

import (
	"fmt"
	"image"
	_ "image/jpeg"
	_ "image/png"
	"io"
	"math"

	"github.com/ollama/ollama/model/imageproc"
)

const (
	DefaultFactor    = 28
	DefaultMinPixels = 56 * 56
	DefaultMaxPixels = 14 * 14 * 4 * 1280
)

// smartResize calculates the size of the image to resize to based on the
// factor, minPixels, and maxPixels.
func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
	// 1. Both dimensions of size are divisible by factor
	// 2. The area of the image is between minPixels and maxPixels
	// 3. The aspect ratio of the image is as close to 1:1 as possible

	if size.Y < factor || size.X < factor {
		panic("image is too small to resize")
	} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
		panic("aspect ratio must be less than 200:1")
	}

	f := float64(factor)
	width := float64(size.X)
	height := float64(size.Y)

	xBar := math.Round(width/f) * f
	yBar := math.Round(height/f) * f

	if xBar*yBar > float64(maxPixels) {
		beta := math.Sqrt(height * width / float64(maxPixels))
		xBar = math.Floor(width/beta/f) * f
		yBar = math.Floor(height/beta/f) * f
	} else if xBar*yBar < float64(minPixels) {
		beta := math.Sqrt(float64(minPixels) / (height * width))
		xBar = math.Ceil(width*beta/f) * f
		yBar = math.Ceil(height*beta/f) * f
	}

	return image.Point{int(xBar), int(yBar)}
}

func resizeImage(img image.Image, format string, size image.Point) image.Image {
	if format == "png" {
		img = imageproc.Composite(img)
	}

	return imageproc.Resize(img, size, imageproc.ResizeBilinear)
}

func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
	img, format, err := image.Decode(imageData)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
	}

	size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
	img = resizeImage(img, format, size)

	data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)

	opts := map[string]any{}
	return data, opts, nil
}

model/models/qwen2vl/imageproc_test.go (new file, 78 lines)
@@ -0,0 +1,78 @@
package qwen2vl

import (
	"bytes"
	"image"
	"image/png"
	"testing"
)

func TestSmartResize(t *testing.T) {
	type smartResizeCase struct {
		TestImage image.Image
		Expected  image.Point
	}

	cases := []smartResizeCase{
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
			Expected:  image.Point{980, 980},
		},
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			Expected:  image.Point{1036, 756},
		},
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
			Expected:  image.Point{980, 980},
		},
	}

	for _, c := range cases {
		b := c.TestImage.Bounds().Max
		actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
		if actual != c.Expected {
			t.Errorf("expected: %v, actual: %v", c.Expected, actual)
		}
	}
}

func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage   image.Image
		ExpectedLen int
	}

	cases := []preprocessCase{
		{
			TestImage:   image.NewRGBA(image.Rect(0, 0, 256, 256)),
			ExpectedLen: 252 * 252 * 3 * 1,
		},
		{
			TestImage:   image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
			ExpectedLen: 980 * 980 * 3 * 1,
		},
	}

	for _, c := range cases {
		var buf bytes.Buffer
		err := png.Encode(&buf, c.TestImage)
		if err != nil {
			t.Fatal(err)
		}

		imgData, _, err := Preprocess(&buf)
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		switch len(imgData) {
		case 0:
			t.Errorf("no image data returned")
		case c.ExpectedLen:
			// ok
		default:
			t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen)
		}
	}
}

@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
+	"slices"
 	"sync"
 	"time"
 
@@ -17,7 +18,8 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex
 
 	clip *llama.ClipContext
+	mllama *llama.MllamaContext
 
 	// cache of images to embeddings
 	images []imageCache
@@ -33,6 +35,8 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
 		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
+	} else if arch == "mllama" {
+		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@@ -54,9 +58,12 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
+	if c.mllama != nil {
+		c.mllama.Free()
+	}
 }
 
-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
 	if c == nil {
 		return nil, nil
 	}
@@ -72,7 +79,12 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]f
 
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.clip != nil {
+		if c.mllama != nil {
+			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+			if err != nil {
+				return nil, err
+			}
+		} else if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
@@ -93,11 +105,33 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
 		return 0
 	}
 
+	// Mllama maps an image to 1 embedding token (llava creates many tokens)
+	// and doesn't support more than a single image per request.
+	// The embeddings are large (100 MB), so allocating a big batch can fail
+	// on some systems
+	if c.mllama != nil {
+		return 1
+	}
+
 	return configuredBatchSize
 }
 
 func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
-	return llamaContext.Model().NEmbd()
+	if c != nil && c.mllama != nil {
+		return c.mllama.EmbedSize(llamaContext)
+	} else {
+		return llamaContext.Model().NEmbd()
	}
+}
+
+func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
+	if c == nil || c.mllama == nil {
+		return false
+	}
+
+	return slices.ContainsFunc(inputs, func(input input) bool {
+		return input.embed != nil
+	})
 }
 
 type imageCache struct {
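As a quick orientation on the qwen2vl image preprocessing introduced above: Preprocess decodes the image, snaps it to a factor-of-28 canvas via smartResize, and returns normalized float32 pixel data. The sketch below is a hypothetical standalone driver, not code from the change; the import path assumes the package lives at model/models/qwen2vl in the ollama module, as the new files suggest.

```go
package main

import (
	"bytes"
	"fmt"
	"image"
	"image/png"
	"log"

	"github.com/ollama/ollama/model/models/qwen2vl"
)

func main() {
	// Encode a synthetic 640x480 image to PNG so Preprocess can decode it.
	var buf bytes.Buffer
	if err := png.Encode(&buf, image.NewRGBA(image.Rect(0, 0, 640, 480))); err != nil {
		log.Fatal(err)
	}

	// Preprocess resizes to a factor-of-28 canvas and returns normalized pixel values.
	data, _, err := qwen2vl.Preprocess(&buf)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("normalized values:", len(data)) // follows the smartResize output (644x476x3 here)
}
```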
@@ -57,6 +57,10 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot
 
+	// does this sequence require cross-attention layers to be processed? - if we have seen
+	// an image for certain multi-modal models
+	crossAttention bool
+
 	// channel to send responses over
 	responses chan string
 
@@ -201,7 +205,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
 				return nil, fmt.Errorf("invalid image index: %d", n)
 			}
 
-			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
+			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
 			if err != nil {
 				return nil, err
 			}
@@ -364,6 +368,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()
 
 	var batch *llama.Batch
+	crossAttention := false
 
 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@@ -411,8 +416,9 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 					batch = tokenBatch
 				} else {
 					batch = embedBatch
+					seq.crossAttention = s.image.NeedCrossAttention(input)
 				}
-			} else if embedding != batch.IsEmbedding() {
+			} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
 				s.nextSeq = seqIdx
 				break
 			}
@@ -421,6 +427,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				break
 			}
 
+			crossAttention = seq.crossAttention
 			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
 			seq.pendingInputs = append(seq.pendingInputs, input)
 			seq.iBatch = batch.NumTokens() - 1
@@ -433,11 +440,20 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}
 
+	s.lc.SetCrossAttention(crossAttention)
+
 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}
 
+	if crossAttention {
+		// synchronize state to ensure the cross attention batch is complete.
+		// needed specifically for multi-GPU systems otherwise an inflight
+		// task may be incorrectly invalidated causing a crash
+		s.lc.Synchronize()
+	}
+
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -606,6 +622,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		}
 
+		seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
+
 		s.seqs[i] = seq
 		s.cond.Signal()
 		found = true
@@ -27,7 +26,6 @@ function checkEnv() {
         $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
     }
     # Locate CUDA versions
-    # Note: this assumes every version found will be built
     $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
     if ($cudaList.length -eq 0) {
         $d=(get-command -ea 'silentlycontinue' nvcc).path
@@ -94,19 +93,6 @@ function buildOllama() {
 
         $hashEnv = @{}
         Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
-        if ("$script:CUDA_DIRS".Contains("v11")) {
-            $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
-            $env:CUDAToolkit_ROOT=$hashEnv[$v11]
-            write-host "Building CUDA v11 backend libraries"
-            # Note: cuda v11 requires msvc 2019 so force the older generator
-            # to avoid 2022 (or newer) from being used as the default
-            & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --install build --component "CUDA" --strip
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
         if ("$script:CUDA_DIRS".Contains("v12")) {
             $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
             $env:CUDAToolkit_ROOT=$hashEnv[$v12]
@@ -10,9 +10,7 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
     --build-arg=GOFLAGS \
     --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
     --build-arg=OLLAMA_SKIP_CUDA_GENERATE \
-    --build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \
     --build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \
-    --build-arg=CUDA_V11_ARCHITECTURES \
     --build-arg=CUDA_V12_ARCHITECTURES \
    --build-arg=OLLAMA_SKIP_ROCM_GENERATE \
    --build-arg=OLLAMA_FAST_BUILD \
@@ -3,32 +3,47 @@ package server
 import (
 	"bytes"
 	"context"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"log/slog"
-	"slices"
 	"strings"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/template"
 )
 
 type tokenizeFunc func(context.Context, string) ([]int, error)
 
+var errTooManyImages = errors.New("vision model only supports a single image per message")
+
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
 func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
 	var system []api.Message
 
+	isMllama := checkMllamaModelFamily(m)
+
+	var imageNumTokens int
 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
-	// Clip images are represented as 768 tokens, each an embedding
-	imageNumTokens := 768
+	if isMllama {
+		// Our mllama implementation packs all of the embeddings into a single token
+		imageNumTokens = 1
+	} else {
+		// Clip images are represented as 768 tokens, each an embedding
+		imageNumTokens = 768
+	}
 
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
+		if isMllama && len(msgs[i].Images) > 1 {
+			return "", nil, errTooManyImages
+		}
+
 		// always include the last message
 		if i == n {
 			continue
@@ -69,17 +84,48 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	currMsgIdx := n
 
 	for cnt, msg := range msgs[currMsgIdx:] {
-		if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 {
-			return "", nil, errors.New("this model only supports one image while more than one image requested")
-		}
-
-		var prefix string
+		prefix := ""
+		imgPrompt := ""
 		prompt := msg.Content
 
 		for _, i := range msg.Images {
-			imgData := llm.ImageData{
-				ID:   len(images),
-				Data: i,
+			var imgData llm.ImageData
+
+			if isMllama {
+				if len(m.ProjectorPaths) == 0 {
+					imgData = llm.ImageData{
+						ID:   len(images),
+						Data: i,
+					}
+				} else {
+					data, opts, err := mllama.Preprocess(bytes.NewReader(i))
+					if err != nil {
+						return "", nil, err
+					}
+
+					buf := new(bytes.Buffer)
+					err = binary.Write(buf, binary.LittleEndian, data)
+					if err != nil {
+						return "", nil, err
+					}
+
+					ar, ok := opts["aspectRatioIndex"].(int)
+					if !ok {
+						return "", nil, fmt.Errorf("missing aspect ratio for image")
+					}
+
+					imgData = llm.ImageData{
+						ID:            len(images),
+						Data:          buf.Bytes(),
+						AspectRatioID: ar,
+					}
+				}
+				imgPrompt = "<|image|>"
+			} else {
+				imgData = llm.ImageData{
+					ID:   len(images),
+					Data: i,
+				}
 			}
 
 			imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
@@ -91,7 +137,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 
 			images = append(images, imgData)
 		}
-		msgs[currMsgIdx+cnt].Content = prefix + prompt
+		msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt
 	}
 
 	// truncate any messages that do not fit into the context window
@@ -102,3 +148,12 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 
 	return b.String(), images, nil
 }
+
+func checkMllamaModelFamily(m *Model) bool {
+	for _, arch := range m.Config.ModelFamilies {
+		if arch == "mllama" {
+			return true
+		}
+	}
+	return false
+}
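In the mllama branch added above, the preprocessed float32 pixel values are serialized into the ImageData payload with binary.Write before the aspect ratio ID is attached. The sketch below only illustrates that little-endian float32 round trip with made-up values; it is not code from the change.

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"log"
)

func main() {
	// Hypothetical pixel values; the real data comes from mllama.Preprocess.
	data := []float32{0.5, -1.25, 3.0}

	// Pack the float32 slice into bytes, as chatPrompt does above.
	buf := new(bytes.Buffer)
	if err := binary.Write(buf, binary.LittleEndian, data); err != nil {
		log.Fatal(err)
	}

	// A consumer can decode the same bytes back into float32 values.
	decoded := make([]float32, len(data))
	if err := binary.Read(buf, binary.LittleEndian, decoded); err != nil {
		log.Fatal(err)
	}
	fmt.Println(decoded) // [0.5 -1.25 3]
}
```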
@@ -2,6 +2,8 @@ package server
 
 import (
 	"bytes"
+	"image"
+	"image/png"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -12,9 +14,10 @@ import (
 
 func TestChatPrompt(t *testing.T) {
 	type expect struct {
 		prompt string
 		images [][]byte
-		error  error
+		aspectRatioID int
+		error         error
 	}
 
 	tmpl, err := template.Parse(`
@@ -25,6 +28,28 @@ func TestChatPrompt(t *testing.T) {
 		t.Fatal(err)
 	}
 	visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
+	mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
+
+	createImg := func(width, height int) ([]byte, error) {
+		img := image.NewRGBA(image.Rect(0, 0, width, height))
+		var buf bytes.Buffer
+
+		if err := png.Encode(&buf, img); err != nil {
+			return nil, err
+		}
+
+		return buf.Bytes(), nil
+	}
+
+	imgBuf, err := createImg(5, 5)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	imgBuf2, err := createImg(6, 6)
+	if err != nil {
+		t.Fatal(err)
+	}
+
 	cases := []struct {
 		name string
@@ -202,6 +227,90 @@ func TestChatPrompt(t *testing.T) {
 				images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")},
 			},
 		},
+		{
+			name:  "messages with mllama (no images)",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
+			},
+			expect: expect{
+				prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
+			},
+		},
+		{
+			name:  "messages with mllama single prompt",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
+			},
+			expect: expect{
+				prompt:        "[img-0]<|image|>How many hotdogs are in this image? ",
+				images:        [][]byte{imgBuf},
+				aspectRatioID: 1,
+			},
+		},
+		{
+			name:  "messages with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
+			},
+			expect: expect{
+				prompt:        "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
+				images:        [][]byte{imgBuf},
+				aspectRatioID: 1,
+			},
+		},
+		{
+			name:  "multiple messages with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
+			},
+			expect: expect{
+				prompt:        "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
+				images:        [][]byte{imgBuf, imgBuf2},
+				aspectRatioID: 1,
+			},
+		},
+		{
+			name:  "earlier image with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
+				{Role: "assistant", Content: "There are four hotdogs."},
+				{Role: "user", Content: "Which ones have mustard?"},
+			},
+			expect: expect{
+				prompt:        "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
+				images:        [][]byte{imgBuf},
+				aspectRatioID: 1,
+			},
+		},
+		{
+			name:  "too many images with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}},
+			},
+			expect: expect{
+				error: errTooManyImages,
+			},
+		},
 	}
 
 	for _, tt := range cases {
@@ -232,6 +341,10 @@ func TestChatPrompt(t *testing.T) {
 				if !bytes.Equal(images[i].Data, tt.images[i]) {
 					t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
 				}
+			} else {
+				if images[i].AspectRatioID != tt.aspectRatioID {
+					t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID)
+				}
 			}
 		}
 	})
@@ -70,7 +70,23 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 			newType = fsggml.TensorTypeQ6_K
 		}
 	} else if strings.Contains(name, "attn_v.weight") {
-		if (ftype == fsggml.FileTypeQ4_K_M) &&
+		if ftype == fsggml.FileTypeQ2_K {
+			if kv.GQA() >= 4 {
+				newType = fsggml.TensorTypeQ4_K
+			} else {
+				newType = fsggml.TensorTypeQ3_K
+			}
+		} else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 {
+			newType = fsggml.TensorTypeQ4_K
+		} else if ftype == fsggml.FileTypeQ3_K_M {
+			if qs.iAttnV < 2 {
+				newType = fsggml.TensorTypeQ5_K
+			} else {
+				newType = fsggml.TensorTypeQ4_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ5_K
+		} else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) &&
 			useMoreBits(qs.iAttnV, qs.nAttnV) {
 			newType = fsggml.TensorTypeQ6_K
 		} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
@@ -98,23 +114,54 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 	} else if strings.Contains(name, "ffn_down") {
 		iLayer := qs.iFfnDown
 		n_layer := qs.nFfnDown
-		if ftype == fsggml.FileTypeQ4_K_M {
+		if ftype == fsggml.FileTypeQ2_K {
+			newType = fsggml.TensorTypeQ3_K
+		} else if ftype == fsggml.FileTypeQ2_K_S {
+			if iLayer < n_layer/8 {
+				newType = fsggml.TensorTypeQ4_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_M {
+			if iLayer < n_layer/16 {
+				newType = fsggml.TensorTypeQ5_K
+			} else if useMoreBits(iLayer, n_layer) {
+				newType = fsggml.TensorTypeQ4_K
+			} else {
+				newType = fsggml.TensorTypeQ3_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ5_K
+		} else if ftype == fsggml.FileTypeQ4_K_M {
 			if useMoreBits(iLayer, n_layer) {
 				newType = fsggml.TensorTypeQ6_K
 			}
+		} else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) {
+			newType = fsggml.TensorTypeQ6_K
 		} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
 			newType = fsggml.TensorTypeQ5_K
 		}
 		qs.iFfnDown++
 	} else if strings.Contains(name, "attn_output.weight") {
 		if nExperts == 8 {
-			if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
+			if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M ||
+				ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
+				newType = fsggml.TensorTypeQ5_K
+			}
+		} else {
+			if ftype == fsggml.FileTypeQ2_K {
+				newType = fsggml.TensorTypeQ3_K
+			} else if ftype == fsggml.FileTypeQ3_K_M {
+				newType = fsggml.TensorTypeQ4_K
+			} else if ftype == fsggml.FileTypeQ3_K_L {
 				newType = fsggml.TensorTypeQ5_K
 			}
 		}
 	} else if strings.Contains(name, "attn_qkv.weight") {
-		if ftype == fsggml.FileTypeQ4_K_M {
+		if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ4_K
+		} else if ftype == fsggml.FileTypeQ4_K_M {
 			newType = fsggml.TensorTypeQ5_K
+		} else if ftype == fsggml.FileTypeQ5_K_M {
+			newType = fsggml.TensorTypeQ6_K
 		}
 	}
 
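The new branches above repeatedly defer to the useMoreBits helper to decide which layers get a wider quantization type. Its implementation is not part of this diff; the sketch below mirrors the upstream llama.cpp heuristic (first eighth, last eighth, and every third layer in between) and is only an assumption about how the Ollama helper behaves.

```go
package main

import "fmt"

// useMoreBits is a sketch of the layer-selection heuristic referenced above:
// spend extra bits on the first eighth of layers, the last eighth, and every
// third layer in between. This mirrors llama.cpp's use_more_bits and may
// differ from the actual helper in the Ollama quantizer.
func useMoreBits(iLayer, nLayers int) bool {
	return iLayer < nLayers/8 || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}

func main() {
	for i := 0; i < 16; i++ {
		fmt.Printf("layer %2d -> more bits: %v\n", i, useMoreBits(i, 16))
	}
}
```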
||||||
|
@ -42,6 +42,71 @@ func TestGetTensorNewType(t *testing.T) {
|
|||||||
ftype: fsggml.FileTypeF32,
|
ftype: fsggml.FileTypeF32,
|
||||||
expected: fsggml.TensorTypeQ6_K,
|
expected: fsggml.TensorTypeQ6_K,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q4_k",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.attention.head_count": uint32(4),
|
||||||
|
"foo.attention.head_count_kv": uint32(1),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ3_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q2_k_s_q4_k",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.attention.head_count": uint32(4),
|
||||||
|
"foo.attention.head_count_kv": uint32(1),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K_S,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_m",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_m_i",
|
||||||
|
qs: quantizeState{
|
||||||
|
iAttnV: 2,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_l",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_L,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "attn_v.weight_q4_k_m",
|
name: "attn_v.weight_q4_k_m",
|
||||||
qs: quantizeState{
|
qs: quantizeState{
|
||||||
@@ -91,6 +156,88 @@ func TestGetTensorNewType(t *testing.T) {
             ftype: fsggml.FileTypeF32,
             expected: fsggml.TensorTypeQ8_0,
         },
+        {
+            name: "ffn_down_q2_k",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K,
+            expected: fsggml.TensorTypeQ3_K,
+        },
+        {
+            name: "ffn_down_q2_k_s",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K_S,
+            expected: fsggml.TensorTypeQ4_0,
+        },
+        {
+            name: "ffn_down_q2_k_s_layers",
+            qs: quantizeState{
+                iFfnDown: 2,
+                nFfnDown: 3 * 8,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K_S,
+            expected: fsggml.TensorTypeQ4_K,
+        },
+        {
+            name: "ffn_down_q3_k_m_base",
+            qs: quantizeState{
+                iFfnDown: 1,
+                nFfnDown: 8,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ3_K,
+        },
+        {
+            name: "ffn_down_q3_k_m_16",
+            qs: quantizeState{
+                iFfnDown: 2,
+                nFfnDown: 3 * 16,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ5_K,
+        },
+        {
+            name: "ffn_down_q3_k_m_8",
+            qs: quantizeState{
+                iFfnDown: 2,
+                nFfnDown: 3 * 8,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ4_K,
+        },
+        {
+            name: "ffn_down_q3_k_l",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_L,
+            expected: fsggml.TensorTypeQ5_K,
+        },
         {
             name: "ffn_down_q4_k_m",
             qs: quantizeState{
@@ -117,6 +264,19 @@ func TestGetTensorNewType(t *testing.T) {
             ftype: fsggml.FileTypeQ4_K_M,
             expected: fsggml.TensorTypeQ6_K,
         },
+        {
+            name: "ffn_down_q5_k_m",
+            qs: quantizeState{
+                iFfnDown: 2,
+                nFfnDown: 3 * 8,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ5_K_M,
+            expected: fsggml.TensorTypeQ6_K,
+        },
         {
             name: "ffn_down_q4_k_s",
             qs: quantizeState{
@@ -130,6 +290,59 @@ func TestGetTensorNewType(t *testing.T) {
             ftype: fsggml.FileTypeQ4_K_S,
             expected: fsggml.TensorTypeQ5_K,
         },
+        {
+            name: "attn_output.weight_8_expert",
+            qs: quantizeState{},
+            kv: map[string]any{
+                "general.architecture": "foo",
+                "foo.expert_count": uint32(8),
+            },
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_output.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K,
+            expected: fsggml.TensorTypeQ5_K,
+        },
+        {
+            name: "attn_output.weight_q2",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_output.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K,
+            expected: fsggml.TensorTypeQ3_K,
+        },
+        {
+            name: "attn_output.weight_q3_k_m",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_output.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ4_K,
+        },
+        {
+            name: "attn_output.weight_q3_k_l",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_output.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_L,
+            expected: fsggml.TensorTypeQ5_K,
+        },
+        {
+            name: "attn_qkv.weight_q3_k_m",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_qkv.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ4_K,
+        },
         {
             name: "attn_qkv.weight_q4_k_m",
             qs: quantizeState{},
@@ -140,6 +353,16 @@ func TestGetTensorNewType(t *testing.T) {
             ftype: fsggml.FileTypeQ4_K_M,
             expected: fsggml.TensorTypeQ5_K,
         },
+        {
+            name: "attn_qkv.weight_q5_k_m",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_qkv.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ5_K_M,
+            expected: fsggml.TensorTypeQ6_K,
+        },
     }
     for _, tt := range cases {
         t.Run(tt.name, func(t *testing.T) {
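The new entries above all populate the same table-driven case value; the struct declaration itself sits outside these hunks, so the following is only a sketch inferred from the field names used in the diff, not the actual definition.

// Inferred shape of one TestGetTensorNewType case; field names are taken
// from the entries above, the concrete types are assumptions.
type tensorNewTypeCase struct {
    name        string            // test case label
    qs          quantizeState     // per-layer quantization counters (iAttnV, iFfnDown, nFfnDown, ...)
    kv          map[string]any    // model metadata such as head counts and expert_count
    newType     fsggml.TensorType // candidate type proposed for the tensor
    tensor_name string            // e.g. "blk.0.attn_v.weight" or "ffn_down"
    shape       []uint64          // tensor dimensions
    ftype       fsggml.FileType   // requested file-level quantization
    expected    fsggml.TensorType // type the function under test should return
}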
@@ -4,10 +4,10 @@ import (
     "bytes"
     "cmp"
     "context"
+    "encoding/binary"
     "encoding/json"
     "errors"
     "fmt"
-    "image"
     "io"
     "io/fs"
     "log/slog"
@ -25,7 +25,6 @@ import (
|
|||||||
|
|
||||||
"github.com/gin-contrib/cors"
|
"github.com/gin-contrib/cors"
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"golang.org/x/image/webp"
|
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
@@ -34,6 +33,7 @@ import (
     "github.com/ollama/ollama/fs/ggml"
     "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/logutil"
+    "github.com/ollama/ollama/model/models/mllama"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/server/internal/client/ollama"
     "github.com/ollama/ollama/server/internal/registry"
@@ -98,10 +98,6 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
         return nil, nil, nil, err
     }

-    if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 {
-        return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
-    }
-
     if err := model.CheckCapabilities(caps...); err != nil {
         return nil, nil, nil, fmt.Errorf("%s %w", name, err)
     }
@@ -208,14 +204,38 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         return
     }

-    if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
-        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
+    isMllama := checkMllamaModelFamily(m)
+    if isMllama && len(req.Images) > 1 {
+        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"})
         return
     }

     images := make([]llm.ImageData, len(req.Images))
     for i := range req.Images {
-        images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
+        if isMllama && len(m.ProjectorPaths) > 0 {
+            data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
+            if err != nil {
+                c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+                return
+            }
+
+            ar, ok := opts["aspectRatioIndex"].(int)
+            if !ok {
+                c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+                return
+            }
+
+            buf := new(bytes.Buffer)
+            err = binary.Write(buf, binary.LittleEndian, data)
+            if err != nil {
+                c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+                return
+            }
+
+            images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
+        } else {
+            images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
+        }
     }

     prompt := req.Prompt
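The preprocessing branch above serializes the tensor returned by mllama.Preprocess into little-endian bytes before attaching it to llm.ImageData. A self-contained sketch of that encoding step follows; the element type produced by Preprocess is not visible in this hunk, so float32 is an assumption used only for illustration.

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
)

// encodeLittleEndian packs a slice of fixed-size values into little-endian
// bytes, mirroring the binary.Write call in the handler above.
func encodeLittleEndian(data []float32) ([]byte, error) {
    buf := new(bytes.Buffer)
    if err := binary.Write(buf, binary.LittleEndian, data); err != nil {
        return nil, err
    }
    return buf.Bytes(), nil
}

func main() {
    b, err := encodeLittleEndian([]float32{0.25, 0.5, 0.75})
    if err != nil {
        panic(err)
    }
    fmt.Println(len(b)) // 12: three float32 values, four bytes each
}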
@@ -247,6 +267,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {

         for _, i := range images {
             imgPrompt := ""
+            if isMllama {
+                imgPrompt = "<|image|>"
+            }
             msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
         }

@@ -1281,10 +1304,6 @@ func Serve(ln net.Listener) error {

     s.sched.Run(schedCtx)

-    // register the experimental webp decoder
-    // so webp images can be used in multimodal inputs
-    image.RegisterFormat("webp", "RIFF????WEBP", webp.Decode, webp.DecodeConfig)
-
     // At startup we retrieve GPU information so we can get log messages before loading a model
     // This will log warnings to the log in case we have problems with detected GPUs
     gpus := discover.GetGPUInfo()
@@ -8,7 +8,6 @@ import (
    "os"
    "reflect"
    "runtime"
-   "slices"
    "sort"
    "strconv"
    "strings"
@@ -133,11 +132,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
                continue
            }
            numParallel := int(envconfig.NumParallel())
-           // `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
-           // ref: https://github.com/ollama/ollama/issues/4165
-           if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
+           // TODO (jmorganca): mllama doesn't support parallel yet
+           // see https://github.com/ollama/ollama/issues/4165
+           if checkMllamaModelFamily(pending.model) && numParallel != 1 {
                numParallel = 1
-               slog.Warn("mllama does not currently support parallel requests")
+               slog.Warn("mllama doesn't support parallel requests yet")
            }

            for {
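Both the GenerateHandler and processPending hunks call checkMllamaModelFamily, whose definition is not part of the diff shown here. Given the slices.Contains checks it replaces, a plausible implementation would look like the sketch below; this is an assumption, not the committed helper.

// Hypothetical sketch of the helper referenced above; it simply reports
// whether any declared model family is "mllama".
func checkMllamaModelFamily(m *Model) bool {
    for _, f := range m.Config.ModelFamilies {
        if f == "mllama" {
            return true
        }
    }
    return false
}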