diff --git a/Makefile.sync b/Makefile.sync index bceae7f59..711667c98 100644 --- a/Makefile.sync +++ b/Makefile.sync @@ -15,11 +15,13 @@ help: @echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync" .PHONY: sync -sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml +sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal -.PHONY: llama/build-info.cpp -llama/build-info.cpp: llama/build-info.cpp.in - sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@ +llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp + sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@ + +ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml + go generate ./$(@D) .PHONY: llama/llama.cpp llama/llama.cpp: llama/vendor/ @@ -30,12 +32,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/ rsync -arvzc -f "merge $@/.rsync-filter" $< $@ PATCHES=$(wildcard llama/patches/*.patch) +PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES))))) .PHONY: apply-patches .NOTPARALLEL: -apply-patches: $(addsuffix ed, $(PATCHES)) +apply-patches: $(PATCHED) -%.patched: %.patch +llama/patches/.%.patched: llama/patches/%.patch @if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi .PHONY: checkout @@ -57,4 +60,4 @@ format-patches: llama/patches .PHONE: clean clean: checkout - $(RM) $(addsuffix ed, $(PATCHES)) + $(RM) llama/patches/.*.patched diff --git a/convert/convert.go b/convert/convert.go index 249ec8077..48804d7f3 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -1,6 +1,7 @@ package convert import ( + "cmp" "encoding/json" "errors" "fmt" @@ -14,13 +15,12 @@ import ( ) type ModelParameters struct { - Architectures []string `json:"architectures"` - VocabSize uint32 `json:"vocab_size"` - TextModel TextParameters `json:"text_config"` -} + Architectures []string `json:"architectures"` + VocabSize uint32 `json:"vocab_size"` -type TextParameters struct { - VocabSize uint32 `json:"vocab_size"` + TextModel struct { + VocabSize uint32 `json:"vocab_size"` + } `json:"text_config"` } type AdapterParameters struct { @@ -173,6 +173,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error { switch p.Architectures[0] { case "LlamaForCausalLM": conv = &llamaModel{} + case "MllamaForConditionalGeneration": + conv = &mllamaModel{} case "Llama4ForConditionalGeneration": conv = &llama4Model{} case "Mistral3ForConditionalGeneration": @@ -212,24 +214,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error { return err } - vocabSize := int(p.VocabSize) - if vocabSize == 0 { - tVocabSize := int(p.TextModel.VocabSize) - vocabSize = tVocabSize - } + vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize)) switch { case vocabSize == 0: - slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens)) + slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens)) case vocabSize > len(t.Vocabulary.Tokens): - slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens)) + slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens)) for i := range vocabSize - len(t.Vocabulary.Tokens) { t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i)) t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1) t.Vocabulary.Types = 
append(t.Vocabulary.Types, tokenTypeUserDefined) } case vocabSize < len(t.Vocabulary.Tokens): - return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize) + slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens)) + p.VocabSize = uint32(len(t.Vocabulary.Tokens)) + p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens)) default: slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } diff --git a/convert/convert_mllama.go b/convert/convert_mllama.go new file mode 100644 index 000000000..12478be71 --- /dev/null +++ b/convert/convert_mllama.go @@ -0,0 +1,160 @@ +package convert + +import ( + "strings" + + "github.com/ollama/ollama/fs/ggml" + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" +) + +type mllamaModel struct { + ModelParameters + TextModel struct { + llamaModel + + CrossAttentionLayers []int32 `json:"cross_attention_layers"` + } `json:"text_config"` + VisionModel struct { + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NumGlobalLayers uint32 `json:"num_global_layers"` + IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"` + + HiddenSize uint32 `json:"hidden_size"` + IntermediateSize uint32 `json:"intermediate_size"` + + AttentionHeads uint32 `json:"attention_heads"` + + ImageSize uint32 `json:"image_size"` + PatchSize uint32 `json:"patch_size"` + NumChannels uint32 `json:"num_channels"` + MaxNumTiles uint32 `json:"max_num_tiles"` + NormEpsilon float32 `json:"norm_eps"` + RopeTheta float32 `json:"rope.freq_base"` + } `json:"vision_config"` +} + +func (m *mllamaModel) KV(t *Tokenizer) ggml.KV { + kv := m.ModelParameters.KV(t) + kv["general.architecture"] = "mllama" + + for k, v := range m.TextModel.KV(t) { + if strings.HasPrefix(k, "llama.") { + kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v + } + } + + kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers + + kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers + kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers + kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices + + kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize + kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize + + kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads + kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon + + kv["mllama.vision.image_size"] = m.VisionModel.ImageSize + kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize + kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles + kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels + + return kv +} + +func (m *mllamaModel) Replacements() []string { + return append( + m.TextModel.Replacements(), + "language_model.", "", + "gate_attn", "attn_gate", + "gate_ffn", "ffn_gate", + "cross_attn.", "cross_attn_", + "vision_model", "v", + "class_embedding", "class_embd", + "patch_embedding", "patch_embd", + "gated_positional_embedding.tile_embedding", "tile_position_embd", + "gated_positional_embedding.embedding", "position_embd.weight", + "gated_positional_embedding", "position_embd", + "embedding.weight", "weight", + "pre_tile_positional_embedding", "pre_tile_position_embd", + "post_tile_positional_embedding", "post_tile_position_embd", + "layernorm_pre", "pre_ln", + "layernorm_post", "post_ln", + "global_transformer.layers", "global.blk", + "transformer.layers", "blk", + "mlp.fc1", 
"ffn_up", + "mlp.fc2", "ffn_down", + "multi_modal_projector", "mm.0", + ) +} + +func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor + var text []Tensor + for _, t := range ts { + if t.Name() == "v.position_embd.gate" { + for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} { + tt := t.Clone() + tt.SetRepacker(m.repack(name)) + out = append(out, &ggml.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: tt, + }) + } + } else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" { + t.SetRepacker(m.repack(t.Name())) + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") { + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } else { + text = append(text, t) + } + } + + return append(out, m.TextModel.Tensors(text)...) +} + +func (m *mllamaModel) repack(name string) Repacker { + return func(_ string, data []float32, shape []uint64) (_ []float32, err error) { + dims := make([]int, len(shape)) + for i, dim := range shape { + dims[i] = int(dim) + } + + var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + t, err = tensor.Tanh(t) + if err != nil { + return nil, err + } + + if name == "v.position_embd.gate" { + t, err = tensor.Sub(float32(1), t) + if err != nil { + return nil, err + } + } + + t = tensor.Materialize(t) + // flatten tensor so it can be return as a vector + if err := t.Reshape(t.Shape().TotalSize()); err != nil { + return nil, err + } + + return native.VectorF32(t.(*tensor.Dense)) + } +} diff --git a/convert/reader.go b/convert/reader.go index ab81d5c0b..07d12f0dd 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -38,7 +38,10 @@ const ( func (t tensorBase) Kind() uint32 { if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || t.name == "token_types.weight" || - t.name == "v.positional_embedding_vlm" { + t.name == "v.positional_embedding_vlm" || + t.name == "v.tile_position_embd.weight" || + t.name == "v.pre_tile_position_embd.weight" || + t.name == "v.post_tile_position_embd.weight" { // these tensors are always F32 return 0 } diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 735d41fa5..c29d715bd 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -125,6 +125,7 @@ func (kv KV) OllamaEngineRequired() bool { "gemma3", "mistral3", "llama4", + "mllama", }, kv.Architecture()) } diff --git a/llama/llama.cpp/include/llama.h b/llama/llama.cpp/include/llama.h index 41beef219..abedebdb7 100644 --- a/llama/llama.cpp/include/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -258,7 +258,6 @@ extern "C" { llama_token * token; float * embd; - int32_t n_embd; llama_pos * pos; int32_t * n_seq_id; llama_seq_id ** seq_id; @@ -366,7 +365,6 @@ extern "C" { bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool no_perf; // whether to measure performance timings bool op_offload; // whether to offload host tensor operations to device - bool cross_attn; // whether to use cross attention }; // model quantization parameters @@ -466,10 +464,6 @@ extern "C" { struct llama_context_params params), "use llama_init_from_model instead"); - // TODO (jmorganca): this should most likely be passed in as part of a batch - // and not set on the context for all batches. 
- LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state); - // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp index eb7b5325e..5ab3f5722 100644 --- a/llama/llama.cpp/src/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -6,7 +6,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_MLLAMA, "mllama" }, { LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_DECI, "deci" }, { LLM_ARCH_FALCON, "falcon" }, @@ -145,7 +144,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, - { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, @@ -275,40 +273,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, - { - LLM_ARCH_MLLAMA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" }, - { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" }, - { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" }, - { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" }, - { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" }, - { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" }, - { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" }, - { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" }, - }, - }, { LLM_ARCH_DECI, { @@ -1737,14 +1701,6 @@ static const std::map LLM_TENSOR_INFOS = { // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, 
{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h index bc8a4f0bb..525c1b7d4 100644 --- a/llama/llama.cpp/src/llama-arch.h +++ b/llama/llama.cpp/src/llama-arch.h @@ -11,7 +11,6 @@ enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_LLAMA4, - LLM_ARCH_MLLAMA, LLM_ARCH_DECI, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, @@ -149,7 +148,6 @@ enum llm_kv { LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, - LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, @@ -351,14 +349,6 @@ enum llm_tensor { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, LLM_TENSOR_BSKCN_TV, - LLM_TENSOR_CROSS_ATTN_K_NORM, - LLM_TENSOR_CROSS_ATTN_K_PROJ, - LLM_TENSOR_CROSS_ATTN_O_PROJ, - LLM_TENSOR_CROSS_ATTN_Q_NORM, - LLM_TENSOR_CROSS_ATTN_Q_PROJ, - LLM_TENSOR_CROSS_ATTN_V_PROJ, - LLM_TENSOR_CROSS_ATTN_ATTN_GATE, - LLM_TENSOR_CROSS_ATTN_MLP_GATE, LLM_TENSOR_CONV1D, LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_NORM, diff --git a/llama/llama.cpp/src/llama-batch.cpp b/llama/llama.cpp/src/llama-batch.cpp index 241b316e8..a88b2fe30 100644 --- a/llama/llama.cpp/src/llama-batch.cpp +++ b/llama/llama.cpp/src/llama-batch.cpp @@ -320,7 +320,6 @@ struct llama_batch llama_batch_get_one( /*n_tokens =*/ n_tokens, /*tokens =*/ tokens, /*embd =*/ nullptr, - /*n_embd =*/ 0, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, @@ -333,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ /*n_tokens =*/ 0, /*tokens =*/ nullptr, /*embd =*/ nullptr, - /*n_embd =*/ 0, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, @@ -342,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); - batch.n_embd = embd; } else { batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp index c5948e8fb..1f3a39564 100644 --- a/llama/llama.cpp/src/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); } - return logits + j*model.hparams.n_vocab; + return logits + j*model.vocab.n_tokens(); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -632,10 +632,6 @@ void llama_context::set_warmup(bool value) { cparams.warmup = value; } -void llama_context::set_cross_attn(bool value) { - cparams.cross_attn = value; -} - void llama_context::set_adapter_lora( llama_adapter_lora * adapter, float scale) { @@ -713,7 +709,7 @@ int llama_context::encode(llama_batch & inp_batch) { const int64_t n_embd = hparams.n_embd; - llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); + llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); @@ -867,9 +863,10 @@ int 
llama_context::decode(llama_batch & inp_batch) { const llama_batch & batch = batch_allocr.batch; + const auto & vocab = model.vocab; const auto & hparams = model.hparams; - const int32_t n_vocab = hparams.n_vocab; + const int32_t n_vocab = vocab.n_tokens(); const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; @@ -1093,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) { // make the outputs have the same order they had in the user-provided batch // note: this is mostly relevant for recurrent models atm if (!sorted_output) { - const uint32_t n_vocab = model.hparams.n_vocab; + const uint32_t n_vocab = model.vocab.n_tokens(); const uint32_t n_embd = model.hparams.n_embd; GGML_ASSERT((size_t) n_outputs == out_ids.size()); @@ -1148,11 +1145,12 @@ int llama_context::decode(llama_batch & inp_batch) { int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; + const auto & vocab = model.vocab; const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); const auto n_batch = cparams.n_batch; - const auto n_vocab = hparams.n_vocab; + const auto n_vocab = vocab.n_tokens(); const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead @@ -1687,7 +1685,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab); + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); io.write(&logits_size, sizeof(logits_size)); @@ -2099,7 +2097,6 @@ llama_context_params llama_context_default_params() { /*.flash_attn =*/ false, /*.no_perf =*/ true, /*.op_offload =*/ true, - /*.cross_attn =*/ false, }; return result; @@ -2225,10 +2222,6 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { ctx->set_warmup(warmup); } -void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { - ctx->set_cross_attn(cross_attention); -} - void llama_synchronize(llama_context * ctx) { ctx->synchronize(); } diff --git a/llama/llama.cpp/src/llama-context.h b/llama/llama.cpp/src/llama-context.h index 9970dfc6d..0264e9371 100644 --- a/llama/llama.cpp/src/llama-context.h +++ b/llama/llama.cpp/src/llama-context.h @@ -72,7 +72,6 @@ struct llama_context { void set_embeddings (bool value); void set_causal_attn(bool value); void set_warmup(bool value); - void set_cross_attn(bool value); void set_adapter_lora( llama_adapter_lora * adapter, diff --git a/llama/llama.cpp/src/llama-cparams.h b/llama/llama.cpp/src/llama-cparams.h index 7a6156ced..246fa5777 100644 --- a/llama/llama.cpp/src/llama-cparams.h +++ b/llama/llama.cpp/src/llama-cparams.h @@ -31,7 +31,6 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; - bool cross_attn; enum llama_pooling_type pooling_type; diff --git a/llama/llama.cpp/src/llama-graph.cpp b/llama/llama.cpp/src/llama-graph.cpp index f14869cf5..b0e3f6359 100644 --- a/llama/llama.cpp/src/llama-graph.cpp +++ b/llama/llama.cpp/src/llama-graph.cpp @@ -532,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { } } -void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) { - if (ubatch->embd) { - ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state)); - } -} - // // llm_graph_context // @@ -1520,25 +1514,6 @@ llm_graph_input_attn_cross * 
llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } -ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const { - const int64_t n_embd = hparams.n_embd; - - auto inp = std::make_unique(); - - ggml_tensor * cur = nullptr; - - inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4); - ggml_set_input(inp->cross_attn_state); - - cur = inp->cross_attn_state; - - cb(cur, "inp_cross_attn_state", -1); - - res->add_input(std::move(inp)); - - return cur; -} - ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_cross * inp, ggml_cgraph * gf, diff --git a/llama/llama.cpp/src/llama-graph.h b/llama/llama.cpp/src/llama-graph.h index 5a3227850..832a8c09f 100644 --- a/llama/llama.cpp/src/llama-graph.h +++ b/llama/llama.cpp/src/llama-graph.h @@ -87,7 +87,6 @@ public: ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] - ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] }; class llm_graph_input_pos : public llm_graph_input_i { @@ -285,16 +284,6 @@ public: const llama_cross * cross = nullptr; }; -class llm_graph_input_cross_attn_state : public llm_graph_input_i { -public: - llm_graph_input_cross_attn_state() = default; - virtual ~llm_graph_input_cross_attn_state() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] -}; - // // llm_graph_result // @@ -506,7 +495,6 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_mask() const; - ggml_tensor * build_inp_cross_attn_state() const; ggml_tensor * build_inp_cross_embd() const; ggml_tensor * build_inp_pos_bucket_enc() const; diff --git a/llama/llama.cpp/src/llama-hparams.cpp b/llama/llama.cpp/src/llama-hparams.cpp index 6a02de036..8a6679601 100644 --- a/llama/llama.cpp/src/llama-hparams.cpp +++ b/llama/llama.cpp/src/llama-hparams.cpp @@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } - -bool llama_hparams::cross_attention_layers(uint32_t il) const { - return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); -} diff --git a/llama/llama.cpp/src/llama-hparams.h b/llama/llama.cpp/src/llama-hparams.h index b6fc7e6df..48dce4071 100644 --- a/llama/llama.cpp/src/llama-hparams.h +++ b/llama/llama.cpp/src/llama-hparams.h @@ -2,8 +2,6 @@ #include "llama.h" -#include - #include // bump if necessary @@ -44,7 +42,6 @@ struct llama_hparams { uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; - uint32_t n_vocab = 0; // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA uint32_t n_embd_head_k_mla = 0; @@ -59,7 +56,6 @@ struct llama_hparams { std::array n_ff_arr; std::array, 4> n_bskcn_arr = {}; - std::array cross_attn_layers; uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; @@ -163,9 +159,6 @@ struct llama_hparams { // Block skip connection bool n_bskcn(uint32_t n, uint32_t il) const; - // cross attention layers - bool cross_attention_layers(uint32_t il) const; - bool is_swa(uint32_t il) const; }; diff --git a/llama/llama.cpp/src/llama-kv-cache.cpp b/llama/llama.cpp/src/llama-kv-cache.cpp index 1a50c0347..60e67b036 100644 --- a/llama/llama.cpp/src/llama-kv-cache.cpp +++ b/llama/llama.cpp/src/llama-kv-cache.cpp @@ -100,16 +100,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( throw 
std::runtime_error("failed to create ggml context for kv cache"); } - ggml_tensor * k, *v; - - // for cross attention layers - if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { - k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i)); - v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i)); - } else { - k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); - } + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); k_l.push_back(k); @@ -459,7 +451,7 @@ void llama_kv_cache_unified::set_full() { llama_sbatch llama_kv_cache_unified::sbatch_init( const llama_batch & batch, bool logits_all) { - return llama_sbatch(batch, batch.n_embd, true, logits_all); + return llama_sbatch(batch, hparams.n_embd, true, logits_all); } llama_ubatch llama_kv_cache_unified::ubatch_next( diff --git a/llama/llama.cpp/src/llama-model-loader.cpp b/llama/llama.cpp/src/llama-model-loader.cpp index 2acfd4a8f..7f6617fac 100644 --- a/llama/llama.cpp/src/llama-model-loader.cpp +++ b/llama/llama.cpp/src/llama-model-loader.cpp @@ -315,8 +315,6 @@ namespace GGUFMeta { return true; } - template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array& result, bool required); - template bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { const int kid = gguf_find_key(meta.get(), key.c_str()); diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index e8298f562..db62973fa 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -433,7 +433,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get general kv ml.get_key(LLM_KV_GENERAL_NAME, name, false); - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -445,7 +444,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false); if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); @@ -469,11 +467,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); - ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -526,7 +522,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch 
== LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -589,16 +585,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.use_kq_norm = false; } } break; - case LLM_ARCH_MLLAMA: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - switch (hparams.n_layer) { - case 40: type = LLM_TYPE_11B; break; - case 100: type = LLM_TYPE_90B; break; - default: type = LLM_TYPE_UNKNOWN; - } - } break; case LLM_ARCH_DECI: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -1595,7 +1581,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; - const int64_t n_vocab = hparams.n_vocab; + const int64_t n_vocab = vocab.n_tokens(); const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; @@ -1854,52 +1840,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; - case LLM_ARCH_MLLAMA: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); - - // output - { - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); - - // if output is NULL, init from the input tok embed - if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); - } - } - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - if (hparams.cross_attention_layers(i)) { - layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); - layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0); - layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0); - layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0); - layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0); - layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0); - layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0); - layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0); - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - } else { - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, 
"weight", i), {n_embd, n_embd_v_gqa}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - } - } - } break; case LLM_ARCH_DECI: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4816,246 +4756,6 @@ struct llm_build_llama : public llm_graph_context { } }; -struct llm_build_mllama: public llm_graph_context { - llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - ggml_tensor * cur; - ggml_tensor * inpL; - ggml_tensor * inpCAS; - - inpL = build_inp_embd(model.tok_embd); - inpCAS = build_inp_cross_attn_state(); - - // inp_pos - contains the positions - ggml_tensor * inp_pos = build_inp_pos(); - - auto * inp_attn = build_attn_inp_kv_unified(); - const llama_kv_cache_unified * kv_self = static_cast(memory); - - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - if (hparams.cross_attention_layers(il)) { - if (!ubatch.embd && !cparams.cross_attn) { - continue; - } - - // cross attention layer - ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); - cb(Qcur, "Qcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); - cb(Qcur, "Qcur", il); - - Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il); - cb(Qcur, "Qcur", il); - - ggml_tensor * Kcur, * Vcur; - if (ubatch.embd) { - Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); - cb(Kcur, "Kcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404); - cb(Kcur, "Kcur", il); - - Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - cb(Kcur, "Kcur", il); - - Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il); - cb(Kcur, "Kcur", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il])); - - Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); - cb(Vcur, "Vcur", il); - - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404); - cb(Vcur, "Vcur", il); - - Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); - cb(Vcur, "Vcur", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il])); - } else { - Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]); - cb(Kcur, "Kcur (view)", il); - - Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]); - cb(Vcur, "Vcur (view)", il); - } - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur); - 
cb(kq, "kq", il); - - // TODO: apply causal masks - struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - cb(kq_soft_max, "kq_soft_max", il); - - Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur)); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur); - cb(cur, "cur", il); - - // TODO: do this in place once? - cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate)); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - // TODO: do this inplace once? - cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); - cb(cur, "ffn_out", il); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } else { - // self attention layer - - // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); - - // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - cur = build_attn(inp_attn, gf, - model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); 
- cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - res->t_embd = cur; - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - res->t_logits = cur; - - ggml_build_forward_expand(gf, cur); - } -}; - struct llm_build_deci : public llm_graph_context { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -13428,10 +13128,6 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; - case LLM_ARCH_MLLAMA: - { - llm = std::make_unique(*this, params, gf); - } break; case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); @@ -13793,7 +13489,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA4: - case LLM_ARCH_MLLAMA: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: diff --git a/llama/llama.cpp/src/llama-model.h b/llama/llama.cpp/src/llama-model.h index 9281e6296..43746c7dd 100644 --- a/llama/llama.cpp/src/llama-model.h +++ b/llama/llama.cpp/src/llama-model.h @@ -11,7 +11,6 @@ #include #include #include -#include struct llama_cparams; struct llama_ubatch; @@ -75,7 +74,6 @@ enum llm_type { LLM_TYPE_40B, LLM_TYPE_65B, LLM_TYPE_70B, - LLM_TYPE_90B, LLM_TYPE_236B, LLM_TYPE_290B, LLM_TYPE_314B, @@ -320,16 +318,6 @@ struct llama_layer { struct ggml_tensor * bskcn_tv = nullptr; - // cross attention - struct ggml_tensor * cross_attn_k_norm = nullptr; - struct ggml_tensor * cross_attn_k_proj = nullptr; - struct ggml_tensor * cross_attn_o_proj = nullptr; - struct ggml_tensor * cross_attn_q_norm = nullptr; - struct ggml_tensor * cross_attn_q_proj = nullptr; - struct ggml_tensor * cross_attn_v_proj = nullptr; - struct ggml_tensor * cross_attn_attn_gate = nullptr; - struct ggml_tensor * cross_attn_mlp_gate = nullptr; - struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp index 56531980c..820d5128e 100644 --- a/llama/llama.cpp/src/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -639,9 +639,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (llama_model_has_encoder(&model)) { n_attn_layer *= 3; } - if (qs.n_attention_wv != n_attn_layer) { - LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv); - } + GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); } size_t total_size_org = 0; diff --git a/llama/llama.cpp/tools/mtmd/llava.cpp b/llama/llama.cpp/tools/mtmd/llava.cpp index b0eb79bb3..ebef8b3c1 100644 --- 
a/llama/llama.cpp/tools/mtmd/llava.cpp +++ b/llama/llama.cpp/tools/mtmd/llava.cpp @@ -462,7 +462,7 @@ struct llava_embd_batch { std::vector seq_ids; std::vector logits; llama_batch batch; - llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { pos .resize(n_tokens); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); @@ -474,7 +474,6 @@ struct llava_embd_batch { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, /*embd =*/ embd, - /*n_embd =*/ n_embd, /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), @@ -498,7 +497,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ n_eval = n_batch; } float * embd = image_embed->embed+i*n_embd; - llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); + llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); if (llama_decode(ctx_llama, llava_batch.batch)) { LOG_ERR("%s : failed to eval\n", __func__); return false; diff --git a/llama/llama.go b/llama/llama.go index f0f2af828..1251be3a5 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -17,7 +17,6 @@ package llama #include "llava.h" #include "gguf.h" -#include "mllama.h" #include "sampling_ext.h" extern bool llamaProgressCallback(float progress, void *user_data); @@ -510,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, return embed, nil } -type MllamaContext struct { - c *C.struct_mllama_ctx -} - -func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) { - mp := C.CString(modelPath) - defer C.free(unsafe.Pointer(mp)) - c := C.mllama_model_load(mp, 1) - if c == nil { - return nil, fmt.Errorf("unable to load mllama model: %v", modelPath) - } - - projEmbedSize := int(C.mllama_n_embd(c)) - modelEmbedSize := llamaContext.Model().NEmbd() - if projEmbedSize != modelEmbedSize { - return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize) - } - - return &MllamaContext{c: c}, nil -} - -func (m *MllamaContext) Free() { - C.mllama_free(m.c) -} - -func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) { - img := C.mllama_image_init() - defer C.mllama_image_free(img) - - ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)) - if !ok { - return nil, errors.New("unable to load mllama image data") - } - - rows := make([]float32, m.EmbedSize(llamaContext)) - ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))) - if !ok { - return nil, errors.New("unable to make mllama embedding from image") - } - - embed := make([][]float32, 1) - embed[0] = rows - - return embed, nil -} - -func (m *MllamaContext) EmbedSize(llamaContext *Context) int { - numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c)) - numEmbed := llamaContext.Model().NEmbd() - - return numTokens * numEmbed -} - -func (c *Context) SetCrossAttention(state bool) { - C.llama_set_cross_attention(c.c, C.bool(state)) -} - func (c *Context) Synchronize() { C.llama_synchronize(c.c) } diff --git a/llama/mllama.cpp b/llama/mllama.cpp deleted file mode 100644 index 1ba8f5bef..000000000 --- a/llama/mllama.cpp +++ /dev/null @@ -1,887 +0,0 @@ -// NOTE: This is modified from clip.cpp for Mllama only 
-#include "mllama.h" - -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "ggml-cpu.h" -#include "ggml.h" -#include "gguf.h" - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define REQUIRE(x) \ - do { \ - if (!(x)) { \ - throw std::runtime_error("REQUIRE failed: " #x); \ - } \ - } while (0) - -#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__) - -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX - #define NOMINMAX -#endif -#include -#if __GLIBCXX__ -#include -#include -#include -#endif -#endif - -struct mllama_image { - int width; - int height; - - int num_channels = 3; - int num_tiles = 4; - - int aspect_ratio_id; - - std::vector data; -}; - -static std::string format(const char *fmt, ...) { - va_list args; - va_start(args, fmt); - std::vector b(128); - int n = vsnprintf(b.data(), b.size(), fmt, args); - REQUIRE(n >= 0 && n < b.size()); - va_end(args); - return std::string(b.data(), b.size()); -} - -// -// utilities to get data from a gguf file -// - -static int get_key_index(const gguf_context *ctx, const char *key) { - int key_index = gguf_find_key(ctx, key); - REQUIRE(key_index != -1); - return key_index; -} - -static std::vector get_u32_array(const gguf_context *ctx, const std::string &key) { - const int i = get_key_index(ctx, key.c_str()); - const int n = gguf_get_arr_n(ctx, i); - const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i); - - std::vector s(n); - for (size_t j = 0; j < s.size(); j++) { - s[j] = data[j]; - } - - return s; -} - -static uint32_t get_u32(const gguf_context *ctx, const std::string &key) { - return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str())); -} - -static float get_f32(const gguf_context *ctx, const std::string &key) { - return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str())); -} - -static std::string get_ftype(int ftype) { - return ggml_type_name(static_cast(ftype)); -} - -// -// mllama layers -// - -struct mllama_hparams { - uint32_t image_size; - uint32_t patch_size; - uint32_t hidden_size; - uint32_t n_intermediate; - uint32_t projection_dim; - uint32_t n_head; - uint32_t n_layer; - uint32_t n_global_layer; - uint32_t n_tiles; - - float eps; - - std::vector intermediate_layers; -}; - -struct mllama_layer { - // attention - struct ggml_tensor *k_w; - struct ggml_tensor *k_b; - struct ggml_tensor *q_w; - struct ggml_tensor *q_b; - struct ggml_tensor *v_w; - struct ggml_tensor *v_b; - - struct ggml_tensor *o_w; - struct ggml_tensor *o_b; - - struct ggml_tensor *attn_gate; - - // layernorm 1 - struct ggml_tensor *ln_1_w; - struct ggml_tensor *ln_1_b; - - // ff - struct ggml_tensor *ff_i_w; - struct ggml_tensor *ff_i_b; - - struct ggml_tensor *ff_o_w; - struct ggml_tensor *ff_o_b; - - struct ggml_tensor *ff_gate; - - // layernorm 2 - struct ggml_tensor *ln_2_w; - struct ggml_tensor *ln_2_b; -}; - -struct mllama_vision_model { - struct mllama_hparams hparams; - - // embeddings - struct ggml_tensor *class_embedding; - struct ggml_tensor *patch_embeddings; - struct ggml_tensor *position_embeddings; - struct ggml_tensor *position_embeddings_gate; - struct ggml_tensor *tile_position_embeddings; - struct ggml_tensor *tile_position_embeddings_gate; - struct ggml_tensor *pre_tile_position_embeddings; - struct ggml_tensor 
*pre_tile_position_embeddings_gate; - struct ggml_tensor *post_tile_position_embeddings; - struct ggml_tensor *post_tile_position_embeddings_gate; - - struct ggml_tensor *pre_ln_w; - struct ggml_tensor *pre_ln_b; - - std::vector layers; - std::vector global_layers; - - struct ggml_tensor *post_ln_w; - struct ggml_tensor *post_ln_b; - - struct ggml_tensor *mm_0_w; - struct ggml_tensor *mm_0_b; -}; - -struct mllama_ctx { - struct mllama_vision_model vision_model; - - uint32_t ftype = 1; - - struct gguf_context *ctx_gguf; - struct ggml_context *ctx_data; - - std::vector buf_compute_meta; - - // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = nullptr; - - ggml_backend_t backend = nullptr; - ggml_gallocr_t compute_alloc = nullptr; -}; - -static ggml_tensor *mllama_image_build_encoder_layer( - struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings, - const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) { - struct ggml_tensor *cur = embeddings; - - { - // layernorm1 - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b); - ggml_set_name(cur, format("%d pre layernorm", il).c_str()); - } - - { - // self-attention - struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur); - if (layer.q_b != nullptr) { - Q = ggml_add(ctx0, Q, layer.q_b); - } - - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - ggml_set_name(Q, format("%d query", il).c_str()); - - struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur); - if (layer.k_b != nullptr) { - K = ggml_add(ctx0, K, layer.k_b); - } - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - ggml_set_name(K, format("%d key", il).c_str()); - - struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur); - if (layer.v_b != nullptr) { - V = ggml_add(ctx0, V, layer.v_b); - } - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - ggml_set_name(V, format("%d value", il).c_str()); - - struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head)); - KQ = ggml_soft_max_inplace(ctx0, KQ); - ggml_set_name(KQ, format("%d KQ", il).c_str()); - - struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size); - ggml_set_name(KQV, format("%d KQV", il).c_str()); - - cur = ggml_mul_mat(ctx0, layer.o_w, KQV); - if (layer.o_b != nullptr) { - cur = ggml_add(ctx0, cur, layer.o_b); - } - ggml_set_name(cur, format("%d self attention", il).c_str()); - - if (layer.attn_gate != nullptr) { - cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate); - ggml_set_name(cur, format("%d self attention gate", il).c_str()); - } - } - - cur = ggml_add(ctx0, cur, embeddings); - ggml_set_name(cur, format("%d residual", il).c_str()); - - embeddings = cur; - - { - // layernorm2 - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b); - ggml_set_name(cur, format("%d post layernorm", il).c_str()); - } - - { - // feed forward - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b); - cur = 
ggml_gelu_inplace(ctx0, cur); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b); - ggml_set_name(cur, format("%d feed forward", il).c_str()); - - if (layer.ff_gate != nullptr) { - cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate); - ggml_set_name(cur, format("%d feed forward gate", il).c_str()); - } - } - - // residual 2 - cur = ggml_add(ctx0, cur, embeddings); - ggml_set_name(cur, format("%d residual", il).c_str()); - - embeddings = cur; - - return embeddings; -} - -static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) { - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - const int image_size_width = image_size; - const int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - - const int batch_size = imgs->size; - REQUIRE(batch_size == 1); - - int num_tiles = 4; - int num_channels = 3; - if (imgs->data != nullptr) { - num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles; - num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels; - } - - struct ggml_init_params params = { - ctx->buf_compute_meta.size(), // mem_size - ctx->buf_compute_meta.data(), // mem_buffer - true, // no_alloc - }; - - struct ggml_context *ctx0 = ggml_init(params); - struct ggml_cgraph *gf = ggml_new_graph(ctx0); - - struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size); - ggml_set_name(aspect_ratios, "aspect_ratios"); - ggml_set_input(aspect_ratios); - - if (model.pre_tile_position_embeddings != nullptr) { - struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios); - ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings"); - - pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles); - if (model.pre_tile_position_embeddings_gate != nullptr) { - pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate); - } - - inp = ggml_add(ctx0, inp, pre_tile_position_embeddings); - } - - struct ggml_tensor *embeddings = inp; - - if (model.class_embedding != nullptr) { - // concat class_embeddings and patch_embeddings - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - for (int i = 0; i < num_tiles; ++i) { - // repeat class embeddings for each tile - embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]); - } - - 
embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } - - struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); - if (model.position_embeddings_gate != nullptr) { - position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, position_embd); - - if (model.tile_position_embeddings != nullptr) { - struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios); - ggml_set_name(tile_position_embeddings, "tile_position_embeddings"); - - tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles); - if (model.tile_position_embeddings_gate != nullptr) { - tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings); - } - - // pre-layernorm - if (model.pre_ln_w != nullptr) { - embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w); - if (model.pre_ln_b != nullptr) { - embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b); - } - - ggml_set_name(embeddings, "pre layernorm"); - } - - const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8; - - embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0); - embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0); - - std::vector intermediate_embeddings; - - // encoder - for (size_t il = 0; il < model.layers.size(); il++) { - if (hparams.intermediate_layers[il]) { - intermediate_embeddings.push_back(embeddings); - } - - embeddings = mllama_image_build_encoder_layer( - ctx0, il, model.layers[il], embeddings, - hparams.eps, hidden_size, batch_size, n_head, d_head); - } - - // post-layernorm - if (model.post_ln_w != nullptr) { - embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w); - if (model.post_ln_b != nullptr) { - embeddings = ggml_add(ctx0, embeddings, model.post_ln_b); - } - - ggml_set_name(embeddings, "post layernorm"); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles); - - if (model.post_tile_position_embeddings != nullptr) { - struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios); - ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings"); - - post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles); - if (model.post_tile_position_embeddings_gate != nullptr) { - post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1); - - // global encoder - for (size_t il = 0; il < model.global_layers.size(); il++) { - embeddings = mllama_image_build_encoder_layer( - ctx0, il, model.global_layers[il], 
embeddings, - hparams.eps, hidden_size, batch_size, n_head, d_head); - } - - struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles); - for (size_t i = 0; i < intermediate_embeddings.size(); ++i) { - stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0); - } - - stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size); - stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0); - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles); - embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0); - embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0); - - // mllama projector - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b); - ggml_set_name(embeddings, "multi modal projector"); - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) { - struct ggml_tensor *cur = ggml_get_tensor(ctx, name); - REQUIRE(cur != nullptr || optional); - return cur; -} - -static std::vector mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) { - std::vector layers(n); - for (size_t i = 0; i < layers.size(); i++) { - auto &layer = layers[i]; - layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false); - layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false); - layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false); - layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false); - - layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false); - layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true); - layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false); - layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true); - layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false); - layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true); - layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false); - layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true); - - layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false); - layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false); - layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false); - layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false); - - layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true); - layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true); - } - - return layers; -} - -// read 
and create ggml_context containing the tensors and their data -struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) { - struct ggml_context *meta = nullptr; - - struct gguf_init_params params = { - true, // no_alloc - &meta, // ctx - }; - - struct gguf_context *ctx = gguf_init_from_file(fname, params); - REQUIRE(ctx != nullptr); - - if (verbosity >= 1) { - const int n_tensors = gguf_get_n_tensors(ctx); - const int n_kv = gguf_get_n_kv(ctx); - const std::string ftype = get_ftype(get_u32(ctx, "general.file_type")); - const int idx_desc = get_key_index(ctx, "general.description"); - const std::string description = gguf_get_val_str(ctx, idx_desc); - const int idx_name = gguf_find_key(ctx, "general.name"); - if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug - const std::string name = gguf_get_val_str(ctx, idx_name); - LOG("model name: %s", name.c_str()); - } - LOG("description: %s", description.c_str()); - LOG("GGUF version: %d", gguf_get_version(ctx)); - LOG("alignment: %zu", gguf_get_alignment(ctx)); - LOG("n_tensors: %d", n_tensors); - LOG("n_kv: %d", n_kv); - LOG("ftype: %s", ftype.c_str()); - LOG(""); - } - const int n_tensors = gguf_get_n_tensors(ctx); - - mllama_ctx *new_mllama = new mllama_ctx{}; - - ggml_backend_t backend = ggml_backend_init_best(); - if (backend == nullptr) { - LOG("%s: failed to initialize backend\n", __func__); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend)); - new_mllama->backend = backend; - - // load tensors - { - std::vector read_buf; - struct ggml_init_params params = { - (n_tensors + 1) * ggml_tensor_overhead(), // mem_size - nullptr, // mem_buffer - true, // no_alloc - }; - - new_mllama->ctx_data = ggml_init(params); - if (!new_mllama->ctx_data) { - LOG("ggml_init() failed"); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - -#ifdef _WIN32 - int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); - if (!wlen) { - return NULL; - } - wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); - wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen); - if (!wlen) { - free(wbuf); - return NULL; - } -#if __GLIBCXX__ - int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY); - __gnu_cxx::stdio_filebuf buffer(fd, std::ios_base::in); - std::istream fin(&buffer); -#else // MSVC - // unused in our current build - auto fin = std::ifstream(wbuf, std::ios::binary); -#endif - free(wbuf); -#else - auto fin = std::ifstream(fname, std::ios::binary); -#endif - if (!fin) { - LOG("cannot open model file for loading tensors\n"); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - - // add tensors to context - for (int i = 0; i < n_tensors; ++i) { - const char *name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor *t = ggml_get_tensor(meta, name); - struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t); - ggml_set_name(cur, name); - } - - // alloc memory and offload data - new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend); - for (int i = 0; i < n_tensors; ++i) { - const char *name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name); - const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); - fin.seekg(offset, std::ios::beg); - if (!fin) { - LOG("failed to seek for tensor %s\n", name); - mllama_free(new_mllama); - 
gguf_free(ctx); - return nullptr; - } - int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - } - -#if defined(_WIN32) && defined(__GLIBCXX__) - close(fd); -#else - fin.close(); -#endif - } - - // vision model - // load vision model - auto &vision_model = new_mllama->vision_model; - auto &hparams = vision_model.hparams; - hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length"); - hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count"); - hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length"); - hparams.n_layer = get_u32(ctx, "mllama.vision.block_count"); - hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count"); - hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles"); - hparams.image_size = get_u32(ctx, "mllama.vision.image_size"); - hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size"); - hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim"); - hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon"); - - std::vector intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices"); - hparams.intermediate_layers.resize(hparams.n_layer); - for (size_t i = 0; i < intermediate_layers_indices.size(); i++) { - hparams.intermediate_layers[intermediate_layers_indices[i]] = true; - } - - if (verbosity >= 2) { - LOG(""); - LOG("vision model hparams"); - LOG("image_size %d", hparams.image_size); - LOG("patch_size %d", hparams.patch_size); - LOG("v_hidden_size %d", hparams.hidden_size); - LOG("v_n_intermediate %d", hparams.n_intermediate); - LOG("v_projection_dim %d", hparams.projection_dim); - LOG("v_n_head %d", hparams.n_head); - LOG("v_n_layer %d", hparams.n_layer); - LOG("v_n_global_layer %d", hparams.n_global_layer); - LOG("v_eps %f", hparams.eps); - } - - vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true); - vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true); - - vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true); - vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true); - - vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true); - vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true); - vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true); - vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true); - - vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true); - vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true); - - vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true); - vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true); - - 
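The hyperparameters read above come straight from GGUF key/value metadata. Outside of this loader, the same values can be fetched with the plain gguf API; a minimal standalone sketch, where "model.gguf" is a placeholder path and only two of the keys are shown:

    #include "ggml.h"
    #include "gguf.h"   // in older ggml trees the gguf API is declared in ggml.h
    #include <cstdio>

    int main() {
        struct ggml_context * meta = nullptr;
        struct gguf_init_params params = {
            /*.no_alloc =*/ true,   // metadata only, no tensor data
            /*.ctx      =*/ &meta,
        };

        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (ctx == nullptr) {
            return 1;
        }

        const int64_t i_embd   = gguf_find_key(ctx, "mllama.vision.embedding_length");
        const int64_t i_layers = gguf_find_key(ctx, "mllama.vision.block_count");
        if (i_embd >= 0 && i_layers >= 0) {
            std::printf("hidden_size=%u n_layer=%u\n",
                        gguf_get_val_u32(ctx, i_embd),
                        gguf_get_val_u32(ctx, i_layers));
        }

        gguf_free(ctx);
        ggml_free(meta);
        return 0;
    }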
vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true); - vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true); - - vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false); - vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false); - - vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer); - vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer); - - ggml_free(meta); - - new_mllama->ctx_gguf = ctx; - - { - // measure mem requirement and allocate - new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend)); - struct mllama_image_batch batch; - batch.size = 1; - ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch); - ggml_gallocr_reserve(new_mllama->compute_alloc, gf); - size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0); - LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0); - } - - return new_mllama; -} - -struct mllama_image *mllama_image_init() { - return new mllama_image(); -} - -void mllama_image_free(struct mllama_image *img) { delete img; } -void mllama_image_batch_free(struct mllama_image_batch *batch) { - if (batch->size > 0) { - delete[] batch->data; - batch->size = 0; - } -} - -bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) { - img->width = width; - img->height = height; - img->num_channels = num_channels; - img->num_tiles = num_tiles; - img->aspect_ratio_id = aspect_ratio_id; - img->data.resize(n); - - memcpy(img->data.data(), data, n); - return true; -} - -inline int mllama(int x, int lower, int upper) { - return std::max(lower, std::min(x, upper)); -} - -void mllama_free(mllama_ctx *ctx) { - ggml_free(ctx->ctx_data); - gguf_free(ctx->ctx_gguf); - - ggml_backend_buffer_free(ctx->params_buffer); - ggml_backend_free(ctx->backend); - ggml_gallocr_free(ctx->compute_alloc); - delete ctx; -} - -bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) { - mllama_image_batch imgs{}; - imgs.size = 1; - imgs.data = img; - return mllama_image_batch_encode(ctx, n_threads, &imgs, vec); -} - -bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) { - int batch_size = imgs->size; - REQUIRE(batch_size == 1); - - // build the inference graph - ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs); - ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); - - // set inputs - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding == nullptr ? 
0 : 1); - - { - struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw)); - } - - { - struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings"); - if (embeddings != nullptr) { - void *zeros = malloc(ggml_nbytes(embeddings)); - memset(zeros, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings)); - free(zeros); - } - } - - { - struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions"); - if (positions != nullptr) { - int *positions_data = (int *)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - } - - { - struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios"); - if (aspect_ratios != nullptr) { - int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios)); - aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id; - ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios)); - free(aspect_ratios_data); - } - } - - if (ggml_backend_is_cpu(ctx->backend)) { - ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); - } - - ggml_backend_graph_compute(ctx->backend, gf); - - // the last node is the embedding tensor - struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1); - - // copy the embeddings to the location passed by the user - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - - return true; -} - -int32_t mllama_image_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.image_size; -} - -int32_t mllama_patch_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.patch_size; -} - -int32_t mllama_hidden_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.hidden_size; -} - -int mllama_n_patches(const struct mllama_ctx *ctx) { - const auto &hparams = ctx->vision_model.hparams; - return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size); -} - -int mllama_n_positions(const struct mllama_ctx *ctx) { - return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 
0 : 1); -} - -int mllama_n_tiles(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.n_tiles; -} - -int mllama_n_embd(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.projection_dim; -} - -size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) { - return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float); -} diff --git a/llama/mllama.h b/llama/mllama.h deleted file mode 100644 index 446dbb9ec..000000000 --- a/llama/mllama.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef MLLAMA_H -#define MLLAMA_H - -#include -#include - -#ifdef LLAMA_SHARED -#if defined(_WIN32) && !defined(__MINGW32__) -#ifdef LLAMA_BUILD -#define MLLAMA_API __declspec(dllexport) -#else -#define MLLAMA_API __declspec(dllimport) -#endif -#else -#define MLLAMA_API __attribute__((visibility("default"))) -#endif -#else -#define MLLAMA_API -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -struct mllama_ctx; - -struct mllama_image_batch { - struct mllama_image *data; - size_t size; -}; - -MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity); -MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity); - -MLLAMA_API void mllama_free(struct mllama_ctx *ctx); - -MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx); -MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx); -MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx); - -MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx); -MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx); - -MLLAMA_API struct mllama_image *mllama_image_init(); - -MLLAMA_API void mllama_image_free(struct mllama_image *img); -MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch); - -MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img); - -MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec); -MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec); - -#ifdef __cplusplus -} -#endif - -#endif // MLLAMA_H diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index c630f2439..deb53c225 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644 + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); ++ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); diff --git a/llama/patches/0006-add-mllama-support.patch b/llama/patches/0006-add-mllama-support.patch deleted file mode 100644 index 05f85ec34..000000000 --- a/llama/patches/0006-add-mllama-support.patch +++ /dev/null @@ -1,1027 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Sun, 20 Apr 2025 16:12:36 -0700 -Subject: [PATCH] add mllama support - -adds support for the llama 3.2 vision 
architecture ---- - ggml/src/ggml-backend-reg.cpp | 6 +- - include/llama.h | 6 + - src/llama-arch.cpp | 44 +++++ - src/llama-arch.h | 10 ++ - src/llama-batch.cpp | 3 + - src/llama-context.cpp | 23 ++- - src/llama-context.h | 1 + - src/llama-cparams.h | 1 + - src/llama-graph.cpp | 25 +++ - src/llama-graph.h | 12 ++ - src/llama-hparams.cpp | 4 + - src/llama-hparams.h | 7 + - src/llama-kv-cache.cpp | 14 +- - src/llama-model-loader.cpp | 2 + - src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++- - src/llama-model.h | 12 ++ - src/llama-quant.cpp | 4 +- - tools/mtmd/llava.cpp | 5 +- - tools/mtmd/mtmd-helper.cpp | 7 +- - 19 files changed, 475 insertions(+), 22 deletions(-) - -diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 405d8e31..82ae1b5b 100644 ---- a/ggml/src/ggml-backend-reg.cpp -+++ b/ggml/src/ggml-backend-reg.cpp -@@ -178,9 +178,9 @@ struct ggml_backend_registry { - #ifdef GGML_USE_CANN - register_backend(ggml_backend_cann_reg()); - #endif --#ifdef GGML_USE_BLAS -- register_backend(ggml_backend_blas_reg()); --#endif -+// #ifdef GGML_USE_BLAS -+// register_backend(ggml_backend_blas_reg()); -+// #endif - #ifdef GGML_USE_RPC - register_backend(ggml_backend_rpc_reg()); - #endif -diff --git a/include/llama.h b/include/llama.h -index abedebdb..41beef21 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -258,6 +258,7 @@ extern "C" { - - llama_token * token; - float * embd; -+ int32_t n_embd; - llama_pos * pos; - int32_t * n_seq_id; - llama_seq_id ** seq_id; -@@ -365,6 +366,7 @@ extern "C" { - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings - bool op_offload; // whether to offload host tensor operations to device -+ bool cross_attn; // whether to use cross attention - }; - - // model quantization parameters -@@ -464,6 +466,10 @@ extern "C" { - struct llama_context_params params), - "use llama_init_from_model instead"); - -+ // TODO (jmorganca): this should most likely be passed in as part of a batch -+ // and not set on the context for all batches. 
-+ LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state); -+ - // Frees all allocated memory - LLAMA_API void llama_free(struct llama_context * ctx); - -diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index 5ab3f572..eb7b5325 100644 ---- a/src/llama-arch.cpp -+++ b/src/llama-arch.cpp -@@ -6,6 +6,7 @@ - - static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, -+ { LLM_ARCH_MLLAMA, "mllama" }, - { LLM_ARCH_LLAMA4, "llama4" }, - { LLM_ARCH_DECI, "deci" }, - { LLM_ARCH_FALCON, "falcon" }, -@@ -144,6 +145,7 @@ static const std::map LLM_KV_NAMES = { - { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, - { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, - { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, -+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, - { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, - { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, - -@@ -273,6 +275,40 @@ static const std::map> LLM_TENSOR_N - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, -+ { -+ LLM_ARCH_MLLAMA, -+ { -+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, -+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, -+ { LLM_TENSOR_OUTPUT, "output" }, -+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, -+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, -+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, -+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, -+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, -+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, -+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, -+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, -+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, -+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, -+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, -+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, -+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, -+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, -+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, -+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, -+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, -+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, -+ { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" }, -+ { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" }, -+ { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" }, -+ { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" }, -+ }, -+ }, - { - LLM_ARCH_DECI, - { -@@ -1701,6 +1737,14 @@ static const std::map LLM_TENSOR_INFOS = { - // this tensor is loaded for T5, but never used - {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, - {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ 
{LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, - {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -diff --git a/src/llama-arch.h b/src/llama-arch.h -index 525c1b7d..bc8a4f0b 100644 ---- a/src/llama-arch.h -+++ b/src/llama-arch.h -@@ -11,6 +11,7 @@ - enum llm_arch { - LLM_ARCH_LLAMA, - LLM_ARCH_LLAMA4, -+ LLM_ARCH_MLLAMA, - LLM_ARCH_DECI, - LLM_ARCH_FALCON, - LLM_ARCH_BAICHUAN, -@@ -148,6 +149,7 @@ enum llm_kv { - LLM_KV_ATTENTION_SLIDING_WINDOW, - LLM_KV_ATTENTION_SCALE, - LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, -+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, - LLM_KV_ATTENTION_KEY_LENGTH_MLA, - LLM_KV_ATTENTION_VALUE_LENGTH_MLA, - -@@ -349,6 +351,14 @@ enum llm_tensor { - LLM_TENSOR_CLS, - LLM_TENSOR_CLS_OUT, - LLM_TENSOR_BSKCN_TV, -+ LLM_TENSOR_CROSS_ATTN_K_NORM, -+ LLM_TENSOR_CROSS_ATTN_K_PROJ, -+ LLM_TENSOR_CROSS_ATTN_O_PROJ, -+ LLM_TENSOR_CROSS_ATTN_Q_NORM, -+ LLM_TENSOR_CROSS_ATTN_Q_PROJ, -+ LLM_TENSOR_CROSS_ATTN_V_PROJ, -+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE, -+ LLM_TENSOR_CROSS_ATTN_MLP_GATE, - LLM_TENSOR_CONV1D, - LLM_TENSOR_CONVNEXT_DW, - LLM_TENSOR_CONVNEXT_NORM, -diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp -index a88b2fe3..241b316e 100644 ---- a/src/llama-batch.cpp -+++ b/src/llama-batch.cpp -@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one( - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens, - /*embd =*/ nullptr, -+ /*n_embd =*/ 0, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, -@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ - /*n_tokens =*/ 0, - /*tokens =*/ nullptr, - /*embd =*/ nullptr, -+ /*n_embd =*/ 0, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, -@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ - - if (embd) { - batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); -+ batch.n_embd = embd; - } else { - batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); - } -diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index dca22d8b..c22687e4 100644 ---- a/src/llama-context.cpp -+++ b/src/llama-context.cpp -@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) { - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } - -- return logits + j*model.vocab.n_tokens(); -+ return logits + j*model.hparams.n_vocab; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); - #ifndef NDEBUG -@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) { - cparams.warmup = value; - } - -+void llama_context::set_cross_attn(bool value) { -+ cparams.cross_attn = value; -+} -+ - void llama_context::set_adapter_lora( - llama_adapter_lora * adapter, - float scale) { -@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) { - - const int64_t n_embd = hparams.n_embd; - -- llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); -+ llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); - -@@ -863,10 +867,9 @@ int 
llama_context::decode(llama_batch & inp_batch) { - - const llama_batch & batch = batch_allocr.batch; - -- const auto & vocab = model.vocab; - const auto & hparams = model.hparams; - -- const int32_t n_vocab = vocab.n_tokens(); -+ const int32_t n_vocab = hparams.n_vocab; - - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; -@@ -1087,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) { - // make the outputs have the same order they had in the user-provided batch - // note: this is mostly relevant for recurrent models atm - if (!sorted_output) { -- const uint32_t n_vocab = model.vocab.n_tokens(); -+ const uint32_t n_vocab = model.hparams.n_vocab; - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); -@@ -1142,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) { - - int32_t llama_context::output_reserve(int32_t n_outputs) { - const auto & hparams = model.hparams; -- const auto & vocab = model.vocab; - - const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); - - const auto n_batch = cparams.n_batch; -- const auto n_vocab = vocab.n_tokens(); -+ const auto n_vocab = hparams.n_vocab; - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead -@@ -1682,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { - { - LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - -- const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); -+ const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab); - - io.write(&logits_size, sizeof(logits_size)); - -@@ -2091,6 +2093,7 @@ llama_context_params llama_context_default_params() { - /*.flash_attn =*/ false, - /*.no_perf =*/ true, - /*.op_offload =*/ true, -+ /*.cross_attn =*/ false, - }; - - return result; -@@ -2216,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { - ctx->set_warmup(warmup); - } - -+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { -+ ctx->set_cross_attn(cross_attention); -+} -+ - void llama_synchronize(llama_context * ctx) { - ctx->synchronize(); - } -diff --git a/src/llama-context.h b/src/llama-context.h -index c0ceacb1..c4ab242a 100644 ---- a/src/llama-context.h -+++ b/src/llama-context.h -@@ -71,6 +71,7 @@ struct llama_context { - void set_embeddings (bool value); - void set_causal_attn(bool value); - void set_warmup(bool value); -+ void set_cross_attn(bool value); - - void set_adapter_lora( - llama_adapter_lora * adapter, -diff --git a/src/llama-cparams.h b/src/llama-cparams.h -index 246fa577..7a6156ce 100644 ---- a/src/llama-cparams.h -+++ b/src/llama-cparams.h -@@ -31,6 +31,7 @@ struct llama_cparams { - bool no_perf; - bool warmup; - bool op_offload; -+ bool cross_attn; - - enum llama_pooling_type pooling_type; - -diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp -index b0e3f635..f14869cf 100644 ---- a/src/llama-graph.cpp -+++ b/src/llama-graph.cpp -@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { - } - } - -+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) { -+ if (ubatch->embd) { -+ ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state)); -+ } -+} -+ - // - // llm_graph_context - // -@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const 
{ - return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); - } - -+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const { -+ const int64_t n_embd = hparams.n_embd; -+ -+ auto inp = std::make_unique(); -+ -+ ggml_tensor * cur = nullptr; -+ -+ inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4); -+ ggml_set_input(inp->cross_attn_state); -+ -+ cur = inp->cross_attn_state; -+ -+ cb(cur, "inp_cross_attn_state", -1); -+ -+ res->add_input(std::move(inp)); -+ -+ return cur; -+} -+ - ggml_tensor * llm_graph_context::build_attn( - llm_graph_input_attn_cross * inp, - ggml_cgraph * gf, -diff --git a/src/llama-graph.h b/src/llama-graph.h -index 832a8c09..5a322785 100644 ---- a/src/llama-graph.h -+++ b/src/llama-graph.h -@@ -87,6 +87,7 @@ public: - - ggml_tensor * tokens = nullptr; // I32 [n_batch] - ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] -+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] - }; - - class llm_graph_input_pos : public llm_graph_input_i { -@@ -284,6 +285,16 @@ public: - const llama_cross * cross = nullptr; - }; - -+class llm_graph_input_cross_attn_state : public llm_graph_input_i { -+public: -+ llm_graph_input_cross_attn_state() = default; -+ virtual ~llm_graph_input_cross_attn_state() = default; -+ -+ void set_input(const llama_ubatch * ubatch) override; -+ -+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] -+}; -+ - // - // llm_graph_result - // -@@ -495,6 +506,7 @@ struct llm_graph_context { - ggml_tensor * build_inp_cls() const; - ggml_tensor * build_inp_s_copy() const; - ggml_tensor * build_inp_s_mask() const; -+ ggml_tensor * build_inp_cross_attn_state() const; - - ggml_tensor * build_inp_cross_embd() const; - ggml_tensor * build_inp_pos_bucket_enc() const; -diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp -index 8a667960..6a02de03 100644 ---- a/src/llama-hparams.cpp -+++ b/src/llama-hparams.cpp -@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const { - - GGML_ABORT("fatal error"); - } -+ -+bool llama_hparams::cross_attention_layers(uint32_t il) const { -+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); -+} -diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 48dce407..b6fc7e6d 100644 ---- a/src/llama-hparams.h -+++ b/src/llama-hparams.h -@@ -2,6 +2,8 @@ - - #include "llama.h" - -+#include -+ - #include - - // bump if necessary -@@ -42,6 +44,7 @@ struct llama_hparams { - uint32_t n_expert = 0; - uint32_t n_expert_used = 0; - uint32_t n_rel_attn_bkts = 0; -+ uint32_t n_vocab = 0; - - // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - uint32_t n_embd_head_k_mla = 0; -@@ -56,6 +59,7 @@ struct llama_hparams { - std::array n_ff_arr; - - std::array, 4> n_bskcn_arr = {}; -+ std::array cross_attn_layers; - - uint32_t n_layer_dense_lead = 0; - uint32_t n_lora_q = 0; -@@ -159,6 +163,9 @@ struct llama_hparams { - // Block skip connection - bool n_bskcn(uint32_t n, uint32_t il) const; - -+ // cross attention layers -+ bool cross_attention_layers(uint32_t il) const; -+ - bool is_swa(uint32_t il) const; - }; - -diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index 3dcad65b..a7b0a7eb 100644 ---- a/src/llama-kv-cache.cpp -+++ b/src/llama-kv-cache.cpp -@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified( - throw std::runtime_error("failed to create ggml context for kv cache"); - } - -- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); 
-- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); -+ ggml_tensor * k, *v; -+ -+ // for cross attention layers -+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { -+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i)); -+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i)); -+ } else { -+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); -+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); -+ } - ggml_format_name(k, "cache_k_l%d", i); - ggml_format_name(v, "cache_v_l%d", i); - k_l.push_back(k); -@@ -446,7 +454,7 @@ void llama_kv_cache_unified::set_full() { - llama_sbatch llama_kv_cache_unified::sbatch_init( - const llama_batch & batch, - bool logits_all) { -- return llama_sbatch(batch, hparams.n_embd, true, logits_all); -+ return llama_sbatch(batch, batch.n_embd, true, logits_all); - } - - llama_ubatch llama_kv_cache_unified::ubatch_next( -diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp -index 7f6617fa..2acfd4a8 100644 ---- a/src/llama-model-loader.cpp -+++ b/src/llama-model-loader.cpp -@@ -315,6 +315,8 @@ namespace GGUFMeta { - return true; - } - -+ template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array& result, bool required); -+ - template - bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { - const int kid = gguf_find_key(meta.get(), key.c_str()); -diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 831b68c0..e8298f56 100644 ---- a/src/llama-model.cpp -+++ b/src/llama-model.cpp -@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - - // get general kv - ml.get_key(LLM_KV_GENERAL_NAME, name, false); -+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); - - // everything past this point is not vocab-related - if (hparams.vocab_only) { -@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); -+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false); - - if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { - ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); -@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { - std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); - std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); - std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); -+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); - - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); -+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); - - // n_head_kv is optional, default to n_head - hparams.n_head_kv_arr = hparams.n_head_arr; -@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - - ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - -- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { -+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { - if (hparams.n_rot != 
hparams.n_embd_head_k) { - throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); - } -@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { - hparams.use_kq_norm = false; - } - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -+ -+ switch (hparams.n_layer) { -+ case 40: type = LLM_TYPE_11B; break; -+ case 100: type = LLM_TYPE_90B; break; -+ default: type = LLM_TYPE_UNKNOWN; -+ } -+ } break; - case LLM_ARCH_DECI: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_ff = hparams.n_ff(); - const int64_t n_embd_gqa = n_embd_v_gqa; -- const int64_t n_vocab = vocab.n_tokens(); -+ const int64_t n_vocab = hparams.n_vocab; - const int64_t n_token_types = vocab.n_token_types(); - const int64_t n_rot = hparams.n_rot; - const int64_t n_expert = hparams.n_expert; -@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { - } - } - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); -+ -+ // output -+ { -+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); -+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); -+ -+ // if output is NULL, init from the input tok embed -+ if (output == NULL) { -+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); -+ } -+ } -+ -+ for (int i = 0; i < n_layer; ++i) { -+ auto & layer = layers[i]; -+ -+ if (hparams.cross_attention_layers(i)) { -+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); -+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0); -+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0); -+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0); -+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0); -+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0); -+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0); -+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0); -+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); -+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); -+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); -+ } else { -+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); -+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); -+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); -+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); -+ layer.wo = 
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); -+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); -+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); -+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); -+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -+ } -+ } -+ } break; - case LLM_ARCH_DECI: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); -@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context { - } - }; - -+struct llm_build_mllama: public llm_graph_context { -+ llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { -+ // mutable variable, needed during the last layer of the computation to skip unused tokens -+ int32_t n_tokens = this->n_tokens; -+ -+ const int64_t n_embd_head = hparams.n_embd_head_v; -+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); -+ GGML_ASSERT(n_embd_head == hparams.n_rot); -+ -+ ggml_tensor * cur; -+ ggml_tensor * inpL; -+ ggml_tensor * inpCAS; -+ -+ inpL = build_inp_embd(model.tok_embd); -+ inpCAS = build_inp_cross_attn_state(); -+ -+ // inp_pos - contains the positions -+ ggml_tensor * inp_pos = build_inp_pos(); -+ -+ auto * inp_attn = build_attn_inp_kv_unified(); -+ const llama_kv_cache_unified * kv_self = static_cast(memory); -+ -+ for (int il = 0; il < n_layer; ++il) { -+ ggml_tensor * inpSA = inpL; -+ -+ // norm -+ cur = build_norm(inpL, -+ model.layers[il].attn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "attn_norm", il); -+ -+ if (hparams.cross_attention_layers(il)) { -+ if (!ubatch.embd && !cparams.cross_attn) { -+ continue; -+ } -+ -+ // cross attention layer -+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il); -+ cb(Qcur, "Qcur", il); -+ -+ ggml_tensor * Kcur, * Vcur; -+ if (ubatch.embd) { -+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il); -+ cb(Kcur, "Kcur", il); -+ -+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il])); -+ -+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); -+ cb(Vcur, "Vcur", il); -+ -+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404); -+ cb(Vcur, "Vcur", il); -+ -+ Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); -+ cb(Vcur, "Vcur", il); -+ -+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il])); -+ } else { -+ Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]); -+ cb(Kcur, "Kcur (view)", il); -+ -+ Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]); -+ cb(Vcur, "Vcur (view)", il); -+ } -+ -+ struct 
ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur); -+ cb(kq, "kq", il); -+ -+ // TODO: apply causal masks -+ struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); -+ cb(kq_soft_max, "kq_soft_max", il); -+ -+ Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur)); -+ cb(Vcur, "Vcur", il); -+ -+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max); -+ cb(kqv, "kqv", il); -+ -+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); -+ cb(kqv_merged, "kqv_merged", il); -+ -+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); -+ cb(cur, "kqv_merged_cont", il); -+ -+ cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur); -+ cb(cur, "cur", il); -+ -+ // TODO: do this in place once? -+ cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate)); -+ -+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); -+ cb(ffn_inp, "ffn_inp", il); -+ -+ // feed-forward network -+ cur = build_norm(ffn_inp, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "ffn_norm", il); -+ -+ cur = build_ffn(cur, -+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, -+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, -+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, -+ NULL, -+ LLM_FFN_SILU, LLM_FFN_PAR, il); -+ cb(cur, "ffn_out", il); -+ -+ // TODO: do this inplace once? -+ cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); -+ cb(cur, "ffn_out", il); -+ -+ cur = build_cvec(cur, il); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } else { -+ // self attention layer -+ -+ // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); -+ -+ // compute Q and K and RoPE them -+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); -+ cb(Qcur, "Qcur", il); -+ if (model.layers[il].bq) { -+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); -+ cb(Qcur, "Qcur", il); -+ } -+ -+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); -+ cb(Kcur, "Kcur", il); -+ if (model.layers[il].bk) { -+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); -+ cb(Kcur, "Kcur", il); -+ } -+ -+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); -+ cb(Vcur, "Vcur", il); -+ if (model.layers[il].bv) { -+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); -+ cb(Vcur, "Vcur", il); -+ } -+ -+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); -+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); -+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); -+ -+ Qcur = ggml_rope_ext( -+ ctx0, Qcur, inp_pos, rope_factors, -+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow -+ ); -+ -+ Kcur = ggml_rope_ext( -+ ctx0, Kcur, inp_pos, rope_factors, -+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow -+ ); -+ -+ cb(Qcur, "Qcur", il); -+ cb(Kcur, "Kcur", il); -+ cb(Vcur, "Vcur", il); -+ -+ cur = build_attn(inp_attn, gf, -+ model.layers[il].wo, model.layers[il].bo, -+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); -+ -+ if (il == n_layer - 1) { -+ // skip computing output for unused tokens -+ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); -+ n_tokens = n_outputs; -+ cur 
= ggml_get_rows(ctx0, cur, inp_out_ids); -+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); -+ } -+ -+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); -+ cb(ffn_inp, "ffn_inp", il); -+ -+ // feed-forward network -+ cur = build_norm(ffn_inp, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "ffn_norm", il); -+ -+ cur = build_ffn(cur, -+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, -+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, -+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, -+ NULL, -+ LLM_FFN_SILU, LLM_FFN_PAR, il); -+ cb(cur, "ffn_out", il); -+ -+ cur = ggml_add(ctx0, cur, ffn_inp); -+ cb(cur, "ffn_out", il); -+ -+ cur = build_cvec(cur, il); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } -+ } -+ -+ cur = inpL; -+ -+ cur = build_norm(cur, -+ model.output_norm, NULL, -+ LLM_NORM_RMS, -1); -+ cb(cur, "result_norm", -1); -+ res->t_embd = cur; -+ -+ // lm_head -+ cur = build_lora_mm(model.output, cur); -+ -+ cb(cur, "result_output", -1); -+ res->t_logits = cur; -+ -+ ggml_build_forward_expand(gf, cur); -+ } -+}; -+ - struct llm_build_deci : public llm_graph_context { - llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; -@@ -12496,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context { - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models -- ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); -+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); - - // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); -@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph( - { - llm = std::make_unique(*this, params, gf); - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ llm = std::make_unique(*this, params, gf); -+ } break; - case LLM_ARCH_DECI: - { - llm = std::make_unique(*this, params, gf); -@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { - // use what we call a normal RoPE, operating on pairs of consecutive head values - case LLM_ARCH_LLAMA: - case LLM_ARCH_LLAMA4: -+ case LLM_ARCH_MLLAMA: - case LLM_ARCH_DECI: - case LLM_ARCH_BAICHUAN: - case LLM_ARCH_STARCODER: -diff --git a/src/llama-model.h b/src/llama-model.h -index 43746c7d..9281e629 100644 ---- a/src/llama-model.h -+++ b/src/llama-model.h -@@ -11,6 +11,7 @@ - #include - #include - #include -+#include - - struct llama_cparams; - struct llama_ubatch; -@@ -74,6 +75,7 @@ enum llm_type { - LLM_TYPE_40B, - LLM_TYPE_65B, - LLM_TYPE_70B, -+ LLM_TYPE_90B, - LLM_TYPE_236B, - LLM_TYPE_290B, - LLM_TYPE_314B, -@@ -318,6 +320,16 @@ struct llama_layer { - - struct ggml_tensor * bskcn_tv = nullptr; - -+ // cross attention -+ struct ggml_tensor * cross_attn_k_norm = nullptr; -+ struct ggml_tensor * cross_attn_k_proj = nullptr; -+ struct ggml_tensor * cross_attn_o_proj = nullptr; -+ struct ggml_tensor * cross_attn_q_norm = nullptr; -+ struct ggml_tensor * cross_attn_q_proj = nullptr; -+ struct ggml_tensor * cross_attn_v_proj = nullptr; -+ struct ggml_tensor * cross_attn_attn_gate = nullptr; -+ struct ggml_tensor * cross_attn_mlp_gate = nullptr; -+ - struct llama_layer_posnet posnet; - - struct llama_layer_convnext convnext; -diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp -index 820d5128..56531980 100644 ---- 
a/src/llama-quant.cpp -+++ b/src/llama-quant.cpp -@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: - if (llama_model_has_encoder(&model)) { - n_attn_layer *= 3; - } -- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); -+ if (qs.n_attention_wv != n_attn_layer) { -+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv); -+ } - } - - size_t total_size_org = 0; -diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp -index ebef8b3c..b0eb79bb 100644 ---- a/tools/mtmd/llava.cpp -+++ b/tools/mtmd/llava.cpp -@@ -462,7 +462,7 @@ struct llava_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -474,6 +474,7 @@ struct llava_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ - n_eval = n_batch; - } - float * embd = image_embed->embed+i*n_embd; -- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); -+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); - if (llama_decode(ctx_llama, llava_batch.batch)) { - LOG_ERR("%s : failed to eval\n", __func__); - return false; -diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp -index 7a328867..61ebdd43 100644 ---- a/tools/mtmd/mtmd-helper.cpp -+++ b/tools/mtmd/mtmd-helper.cpp -@@ -58,7 +58,7 @@ struct decode_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { - pos .resize(n_tokens * n_pos_per_embd); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -69,6 +69,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -131,6 +132,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ batch.embd + offset * n_mmproj_embd, -+ /*n_embd =*/ batch.n_embd, - /*pos =*/ pos_ptr, - /*n_seq_id =*/ batch.n_seq_id + offset, - /*seq_id =*/ batch.seq_id + offset, -@@ -166,7 +168,8 @@ int32_t mtmd_helper_decode_image_chunk( - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); - int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; -- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd); -+ int n_embd = llama_model_n_embd(llama_get_model(lctx)); -+ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id); - - const int nx = mtmd_image_tokens_get_nx(image_tokens); - const int ny = mtmd_image_tokens_get_ny(image_tokens); diff --git 
a/llama/patches/0008-fix-deepseek-deseret-regex.patch b/llama/patches/0006-fix-deepseek-deseret-regex.patch similarity index 100% rename from llama/patches/0008-fix-deepseek-deseret-regex.patch rename to llama/patches/0006-fix-deepseek-deseret-regex.patch diff --git a/llama/patches/0007-add-unpad-operator.patch b/llama/patches/0007-add-unpad-operator.patch deleted file mode 100644 index fc45aeff4..000000000 --- a/llama/patches/0007-add-unpad-operator.patch +++ /dev/null @@ -1,419 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Sun, 13 Apr 2025 22:10:06 -0400 -Subject: [PATCH] add unpad operator - -adds the unpad operator to GGML ---- - ggml/include/ggml.h | 10 +++++ - ggml/src/ggml-cpu/ggml-cpu.c | 5 +++ - ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++ - ggml/src/ggml-cpu/ops.h | 1 + - ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++ - ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++ - ggml/src/ggml-cuda/pad.cuh | 1 + - ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++ - ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++ - ggml/src/ggml.c | 25 ++++++++++++- - 10 files changed, 223 insertions(+), 2 deletions(-) - -diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h -index e91dedf1..8dc107ba 100644 ---- a/ggml/include/ggml.h -+++ b/ggml/include/ggml.h -@@ -489,6 +489,7 @@ extern "C" { - GGML_OP_UPSCALE, // nearest interpolate - GGML_OP_PAD, - GGML_OP_PAD_REFLECT_1D, -+ GGML_OP_UNPAD, - GGML_OP_ARANGE, - GGML_OP_TIMESTEP_EMBEDDING, - GGML_OP_ARGSORT, -@@ -1781,6 +1782,15 @@ extern "C" { - int p0, - int p1); - -+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] -+ GGML_API struct ggml_tensor * ggml_unpad( -+ struct ggml_context * ctx, -+ struct ggml_tensor * a, -+ int p0, -+ int p1, -+ int p2, -+ int p3); -+ - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 - // timesteps: [N,] - // return: [N, dim] -diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index a30e67f2..835e6495 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu.c -+++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm - { - ggml_compute_forward_pad_reflect_1d(params, tensor); - } break; -+ case GGML_OP_UNPAD: -+ { -+ ggml_compute_forward_unpad(params, tensor); -+ } break; - case GGML_OP_ARANGE: - { - ggml_compute_forward_arange(params, tensor); -@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { - case GGML_OP_UPSCALE: - case GGML_OP_PAD: - case GGML_OP_PAD_REFLECT_1D: -+ case GGML_OP_UNPAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: -diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index 955fec59..1868a10c 100644 ---- a/ggml/src/ggml-cpu/ops.cpp -+++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d( - } - } - -+// ggml_compute_forward_unpad -+ -+static void ggml_compute_forward_unpad_f32( -+ const struct ggml_compute_params *params, -+ struct ggml_tensor *dst) { -+ -+ const struct ggml_tensor * src0 = dst->src[0]; -+ -+ GGML_ASSERT(src0->nb[0] == sizeof(float)); -+ GGML_ASSERT( dst->nb[0] == sizeof(float)); -+ -+ const int ith = params->ith; -+ const int nth = params->nth; -+ -+ GGML_TENSOR_UNARY_OP_LOCALS -+ -+ float * dst_ptr = (float *) dst->data; -+ -+ // TODO: optimize -+ -+ for (int64_t i2 = 0; i2 < ne2; ++i2) { -+ for (int64_t 
i1 = ith; i1 < ne1; i1 += nth) { -+ for (int64_t i0 = 0; i0 < ne0; ++i0) { -+ for (int64_t i3 = 0; i3 < ne3; ++i3) { -+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; -+ -+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); -+ -+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { -+ dst_ptr[dst_idx] = *src_ptr; -+ } -+ } -+ } -+ } -+ } -+} -+ -+void ggml_compute_forward_unpad( -+ const struct ggml_compute_params * params, -+ struct ggml_tensor * dst) { -+ -+ const struct ggml_tensor * src0 = dst->src[0]; -+ -+ switch (src0->type) { -+ case GGML_TYPE_F32: -+ { -+ ggml_compute_forward_unpad_f32(params, dst); -+ } break; -+ default: -+ { -+ GGML_ABORT("fatal error"); -+ } -+ } -+} -+ - // ggml_compute_forward_arange - - static void ggml_compute_forward_arange_f32( -diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h -index dc081b9e..a7125555 100644 ---- a/ggml/src/ggml-cpu/ops.h -+++ b/ggml/src/ggml-cpu/ops.h -@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params - void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); -+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cb0d8528..6fe86674 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg - case GGML_OP_PAD: - ggml_cuda_op_pad(ctx, dst); - break; -+ case GGML_OP_UNPAD: -+ ggml_cuda_op_unpad(ctx, dst); -+ break; - case GGML_OP_ARANGE: - ggml_cuda_op_arange(ctx, dst); - break; -@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g - case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; - case GGML_OP_PAD: -+ case GGML_OP_UNPAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_LEAKY_RELU: -diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu -index 77432b04..7d45a7e1 100644 ---- a/ggml/src/ggml-cuda/pad.cu -+++ b/ggml/src/ggml-cuda/pad.cu -@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); - } -+ -+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { -+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 -+ // blockIdx.y: idx of ne1 -+ // blockIDx.x: idx of ne0 / BLOCK_SIZE -+ int nidx = threadIdx.x + blockIdx.x * blockDim.x; -+ if (nidx >= ne0) { -+ return; -+ } -+ -+ // operation -+ int offset_dst = -+ nidx + -+ blockIdx.y * ne0 + -+ blockIdx.z * ne0 * gridDim.y; -+ if (nidx < 
ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { -+ int offset_src = -+ nidx + -+ blockIdx.y * ne00 + -+ blockIdx.z * ne00 * ne01; -+ dst[offset_dst] = x[offset_src]; -+ } -+} -+ -+static void unpad_f32_cuda(const float * x, float * dst, -+ const int ne00, const int ne01, const int ne02, const int ne03, -+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { -+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; -+ dim3 gridDim(num_blocks, ne1, ne2*ne3); -+ unpad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -+} -+ -+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { -+ const ggml_tensor * src0 = dst->src[0]; -+ const float * src0_d = (const float *)src0->data; -+ float * dst_d = (float *)dst->data; -+ cudaStream_t stream = ctx.stream(); -+ -+ GGML_ASSERT(src0->type == GGML_TYPE_F32); -+ GGML_ASSERT(dst->type == GGML_TYPE_F32); -+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors -+ -+ unpad_f32_cuda(src0_d, dst_d, -+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], -+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -+} -\ No newline at end of file -diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh -index 8fd386b0..e2ededc3 100644 ---- a/ggml/src/ggml-cuda/pad.cuh -+++ b/ggml/src/ggml-cuda/pad.cuh -@@ -3,3 +3,4 @@ - #define CUDA_PAD_BLOCK_SIZE 256 - - void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 1b56f858..7641247e 100644 ---- a/ggml/src/ggml-metal/ggml-metal.m -+++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte - GGML_METAL_KERNEL_TYPE_UPSCALE_F32, - GGML_METAL_KERNEL_TYPE_PAD_F32, - GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, -+ GGML_METAL_KERNEL_TYPE_UNPAD_F32, - GGML_METAL_KERNEL_TYPE_ARANGE_F32, - GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, - GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); -+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); -@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex - case GGML_OP_POOL_2D: - case GGML_OP_PAD: - case GGML_OP_PAD_REFLECT_1D: -+ case GGML_OP_UNPAD: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: - case GGML_OP_LEAKY_RELU: -@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node( - - const int nth = MIN(1024, ne0); - -+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; -+ } break; -+ case GGML_OP_UNPAD: -+ { -+ GGML_ASSERT(src0->type == GGML_TYPE_F32); -+ -+ id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline; -+ -+ [encoder setComputePipelineState:pipeline]; -+ [encoder setBuffer:id_src0 offset:offs_src0 
atIndex:0]; -+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; -+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; -+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; -+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; -+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; -+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; -+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; -+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; -+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; -+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; -+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; -+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; -+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; -+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; -+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; -+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; -+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; -+ -+ const int nth = MIN(1024, ne0); -+ - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARANGE: -diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal -index 9cfddf45..080a943b 100644 ---- a/ggml/src/ggml-metal/ggml-metal.metal -+++ b/ggml/src/ggml-metal/ggml-metal.metal -@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32( - } - } - -+kernel void kernel_unpad_f32( -+ device const char * src0, -+ device char * dst, -+ constant int64_t & ne00, -+ constant int64_t & ne01, -+ constant int64_t & ne02, -+ constant int64_t & ne03, -+ constant uint64_t & nb00, -+ constant uint64_t & nb01, -+ constant uint64_t & nb02, -+ constant uint64_t & nb03, -+ constant int64_t & ne0, -+ constant int64_t & ne1, -+ constant int64_t & ne2, -+ constant int64_t & ne3, -+ constant uint64_t & nb0, -+ constant uint64_t & nb1, -+ constant uint64_t & nb2, -+ constant uint64_t & nb3, -+ uint3 tgpig[[threadgroup_position_in_grid]], -+ uint3 tpitg[[thread_position_in_threadgroup]], -+ uint3 ntg[[threads_per_threadgroup]]) { -+ -+ const int64_t i3 = tgpig.z; -+ const int64_t i2 = tgpig.y; -+ const int64_t i1 = tgpig.x; -+ -+ const int64_t i03 = i3; -+ const int64_t i02 = i2; -+ const int64_t i01 = i1; -+ -+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); -+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); -+ -+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) { -+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { -+ if (i0 < ne00) { -+ dst_ptr[i0] = src0_ptr[i0]; -+ } -+ } -+ -+ return; -+ } -+} -+ - kernel void kernel_arange_f32( - device char * dst, - constant ggml_metal_kargs_arange & args, -diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 8a654624..6b034d35 100644 ---- a/ggml/src/ggml.c -+++ b/ggml/src/ggml.c -@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "UPSCALE", - "PAD", - "PAD_REFLECT_1D", -+ "UNPAD", - "ARANGE", - "TIMESTEP_EMBEDDING", - "ARGSORT", -@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "OPT_STEP_ADAMW", - }; - --static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); -+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); - - static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "none", -@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "upscale(x)", - "pad(x)", - "pad_reflect_1d(x)", -+ "unpad(x)", - "arange(start, stop, step)", 
- "timestep_embedding(timesteps, dim, max_period)", - "argsort(x)", -@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "adamw(x)", - }; - --static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); -+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); - - static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); - -@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( - return result; - } - -+// ggml_unpad -+ -+struct ggml_tensor * ggml_unpad( -+ struct ggml_context * ctx, -+ struct ggml_tensor * a, -+ int p0, int p1, int p2, int p3) { -+ -+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, -+ a->ne[0] - p0, -+ a->ne[1] - p1, -+ a->ne[2] - p2, -+ a->ne[3] - p3); -+ -+ result->op = GGML_OP_UNPAD; -+ result->src[0] = a; -+ -+ return result; -+} -+ - // ggml_arange - - struct ggml_tensor * ggml_arange( diff --git a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch similarity index 100% rename from llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch rename to llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch similarity index 94% rename from llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch rename to llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch index c5faeaaae..52116ce3f 100644 --- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch +++ b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch @@ -58,7 +58,7 @@ index c22687e4..c5948e8f 100644 auto * gf = graph_init(); diff --git a/src/llama-context.h b/src/llama-context.h -index c4ab242a..9970dfc6 100644 +index c0ceacb1..0264e937 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -5,6 +5,7 @@ @@ -70,10 +70,10 @@ index c4ab242a..9970dfc6 100644 #include "ggml-cpp.h" #include "ggml-opt.h" diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index a7b0a7eb..1a50c034 100644 +index 3dcad65b..60e67b03 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp -@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() { +@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() { } bool llama_kv_cache_unified::update(llama_context & lctx) { @@ -82,7 +82,7 @@ index a7b0a7eb..1a50c034 100644 auto * sched = lctx.get_sched(); if (has_shift) { -@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { +@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { res->set_inputs(nullptr); lctx.graph_compute(gf, false); @@ -91,7 +91,7 @@ index a7b0a7eb..1a50c034 100644 } { -@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { +@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { if (do_defrag) { LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); @@ -133,7 +133,7 @@ index a7b0a7eb..1a50c034 100644 } void llama_kv_cache_unified::defrag_sched(float thold) { -@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( +@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( const llama_cparams & cparams, ggml_context * ctx, @@ -147,7 +147,7 @@ index a7b0a7eb..1a50c034 100644 #if 0 // CPU defrag // -@@ -791,32 +795,20 @@ llm_graph_result_ptr 
llama_kv_cache_unified::build_graph_defrag( +@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); } #else @@ -185,7 +185,7 @@ index a7b0a7eb..1a50c034 100644 ggml_tensor * view_v_src; ggml_tensor * view_v_dst; -@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( +@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( if (cparams.flash_attn) { // NOTE: the V cache is not transposed when using flash attention view_v_src = ggml_view_2d(ctx, v_l[il], @@ -225,7 +225,7 @@ index a7b0a7eb..1a50c034 100644 } //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); -@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { assert(n_used <= n_kv); @@ -244,7 +244,7 @@ index a7b0a7eb..1a50c034 100644 // determine which KV cells to move where // -@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // // if ids[i] == i || ids[i] == n_kv, then cell i is not moved // @@ -256,7 +256,7 @@ index a7b0a7eb..1a50c034 100644 for (uint32_t i0 = 0; i0 < n_used; ++i0) { const auto & cell0 = cells[i0]; -@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // are we moving a continuous block of memory? bool cont = false; @@ -276,7 +276,7 @@ index a7b0a7eb..1a50c034 100644 cont = false; continue; } -@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { head = n_used; if (!cont) { @@ -288,7 +288,7 @@ index a7b0a7eb..1a50c034 100644 } nf++; -@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { } } diff --git a/llama/patches/0011-sort-devices-by-score.patch b/llama/patches/0009-sort-devices-by-score.patch similarity index 99% rename from llama/patches/0011-sort-devices-by-score.patch rename to llama/patches/0009-sort-devices-by-score.patch index 8c3908cf6..e27d1ae92 100644 --- a/llama/patches/0011-sort-devices-by-score.patch +++ b/llama/patches/0009-sort-devices-by-score.patch @@ -11,7 +11,7 @@ with the fastest acceleration is loaded 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 82ae1b5b..1487f322 100644 +index 405d8e31..4e67d243 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -157,7 +157,7 @@ struct ggml_backend_reg_entry { diff --git a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch similarity index 100% rename from llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch rename to llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch diff --git a/llama/patches/0013-remove-amx.patch b/llama/patches/0011-remove-amx.patch similarity index 100% rename from llama/patches/0013-remove-amx.patch rename to llama/patches/0011-remove-amx.patch diff --git a/llama/patches/0014-fix-string-arr-kv-loading.patch 
b/llama/patches/0012-fix-string-arr-kv-loading.patch similarity index 100% rename from llama/patches/0014-fix-string-arr-kv-loading.patch rename to llama/patches/0012-fix-string-arr-kv-loading.patch diff --git a/llama/patches/0015-ollama-debug-tensor.patch b/llama/patches/0013-ollama-debug-tensor.patch similarity index 91% rename from llama/patches/0015-ollama-debug-tensor.patch rename to llama/patches/0013-ollama-debug-tensor.patch index d8f9fc8a4..53d911277 100644 --- a/llama/patches/0015-ollama-debug-tensor.patch +++ b/llama/patches/0013-ollama-debug-tensor.patch @@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 835e6495..3902894b 100644 +index a30e67f2..2462d2b8 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,8 @@ @@ -20,7 +20,7 @@ index 835e6495..3902894b 100644 #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { +@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); diff --git a/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch similarity index 100% rename from llama/patches/0016-add-ollama-vocab-for-grammar-support.patch rename to llama/patches/0014-add-ollama-vocab-for-grammar-support.patch diff --git a/llm/memory.go b/llm/memory.go index 76082bf7c..b5a8dd5c6 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -111,9 +111,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) for _, projector := range projectors { - weight, graph := projectorMemoryRequirements(projector) + weight := projectorMemoryRequirements(projector) projectorWeights += weight - projectorGraph += graph // multimodal models require at least 2048 context opts.NumCtx = max(opts.NumCtx, 2048) @@ -409,51 +408,21 @@ func (m MemoryEstimate) LogValue() slog.Value { return slog.GroupValue(attrs...) 
} -func projectorMemoryRequirements(filename string) (weights, graphSize uint64) { +func projectorMemoryRequirements(filename string) (weights uint64) { file, err := os.Open(filename) if err != nil { - return 0, 0 + return 0 } defer file.Close() ggml, _, err := ggml.Decode(file, 1024) if err != nil { - return 0, 0 + return 0 } for _, layer := range ggml.Tensors().GroupLayers() { weights += layer.Size() } - switch arch := ggml.KV().Architecture(); arch { - case "mllama": - kv := func(n string) uint64 { - if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok { - return uint64(v) - } - - return 0 - } - - imageSize := kv("image_size") - - maxNumTiles := kv("max_num_tiles") - embeddingLength := kv("embedding_length") - headCount := kv("attention.head_count") - - numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size")) - if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok { - numPatches++ - } - - numPaddedPatches := numPatches + 8 - (numPatches%8)%8 - - graphSize = 4 * (8 + - imageSize*imageSize*kv("num_channels")*maxNumTiles + - embeddingLength*numPatches*maxNumTiles + - 9*embeddingLength*numPaddedPatches*maxNumTiles + - numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) - } - - return weights, graphSize + return weights } diff --git a/llm/server.go b/llm/server.go index a64669c2f..c07315fa3 100644 --- a/llm/server.go +++ b/llm/server.go @@ -679,9 +679,8 @@ ws ::= ([ \t\n] ws)? const maxBufferSize = 512 * format.KiloByte type ImageData struct { - Data []byte `json:"data"` - ID int `json:"id"` - AspectRatioID int `json:"aspect_ratio_id"` + Data []byte `json:"data"` + ID int `json:"id"` } type CompletionRequest struct { diff --git a/ml/backend.go b/ml/backend.go index ba24ecb45..f84a99845 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -161,7 +161,6 @@ type Tensor interface { Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor Pad(ctx Context, shape ...int) Tensor - Unpad(ctx Context, shape ...int) Tensor Stack(ctx Context, dim int, s ...Tensor) Tensor diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index e97795a69..e1aa687c8 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -1017,17 +1017,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor { } } -func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor { - if len(shape) != 4 { - panic("expected 4 dimensions") - } - - return &Tensor{ - b: t.b, - t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), - } -} - func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { switch len(shape) { case 1: diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h index 8dc107ba8..e91dedf14 100644 --- a/ml/backend/ggml/ggml/include/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -489,7 +489,6 @@ extern "C" { GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, - GGML_OP_UNPAD, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -1782,15 +1781,6 @@ extern "C" { int p0, int p1); - // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] - GGML_API struct ggml_tensor * ggml_unpad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, - int p1, - int p2, - int p3); - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] diff --git a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp 
b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp index 1487f322f..4e67d243a 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp @@ -178,9 +178,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_CANN register_backend(ggml_backend_cann_reg()); #endif -// #ifdef GGML_USE_BLAS -// register_backend(ggml_backend_blas_reg()); -// #endif +#ifdef GGML_USE_BLAS + register_backend(ggml_backend_blas_reg()); +#endif #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index 3902894ba..2462d2b85 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -1953,10 +1953,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; - case GGML_OP_UNPAD: - { - ggml_compute_forward_unpad(params, tensor); - } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -2280,7 +2276,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: - case GGML_OP_UNPAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 1868a10cd..955fec59a 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -6690,61 +6690,6 @@ void ggml_compute_forward_pad_reflect_1d( } } -// ggml_compute_forward_unpad - -static void ggml_compute_forward_unpad_f32( - const struct ggml_compute_params *params, - struct ggml_tensor *dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT( dst->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - float * dst_ptr = (float *) dst->data; - - // TODO: optimize - - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - - const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - dst_ptr[dst_idx] = *src_ptr; - } - } - } - } - } -} - -void ggml_compute_forward_unpad( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_unpad_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - // ggml_compute_forward_arange static void ggml_compute_forward_arange_f32( diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h index a7125555e..dc081b9e6 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h @@ -72,7 +72,6 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const 
struct ggml_compute_params * params, struct ggml_tensor * dst); -void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 6fe866742..cb0d8528d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2238,9 +2238,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; - case GGML_OP_UNPAD: - ggml_cuda_op_unpad(ctx, dst); - break; case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; @@ -3215,7 +3212,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_PAD: - case GGML_OP_UNPAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu index 7d45a7e19..77432b046 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu @@ -47,49 +47,3 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); } - -static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { - // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 - // blockIdx.y: idx of ne1 - // blockIDx.x: idx of ne0 / BLOCK_SIZE - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - - // operation - int offset_dst = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { - int offset_src = - nidx + - blockIdx.y * ne00 + - blockIdx.z * ne00 * ne01; - dst[offset_dst] = x[offset_src]; - } -} - -static void unpad_f32_cuda(const float * x, float * dst, - const int ne00, const int ne01, const int ne02, const int ne03, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne1, ne2*ne3); - unpad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -} - -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - - unpad_f32_cuda(src0_d, dst_d, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -} \ No newline at end of file diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh index e2ededc3c..8fd386b00 100644 --- 
a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh @@ -3,4 +3,3 @@ #define CUDA_PAD_BLOCK_SIZE 256 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 56fdb3cda..3656c2383 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -5599,51 +5599,6 @@ kernel void kernel_pad_reflect_1d_f32( } } -kernel void kernel_unpad_f32( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); - - if (i1 < ne01 && i2 < ne02 && i3 < ne03) { - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - if (i0 < ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } - } - - return; - } -} - kernel void kernel_arange_f32( device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 7641247eb..1b56f858c 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -347,7 +347,6 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, - GGML_METAL_KERNEL_TYPE_UNPAD_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -1295,7 +1294,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -1657,7 +1655,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: - case GGML_OP_UNPAD: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: @@ -4187,36 +4184,6 @@ static bool ggml_metal_encode_node( const int nth = MIN(1024, 
ne0); - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_UNPAD: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - - const int nth = MIN(1024, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ARANGE: diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 080a943bc..9cfddf450 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -3121,51 +3121,6 @@ kernel void kernel_pad_reflect_1d_f32( } } -kernel void kernel_unpad_f32( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); - - if (i1 < ne01 && i2 < ne02 && i3 < ne03) { - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - if (i0 < ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } - } - - return; - } -} - kernel void kernel_arange_f32( device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c index 6b034d352..8a6546240 100644 --- a/ml/backend/ggml/ggml/src/ggml.c +++ b/ml/backend/ggml/ggml/src/ggml.c @@ -923,7 +923,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "UPSCALE", "PAD", "PAD_REFLECT_1D", - "UNPAD", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -954,7 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { 
"OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1019,7 +1018,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "upscale(x)", "pad(x)", "pad_reflect_1d(x)", - "unpad(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1050,7 +1048,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4276,25 +4274,6 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } -// ggml_unpad - -struct ggml_tensor * ggml_unpad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, int p1, int p2, int p3) { - - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, - a->ne[0] - p0, - a->ne[1] - p1, - a->ne[2] - p2, - a->ne[3] - p3); - - result->op = GGML_OP_UNPAD; - result->src[0] = a; - - return result; -} - // ggml_arange struct ggml_tensor * ggml_arange( diff --git a/model/models/llama4/model_vision.go b/model/models/llama4/model_vision.go index 3bf9cee75..e6b1afef6 100644 --- a/model/models/llama4/model_vision.go +++ b/model/models/llama4/model_vision.go @@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { } hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps) - hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0) + hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0) hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions) return hiddenStates } diff --git a/model/models/mllama/imageproc.go b/model/models/mllama/imageproc.go deleted file mode 100644 index 13f2fb8b3..000000000 --- a/model/models/mllama/imageproc.go +++ /dev/null @@ -1,201 +0,0 @@ -package mllama - -import ( - "fmt" - "image" - _ "image/jpeg" - _ "image/png" - "io" - "math" - "slices" - - "golang.org/x/image/draw" - - "github.com/ollama/ollama/model/imageproc" -) - -func getSupportedAspectRatios(maxTiles int) []image.Point { - ratios := []image.Point{} - - for w := range maxTiles { - for h := range maxTiles { - if (w+1)*(h+1) <= maxTiles { - ratios = append(ratios, image.Point{w + 1, h + 1}) - } - } - } - - return ratios -} - -func clip(a, a_min, a_max int) int { - if a < a_min { - return a_min - } else if a > a_max { - return a_max - } - - return a -} - -func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := getSupportedAspectRatios(maxImageTiles) - possibleCanvasSizes := []image.Point{} - for _, pta := range possibleTileArrangements { - possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) - } - - scales := []float64{} - - for _, pcs := range possibleCanvasSizes { - scaleHeight := float64(pcs.Y) / float64(imageSize.Y) - scaleWidth := float64(pcs.X) / float64(imageSize.X) - - if scaleWidth > scaleHeight { - scales = append(scales, scaleHeight) - } else { - scales = append(scales, scaleWidth) - } - } - - var minUpscale float64 - var maxDownscale float64 - var upscale bool - - for _, s := range scales { - if s > 1.0 { - upscale = true - if minUpscale == 0 { - minUpscale = s - } else { - minUpscale = math.Min(minUpscale, s) - } - } else { - maxDownscale = math.Max(maxDownscale, s) - } - } - - selectedScale := 
maxDownscale - if upscale { - selectedScale = minUpscale - } - - var selectedCanvas image.Point - for n, pcs := range possibleCanvasSizes { - if scales[n] == selectedScale { - // choose the smallest possible canvas - if selectedCanvas.X == 0 && selectedCanvas.Y == 0 { - selectedCanvas = pcs - } else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y { - selectedCanvas = pcs - } - } - } - return selectedCanvas -} - -func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - - var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } - - return image.Point{w, h} -} - -func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { - if format == "png" { - img = imageproc.Composite(img) - } - - b := img.Bounds() - tileSize := outputSize.Y - - canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) - - return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio -} - -func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { - paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, - } - - dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) - draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) - - return dst -} - -func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { - b := img.Bounds() - width := b.Max.X - b.Min.X - height := b.Max.Y - b.Min.Y - tileHeight := height / numTilesSize.Y - tileWidth := width / numTilesSize.X - - images := []image.Image{} - - for h := range numTilesSize.Y { - for w := range numTilesSize.X { - rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) - images = append(images, img.(interface { - SubImage(image.Rectangle) image.Image - }).SubImage(rect)) - } - } - - return images -} - -func packImages(img image.Image, aspectRatio image.Point) []float32 { - subImages := splitToTiles(img, aspectRatio) - - var pixelVals []float32 - - rescale := true - channelFirst := true - - for _, subImg := range subImages { - vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst) - pixelVals = append(pixelVals, vals...) 
- } - - return pixelVals -} - -func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { - outputSize := image.Point{560, 560} - maxTiles := 4 - - img, format, err := image.Decode(imageData) - if err != nil { - return nil, nil, fmt.Errorf("failed to decode image: %w", err) - } - - newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles) - newImage = padImage(newImage, outputSize, aspectRatio) - - data := packImages(newImage, aspectRatio) - aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1 - - opts := map[string]any{ - "aspectRatioIndex": aspectRatioIndex, - } - - return data, opts, nil -} diff --git a/model/models/mllama/imageproc_test.go b/model/models/mllama/imageproc_test.go deleted file mode 100644 index a14b91bd1..000000000 --- a/model/models/mllama/imageproc_test.go +++ /dev/null @@ -1,420 +0,0 @@ -package mllama - -import ( - "bytes" - "image" - "image/png" - "testing" - - "github.com/google/go-cmp/cmp" -) - -func TestAspectRatios(t *testing.T) { - type aspectCase struct { - MaxTiles int - Expected []image.Point - } - - cases := []aspectCase{ - { - MaxTiles: 1, - Expected: []image.Point{{1, 1}}, - }, - { - MaxTiles: 2, - Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}}, - }, - { - MaxTiles: 3, - Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}}, - }, - { - MaxTiles: 4, - Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}}, - }, - } - - for _, c := range cases { - actual := getSupportedAspectRatios(c.MaxTiles) - - if diff := cmp.Diff(actual, c.Expected); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - } -} - -func TestGetImageSizeFitToCanvas(t *testing.T) { - type imageSizeCase struct { - ImageRect image.Point - CanvasRect image.Point - TileSize int - Expected image.Point - } - - cases := []imageSizeCase{ - { - ImageRect: image.Point{400, 400}, - CanvasRect: image.Point{640, 480}, - TileSize: 200, - Expected: image.Point{400, 400}, - }, - { - ImageRect: image.Point{1024, 768}, - CanvasRect: image.Point{640, 480}, - TileSize: 200, - Expected: image.Point{640, 480}, - }, - { - ImageRect: image.Point{500, 500}, - CanvasRect: image.Point{1000, 1000}, - TileSize: 750, - Expected: image.Point{750, 750}, - }, - { - ImageRect: image.Point{500, 1000}, - CanvasRect: image.Point{2000, 2000}, - TileSize: 2000, - Expected: image.Point{1000, 2000}, - }, - { - ImageRect: image.Point{4000, 3000}, - CanvasRect: image.Point{2000, 1000}, - TileSize: 1000, - Expected: image.Point{1333, 1000}, - }, - { - ImageRect: image.Point{667, 1000}, - CanvasRect: image.Point{1000, 1000}, - TileSize: 560, - Expected: image.Point{667, 1000}, - }, - } - - for _, c := range cases { - actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize) - - if actual != c.Expected { - t.Errorf("incorrect image rect: '%#v'. 
expected: '%#v'", actual, c.Expected) - } - } -} - -func TestGetOptimalTiledCanvas(t *testing.T) { - type tiledCanvasSizeCase struct { - ImageSize image.Point - MaxImageTiles int - TileSize int - Expected image.Point - } - - cases := []tiledCanvasSizeCase{ - { - ImageSize: image.Point{1024, 768}, - MaxImageTiles: 4, - TileSize: 1000, - Expected: image.Point{2000, 1000}, - }, - { - ImageSize: image.Point{1024, 768}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - { - ImageSize: image.Point{800, 600}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - { - ImageSize: image.Point{640, 480}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 560}, - }, - { - ImageSize: image.Point{320, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 560}, - }, - { - ImageSize: image.Point{1320, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1680, 560}, - }, - { - ImageSize: image.Point{2000, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{2240, 560}, - }, - { - ImageSize: image.Point{10000, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{2240, 560}, - }, - { - ImageSize: image.Point{480, 640}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 1120}, - }, - { - ImageSize: image.Point{200, 320}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 560}, - }, - { - ImageSize: image.Point{200, 1320}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 1680}, - }, - { - ImageSize: image.Point{200, 2000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 2240}, - }, - { - ImageSize: image.Point{200, 10000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 2240}, - }, - { - ImageSize: image.Point{10000, 10000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - } - - for _, c := range cases { - actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize) - - if actual != c.Expected { - t.Errorf("incorrect tiled canvas: '%#v'. 
expected: '%#v'", actual, c.Expected) - } - } -} - -func TestSplitToTiles(t *testing.T) { - type splitCase struct { - TestImage image.Image - NumTilesSize image.Point - Expected []image.Image - } - - cases := []splitCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - NumTilesSize: image.Point{1, 1}, - Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)), - NumTilesSize: image.Point{2, 1}, - Expected: []image.Image{ - image.NewRGBA(image.Rect(0, 0, 500, 500)), - image.NewRGBA(image.Rect(500, 0, 1000, 500)), - }, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)), - NumTilesSize: image.Point{2, 2}, - Expected: []image.Image{ - image.NewRGBA(image.Rect(0, 0, 500, 500)), - image.NewRGBA(image.Rect(500, 0, 1000, 500)), - image.NewRGBA(image.Rect(0, 500, 500, 1000)), - image.NewRGBA(image.Rect(500, 500, 1000, 1000)), - }, - }, - } - - for _, c := range cases { - actual := splitToTiles(c.TestImage, c.NumTilesSize) - - if len(actual) != len(c.Expected) { - t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected)) - } - - for i := range actual { - if actual[i].Bounds() != c.Expected[i].Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds()) - } - } - } -} - -func TestResize(t *testing.T) { - type resizeCase struct { - TestImage image.Image - OutputSize image.Point - MaxImageTiles int - ExpectedImage image.Image - ExpectedAspectRatio image.Point - } - - cases := []resizeCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), - OutputSize: image.Point{100, 100}, - MaxImageTiles: 1, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), - OutputSize: image.Point{100, 100}, - MaxImageTiles: 2, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)), - ExpectedAspectRatio: image.Point{2, 2}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - ExpectedAspectRatio: image.Point{2, 2}, - }, - } - - for _, c := range cases { - actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles) - - if actualImage.Bounds() != c.ExpectedImage.Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds()) - } - - if actualAspectRatio != c.ExpectedAspectRatio { - t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio) - } - } -} - -func TestPad(t *testing.T) { - type padCase struct { - TestImage image.Image - OutputSize image.Point - AspectRatio image.Point - Expected image.Image - } - - cases := []padCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)), - OutputSize: image.Point{560, 560}, - AspectRatio: image.Point{2, 2}, - Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), 
- }, - } - - for _, c := range cases { - actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio) - - if actual.Bounds() != c.Expected.Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds()) - } - } -} - -func TestPackImages(t *testing.T) { - type packCase struct { - TestImage image.Image - AspectRatio image.Point - ExpectedVals int - } - - cases := []packCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), - AspectRatio: image.Point{2, 2}, - ExpectedVals: 2 * 2 * 3 * 560 * 560, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - AspectRatio: image.Point{1, 1}, - ExpectedVals: 1 * 1 * 3 * 560 * 560, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)), - AspectRatio: image.Point{1, 2}, - ExpectedVals: 1 * 2 * 3 * 560 * 560, - }, - } - - for _, c := range cases { - actualVals := packImages(c.TestImage, c.AspectRatio) - if len(actualVals) != c.ExpectedVals { - t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals) - } - } -} - -func TestPreprocess(t *testing.T) { - type preprocessCase struct { - TestImage image.Image - ExpectedVals int - ExpectedAspectRatioID int - } - - cases := []preprocessCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), - ExpectedVals: 0, - ExpectedAspectRatioID: 1, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - ExpectedVals: 0, - ExpectedAspectRatioID: 6, - }, - } - - for _, c := range cases { - var buf bytes.Buffer - err := png.Encode(&buf, c.TestImage) - if err != nil { - t.Fatal(err) - } - - imgData, opts, err := Preprocess(&buf) - if err != nil { - t.Fatalf("error processing: %q", err) - } - - if len(imgData) == 0 { - t.Errorf("no image data returned") - } - - ar, ok := opts["aspectRatioIndex"] - if !ok { - t.Fatalf("no aspect ratio found") - } - - aspectRatioID := ar.(int) - - if aspectRatioID != c.ExpectedAspectRatioID { - t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID) - } - } -} diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 3fa26dedb..93b443ef1 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -2,11 +2,7 @@ package mllama import ( "bytes" - "encoding/binary" - "fmt" - "hash/fnv" "image" - "slices" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -34,10 +30,6 @@ const ( ) func New(c fs.Config) (model.Model, error) { - // Verify unified config - if c.Uint("vision.block_count") == 0 { - return nil, fmt.Errorf("non-unified vision model not supported") - } m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), @@ -76,22 +68,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er return nil, err } - f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image) + f32s, ratio, err := m.ImageProcessor.ProcessImage(image) if err != nil { return nil, err } - pixelValues, err := ctx.Input().FromFloatSlice(f32s, - m.ImageProcessor.imageSize, - m.ImageProcessor.imageSize, - m.ImageProcessor.numChannels, - m.ImageProcessor.maxNumTiles, - ) + pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles()) if err != nil { return nil, err } - aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1) + pixelValues = 
pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles()) + + aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1) if err != nil { return nil, err } @@ -102,41 +91,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { - var images []input.Input - fnvHash := fnv.New64a() - for i := range inputs { - if inputs[i].Multimodal == nil { - if len(images) > 0 { - inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)} - inputs[i].MultimodalHash = images[0].MultimodalHash - for j := 1; j < len(images); j++ { - inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor)) - fnvHash.Reset() - binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash) - binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash) - inputs[i].MultimodalHash = fnvHash.Sum64() - } - images = nil - } - } else { - images = append(images, inputs[i]) - inputs[i].Token = -1 + if inputs[i].Multimodal != nil { + inputs[i].Token = 128256 // <|image|> } } - inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 }) - return inputs, nil } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var crossAttentionStates ml.Tensor if len(batch.Multimodal) > 0 { - images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor) - if len(images) > 0 { - crossAttentionStates = images[len(images)-1] - } + crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor) } positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) @@ -150,7 +117,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } // TODO: attention mask, cross attention mask - return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil + return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil } func init() { diff --git a/model/models/mllama/model_text.go b/model/models/mllama/model_text.go index 490eb696c..9bd414afc 100644 --- a/model/models/mllama/model_text.go +++ b/model/models/mllama/model_text.go @@ -18,7 +18,7 @@ type TextSelfAttention struct { RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` } -func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { batchSize := hiddenState.Dim(1) headDim := opts.hiddenSize / opts.numHeads ropeType := uint32(0) @@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct { MLP *TextMLP } -func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { residual := hiddenState hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) - hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts) + hiddenState = d.SelfAttention.Forward(ctx, 
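// Sketch (not part of the patch): in the EncodeMultimodal hunk above, the pixel-value tensor is
// now sized by the image's actual tile count (ratio.numTiles()) and then padded along the tile
// dimension up to maxNumTiles, instead of always allocating maxNumTiles tiles. The same idea on a
// flat []float32, with made-up sizes standing in for the real 560x560x3 tiles:
package main

import "fmt"

func main() {
	const (
		imageSize   = 4 // stand-in for the real 560
		numChannels = 3
		maxNumTiles = 4
	)
	numTiles := 2 // e.g. a 2x1 aspect ratio -> ratio.numTiles() == 2

	// values produced by the image processor: one tile after another
	packed := make([]float32, imageSize*imageSize*numChannels*numTiles)

	// zero-pad the trailing tile dimension so downstream code always sees maxNumTiles tiles,
	// mirroring pixelValues.Pad(ctx, 0, 0, 0, maxNumTiles-ratio.numTiles())
	padded := append(packed, make([]float32, imageSize*imageSize*numChannels*(maxNumTiles-numTiles))...)

	fmt.Println(len(packed), len(padded)) // 96 192
}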
hiddenState, positions, cache, opts) // In the final layer (outputs != nil), optimize by pruning to just the token positions // we need logits for. @@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct { MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"` } -func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { residual := hiddenState hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) @@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, } type TextDecoderLayer interface { - Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor + Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor } type TextDecoder struct { Layers []TextDecoderLayer } -func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { for i, layer := range d.Layers { layerType := selfAttentionLayer if slices.Contains(opts.crossAttentionLayers, int32(i)) { @@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, lastLayerOutputs = outputs } - hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts) + hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts) } } @@ -214,9 +214,9 @@ type TextModel struct { *TextModelOptions } -func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { +func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs) - hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) + hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) return m.Output.Forward(ctx, hiddenState) } diff --git a/model/models/mllama/model_vision.go b/model/models/mllama/model_vision.go index bd3d150a3..77ea53731 100644 --- a/model/models/mllama/model_vision.go +++ b/model/models/mllama/model_vision.go @@ -15,7 +15,7 @@ type VisionSelfAttention struct { Query *nn.Linear `gguf:"attn_q"` Key *nn.Linear `gguf:"attn_k"` Value *nn.Linear `gguf:"attn_v"` - Output *nn.Linear `gguf:"attn_out"` + Output *nn.Linear `gguf:"attn_output"` Gate 
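// Sketch (not part of the patch): the TextDecoder above keeps one flat list of layers and decides
// per index whether a layer runs self-attention or cross-attention, using the
// cross_attention_layers metadata and slices.Contains. A minimal stand-alone illustration -- the
// index list here is invented for the example, not taken from any real model config:
package main

import (
	"fmt"
	"slices"
)

func main() {
	crossAttentionLayers := []int32{3, 8, 13} // hypothetical indices
	numLayers := 16

	for i := 0; i < numLayers; i++ {
		layerType := "self-attention"
		if slices.Contains(crossAttentionLayers, int32(i)) {
			layerType = "cross-attention"
		}
		fmt.Printf("layer %2d: %s\n", i, layerType)
	}
}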
ml.Tensor `gguf:"attn_gate"` } @@ -45,36 +45,29 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) hiddenState = sa.Output.Forward(ctx, attention) - if sa.Gate != nil { - hiddenState = hiddenState.Mul(ctx, sa.Gate) - } - return hiddenState } type VisionMLP struct { - Down *nn.Linear `gguf:"ffn_down"` Up *nn.Linear `gguf:"ffn_up"` - - Gate ml.Tensor `gguf:"ffn_gate"` + Down *nn.Linear `gguf:"ffn_down"` } func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { - hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx) - hiddenState = mlp.Up.Forward(ctx, hiddenState) - if mlp.Gate != nil { - hiddenState = hiddenState.Mul(ctx, mlp.Gate) - } + hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx) + hiddenState = mlp.Down.Forward(ctx, hiddenState) return hiddenState } type VisionEncoderLayer struct { - AttentionNorm *nn.LayerNorm `gguf:"ln1"` + AttentionNorm *nn.LayerNorm `gguf:"attn_norm"` SelfAttention *VisionSelfAttention + AttentionGate ml.Tensor `gguf:"attn_gate"` - MLPNorm *nn.LayerNorm `gguf:"ln2"` + MLPNorm *nn.LayerNorm `gguf:"ffn_norm"` MLP *VisionMLP + MLPGate ml.Tensor `gguf:"ffn_gate"` } func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { @@ -83,13 +76,22 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts // self attention hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts) + + if e.AttentionGate != nil { + hiddenState = hiddenState.Mul(ctx, e.AttentionGate) + } hiddenState = hiddenState.Add(ctx, residual) residual = hiddenState // feed forward hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.MLP.Forward(ctx, hiddenState, opts) - return hiddenState.Add(ctx, residual) + hiddenState = hiddenState.Add(ctx, residual) + if e.MLPGate != nil { + hiddenState = hiddenState.Mul(ctx, e.MLPGate) + } + + return hiddenState } type VisionEncoder struct { @@ -114,9 +116,9 @@ type PrecomputedAspectRatioEmbedding struct { Gate ml.Tensor `gguf:"gate"` } -func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor { +func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor { embeddings := e.Embedding.Forward(ctx, aspectRatioIDs) - embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles) + embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles) if e.Gate != nil { embeddings = embeddings.Mul(ctx, e.Gate) } @@ -132,7 +134,7 @@ type PrecomputedPositionEmbedding struct { TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"` } -func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor { +func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor { positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs) if e.PositionEmbeddingGate != nil { positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate) @@ -141,7 +143,7 @@ func (e *PrecomputedPositionEmbedding) 
Forward(ctx ml.Context, hiddenState, posi hiddenState = hiddenState.Add(ctx, positionEmbedding) tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs) - tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles) + tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles) if e.TilePositionEmbeddingGate != nil { tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate) } @@ -150,9 +152,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi } type VisionModelOptions struct { - hiddenSize, numHeads, numTiles int - imageSize, patchSize int - eps float32 + hiddenSize, numHeads int + imageSize, patchSize int + eps float32 intermediateLayersIndices []int32 } @@ -181,14 +183,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa numPositions++ } + numTiles := pixelValues.Dim(3) + hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1) - hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles) + hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) - hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) - hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1) + hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions) + hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1) - hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions) + hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions) hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps) numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8 @@ -199,18 +203,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize) hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions) hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...) 
- hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0) + hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0) return hiddenState.Concat(ctx, hiddenStates, 0) } @@ -222,7 +226,6 @@ func newVisionModel(c fs.Config) *VisionModel { VisionModelOptions: &VisionModelOptions{ hiddenSize: int(c.Uint("vision.embedding_length")), numHeads: int(c.Uint("vision.attention.head_count")), - numTiles: int(c.Uint("vision.max_num_tiles")), imageSize: int(c.Uint("vision.image_size")), patchSize: int(c.Uint("vision.patch_size")), diff --git a/model/models/mllama/process_image.go b/model/models/mllama/process_image.go index 1b0506d32..8e60508ff 100644 --- a/model/models/mllama/process_image.go +++ b/model/models/mllama/process_image.go @@ -2,17 +2,31 @@ package mllama import ( "image" - "image/color" "math" "slices" "golang.org/x/image/draw" "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/model/imageproc" ) +type supportedAspectRatio struct { + rank, width, height int +} + +func (a supportedAspectRatio) Point() image.Point { + return image.Point{a.width, a.height} +} + +func (a supportedAspectRatio) numTiles() int { + return a.width * a.height +} + type ImageProcessor struct { imageSize, numChannels, maxNumTiles int + + mean, std [3]float32 } func newImageProcessor(c fs.Config) ImageProcessor { @@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor { imageSize: int(c.Uint("vision.image_size")), numChannels: int(c.Uint("vision.num_channels")), maxNumTiles: int(c.Uint("vision.max_num_tiles")), + + mean: imageproc.ClipDefaultMean, + std: imageproc.ClipDefaultSTD, } } -func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point { - ratios := []image.Point{} - - for w := range maxTiles { - for h := range maxTiles { - if (w+1)*(h+1) <= maxTiles { - ratios = append(ratios, image.Point{w + 1, h + 1}) - } +func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) { + for w := 1; w <= p.maxNumTiles; w++ { + for h := 1; h <= p.maxNumTiles/w; h++ { + ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h}) } } - return ratios } -func (p *ImageProcessor) clip(a, a_min, a_max int) int { - if a < a_min { - return a_min - } else if a > a_max { - return a_max - } +func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point { + tw := min(max(imageSize.X, p.imageSize), canvasSize.X) + th := min(max(imageSize.Y, p.imageSize), canvasSize.Y) - return a -} + r := math.Min( + float64(tw)/float64(imageSize.X), + float64(th)/float64(imageSize.Y), + ) -func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - 
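// Sketch (not part of the patch): the rewritten supportedAspectRatios() above enumerates every
// (width, height) tile arrangement whose tile count fits within maxNumTiles and gives it a 1-based
// rank. Running just that loop stand-alone for maxNumTiles = 4 reproduces the table the new
// TestSupportedAspectRatios further down expects:
package main

import "fmt"

type supportedAspectRatio struct {
	rank, width, height int
}

func main() {
	maxNumTiles := 4

	var ratios []supportedAspectRatio
	for w := 1; w <= maxNumTiles; w++ {
		for h := 1; h <= maxNumTiles/w; h++ {
			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
		}
	}

	for _, r := range ratios {
		fmt.Printf("rank %d: %dx%d (%d tiles)\n", r.rank, r.width, r.height, r.width*r.height)
	}
	// rank 1: 1x1, rank 2: 1x2, ..., rank 6: 2x2, ..., rank 8: 4x1
}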
- var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } + w := min(int(math.Floor(float64(imageSize.X)*r)), tw) + h := min(int(math.Floor(float64(imageSize.Y)*r)), th) return image.Point{w, h} } -func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := p.supportedAspectRatios(maxImageTiles) - possibleCanvasSizes := []image.Point{} - for _, pta := range possibleTileArrangements { - possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) +func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point { + possibleTileArrangements := p.supportedAspectRatios() + possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements)) + for i, pta := range possibleTileArrangements { + possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize} } - scales := []float64{} - - for _, pcs := range possibleCanvasSizes { - scaleHeight := float64(pcs.Y) / float64(imageSize.Y) - scaleWidth := float64(pcs.X) / float64(imageSize.X) - - if scaleWidth > scaleHeight { - scales = append(scales, scaleHeight) - } else { - scales = append(scales, scaleWidth) - } + scales := make([]float64, len(possibleCanvasSizes)) + for i, pcs := range possibleCanvasSizes { + scales[i] = min( + float64(pcs.Y)/float64(imageSize.Y), + float64(pcs.X)/float64(imageSize.X), + ) } var minUpscale float64 @@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles return selectedCanvas } -func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { +func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { b := img.Bounds() width := b.Max.X - b.Min.X height := b.Max.Y - b.Min.Y tileHeight := height / numTilesSize.Y tileWidth := width / numTilesSize.X - images := []image.Image{} + images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X) for h := range numTilesSize.Y { for w := range numTilesSize.X { rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) - images = append(images, img.(interface { + if subImg, ok := img.(interface { SubImage(image.Rectangle) image.Image - }).SubImage(rect)) + }); ok { + images = append(images, subImg.SubImage(rect)) + } else { + // Handle the case where img does not implement SubImage + // This is a fallback and may not be efficient + newImg := image.NewRGBA(rect) + draw.Draw(newImg, rect, img, rect.Min, draw.Src) + images = append(images, newImg) + } } } return images } -// remove the "alpha" channel by drawing over a prefilled image -// -//nolint:unused -func (p *ImageProcessor) compositeImage(img image.Image) image.Image { - dst := image.NewRGBA(img.Bounds()) - - white := color.RGBA{255, 255, 255, 255} - draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src) - draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over) - - return dst -} - -func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { +func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) { b := img.Bounds() - tileSize := outputSize.Y - canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := 
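// Sketch (not part of the patch): a worked example of the new fitToCanvas() above. For a 2560x1920
// source with 560-pixel tiles and a 1120x1120 canvas (the 2x2 arrangement the optimal-canvas step
// selects for that image in the resize test further down), the image scales to 1120x840 before
// being padded out to the full canvas:
package main

import (
	"fmt"
	"image"
	"math"
)

// fitToCanvas mirrors the arithmetic of ImageProcessor.fitToCanvas in the hunk above,
// with the tile size passed explicitly instead of read from the processor.
func fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	tw := min(max(imageSize.X, tileSize), canvasSize.X)
	th := min(max(imageSize.Y, tileSize), canvasSize.Y)

	r := math.Min(
		float64(tw)/float64(imageSize.X),
		float64(th)/float64(imageSize.Y),
	)

	w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
	h := min(int(math.Floor(float64(imageSize.Y)*r)), th)

	return image.Point{w, h}
}

func main() {
	fmt.Println(fitToCanvas(image.Point{2560, 1920}, image.Point{1120, 1120}, 560))
	// (1120,840): the image keeps its aspect ratio and fills the canvas width
}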
image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := p.fitToCanvas(b.Max, canvasSize, tileSize) + canvasSize := p.optimalTiledCanvas(b.Max) + aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize} + newSize := p.fitToCanvas(b.Max, canvasSize) dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) @@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag return dst, aspectRatio } -func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image { +func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image { paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, + X: p.imageSize * aspectRatio.X, + Y: p.imageSize * aspectRatio.Y, } dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) @@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin return dst } -func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { +func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 { subImages := p.splitToTiles(img, aspectRatio) var pixelVals []float32 @@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st gVal := float32(g>>8) / 255.0 bVal := float32(b>>8) / 255.0 - rVal = (rVal - mean[0]) / std[0] - gVal = (gVal - mean[1]) / std[1] - bVal = (bVal - mean[2]) / std[2] + rVal = (rVal - p.mean[0]) / p.std[0] + gVal = (gVal - p.mean[1]) / p.std[1] + bVal = (bVal - p.mean[2]) / p.std[2] rVals = append(rVals, rVal) gVals = append(gVals, gVal) @@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st return pixelVals } -func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) { - outputSize := image.Point{p.imageSize, p.imageSize} +func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) { + newImage, newImageRatio := p.resize(img) + newImage = p.pad(newImage, newImageRatio) + pixelValues := p.pack(newImage, newImageRatio) - // clip values - mean := [3]float32{0.48145466, 0.4578275, 0.40821073} - std := [3]float32{0.26862954, 0.26130258, 0.27577711} + supportedAspectRatios := p.supportedAspectRatios() + aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool { + return i.width == newImageRatio.X && i.height == newImageRatio.Y + }) - newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles) - newImage = p.pad(newImage, outputSize, aspectRatio) - - data := p.pack(newImage, aspectRatio, mean, std) - aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1 - return data, aspectRatioIndex, nil + return pixelValues, supportedAspectRatios[aspectRatioID], nil } diff --git a/model/models/mllama/process_image_test.go b/model/models/mllama/process_image_test.go new file mode 100644 index 000000000..a9669b182 --- /dev/null +++ b/model/models/mllama/process_image_test.go @@ -0,0 +1,387 @@ +package mllama + +import ( + "image" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestSupportedAspectRatios(t *testing.T) { + cases := []struct { + p ImageProcessor + want []supportedAspectRatio + }{ + { + p: ImageProcessor{maxNumTiles: 1}, + want: []supportedAspectRatio{ + {1, 1, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 2}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 
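// Sketch (not part of the patch): pack() above stores each tile channel-planar (all R values, then
// G, then B) and normalizes with the CLIP mean/std that now live on the ImageProcessor
// (imageproc.ClipDefaultMean / ClipDefaultSTD). The same normalization applied to one pixel, using
// the constants visible in the removed code above:
package main

import "fmt"

func main() {
	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
	std := [3]float32{0.26862954, 0.26130258, 0.27577711}

	// an 8-bit mid-grey pixel, scaled to [0, 1] the same way pack() does
	pixel := [3]float32{128.0 / 255.0, 128.0 / 255.0, 128.0 / 255.0}

	var normalized [3]float32
	for c := range pixel {
		normalized[c] = (pixel[c] - mean[c]) / std[c]
	}
	fmt.Println(normalized) // roughly [0.08 0.17 0.34]
}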
1, 2}, + {3, 2, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 3}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, + {4, 2, 1}, + {5, 3, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 4}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, + {4, 1, 4}, + {5, 2, 1}, + {6, 2, 2}, + {7, 3, 1}, + {8, 4, 1}, + }, + }, + } + + for _, tt := range cases { + actual := tt.p.supportedAspectRatios() + if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestFitToCanvas(t *testing.T) { + cases := []struct { + p ImageProcessor + image image.Point + canvas image.Point + expect image.Point + }{ + { + p: ImageProcessor{imageSize: 200}, + image: image.Point{400, 400}, + canvas: image.Point{640, 480}, + expect: image.Point{400, 400}, + }, + { + p: ImageProcessor{imageSize: 200}, + image: image.Point{1024, 768}, + canvas: image.Point{640, 480}, + expect: image.Point{640, 480}, + }, + { + p: ImageProcessor{imageSize: 750}, + image: image.Point{500, 500}, + canvas: image.Point{1000, 1000}, + expect: image.Point{750, 750}, + }, + { + p: ImageProcessor{imageSize: 2000}, + image: image.Point{500, 1000}, + canvas: image.Point{2000, 2000}, + expect: image.Point{1000, 2000}, + }, + { + p: ImageProcessor{imageSize: 1000}, + image: image.Point{4000, 3000}, + canvas: image.Point{2000, 1000}, + expect: image.Point{1333, 1000}, + }, + { + p: ImageProcessor{imageSize: 560}, + image: image.Point{667, 1000}, + canvas: image.Point{1000, 1000}, + expect: image.Point{667, 1000}, + }, + } + + for _, tt := range cases { + actual := tt.p.fitToCanvas(tt.image, tt.canvas) + if diff := cmp.Diff(actual, tt.expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestOptimalTiledCanvas(t *testing.T) { + cases := []struct { + p ImageProcessor + image image.Point + expect image.Point + }{ + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 1000}, + image: image.Point{1024, 768}, + expect: image.Point{2000, 1000}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{1024, 768}, + expect: image.Point{1120, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{800, 600}, + expect: image.Point{1120, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{640, 480}, + expect: image.Point{1120, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{320, 200}, + expect: image.Point{560, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{1320, 200}, + expect: image.Point{1680, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{2000, 200}, + expect: image.Point{2240, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{10000, 200}, + expect: image.Point{2240, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{480, 640}, + expect: image.Point{560, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 320}, + expect: image.Point{560, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 1320}, + expect: image.Point{560, 1680}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 2000}, + expect: image.Point{560, 2240}, + }, + { + p: ImageProcessor{maxNumTiles: 4, 
imageSize: 560}, + image: image.Point{200, 10000}, + expect: image.Point{560, 2240}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{10000, 10000}, + expect: image.Point{1120, 1120}, + }, + } + + for _, tt := range cases { + actual := tt.p.optimalTiledCanvas(tt.image) + if diff := cmp.Diff(actual, tt.expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestSplitToTiles(t *testing.T) { + cases := []struct { + imageMax image.Point + numTiles image.Point + expect []image.Image + }{ + { + imageMax: image.Point{1024, 768}, + numTiles: image.Point{1, 1}, + expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))}, + }, + { + imageMax: image.Point{1000, 500}, + numTiles: image.Point{2, 1}, + expect: []image.Image{ + image.NewRGBA(image.Rect(0, 0, 500, 500)), + image.NewRGBA(image.Rect(500, 0, 1000, 500)), + }, + }, + { + imageMax: image.Point{1000, 1000}, + numTiles: image.Point{2, 2}, + expect: []image.Image{ + image.NewRGBA(image.Rect(0, 0, 500, 500)), + image.NewRGBA(image.Rect(500, 0, 1000, 500)), + image.NewRGBA(image.Rect(0, 500, 500, 1000)), + image.NewRGBA(image.Rect(500, 500, 1000, 1000)), + }, + }, + } + + var p ImageProcessor + + for _, tt := range cases { + actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles) + + if len(actual) != len(tt.expect) { + t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect)) + } + + for i := range actual { + if actual[i].Bounds() != tt.expect[i].Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds()) + } + } + } +} + +func TestResize(t *testing.T) { + cases := []struct { + p ImageProcessor + imageMax image.Point + expectImage image.Image + expectAspectRatio image.Point + }{ + { + p: ImageProcessor{maxNumTiles: 1, imageSize: 100}, + imageMax: image.Point{200, 200}, + expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 2, imageSize: 100}, + imageMax: image.Point{200, 200}, + expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{10, 10}, + expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{2560, 1920}, + expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)), + expectAspectRatio: image.Point{2, 2}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{1024, 768}, + expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), + expectAspectRatio: image.Point{2, 2}, + }, + } + + for _, tt := range cases { + actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax}) + + if actualImage.Bounds() != tt.expectImage.Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds()) + } + + if actualAspectRatio != tt.expectAspectRatio { + t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio) + } + } +} + +func TestPad(t *testing.T) { + cases := []struct { + p ImageProcessor + imageMax image.Point + aspectRatio image.Point + expect image.Image + }{ + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{1000, 667}, + aspectRatio: image.Point{2, 2}, + expect: 
image.NewRGBA(image.Rect(0, 0, 1120, 1120)), + }, + } + + for _, tt := range cases { + actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio) + + if actual.Bounds() != tt.expect.Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds()) + } + } +} + +func TestPackImages(t *testing.T) { + cases := []struct { + imageMax image.Point + aspectRatio image.Point + expectVals int + }{ + { + imageMax: image.Point{1120, 1120}, + aspectRatio: image.Point{2, 2}, + expectVals: 2 * 2 * 3 * 560 * 560, + }, + { + imageMax: image.Point{560, 560}, + aspectRatio: image.Point{1, 1}, + expectVals: 1 * 1 * 3 * 560 * 560, + }, + { + imageMax: image.Point{1120, 560}, + aspectRatio: image.Point{1, 2}, + expectVals: 1 * 2 * 3 * 560 * 560, + }, + } + + for _, tt := range cases { + var p ImageProcessor + actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio) + if len(actualVals) != tt.expectVals { + t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals) + } + } +} + +func TestPreprocess(t *testing.T) { + cases := []struct { + imageMax image.Point + expectAspectRatioID int + }{ + { + imageMax: image.Point{10, 10}, + expectAspectRatioID: 1, + }, + { + imageMax: image.Point{1024, 768}, + expectAspectRatioID: 6, + }, + } + + p := ImageProcessor{imageSize: 560, maxNumTiles: 4} + for _, tt := range cases { + img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax})) + if err != nil { + t.Fatalf("error processing: %q", err) + } + + if len(img) == 0 { + t.Errorf("no image data returned") + } + + if aspectRatio.rank != tt.expectAspectRatioID { + t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID) + } + } +} diff --git a/runner/llamarunner/image.go b/runner/llamarunner/image.go index e7e30a4d8..1d0c1a4f5 100644 --- a/runner/llamarunner/image.go +++ b/runner/llamarunner/image.go @@ -5,7 +5,6 @@ import ( "fmt" "hash/maphash" "log/slog" - "slices" "sync" "time" @@ -18,8 +17,7 @@ type ImageContext struct { // mu is required to be held when generating embeddings or accessing the cache mu sync.Mutex - clip *llama.ClipContext - mllama *llama.MllamaContext + clip *llama.ClipContext // cache of images to embeddings images []imageCache @@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte var c ImageContext if arch == "clip" { c.clip, err = llama.NewClipContext(llamaContext, modelPath) - } else if arch == "mllama" { - c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath) } else { return nil, fmt.Errorf("unknown vision model architecture: %s", arch) } @@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) { if c.clip != nil { c.clip.Free() } - if c.mllama != nil { - c.mllama.Free() - } } -func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) { +func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) { if c == nil { return nil, nil } @@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect embed, err := c.findImage(hash) if err != nil { - if c.mllama != nil { - embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId) - if err != nil { - return nil, err - } - } else if c.clip != nil { + if c.clip != nil { embed, err = c.clip.NewEmbed(llamaContext, data) if err != nil { return nil, err @@ -105,33 +93,11 @@ func (c *ImageContext) 
BatchSize(configuredBatchSize int) int { return 0 } - // Mllama maps an image to 1 embedding token (llava creates many tokens) - // and doesn't support more than a single image per request. - // The embeddings are large (100 MB), so allocating a big batch can fail - // on some systems - if c.mllama != nil { - return 1 - } - return configuredBatchSize } func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int { - if c != nil && c.mllama != nil { - return c.mllama.EmbedSize(llamaContext) - } else { - return llamaContext.Model().NEmbd() - } -} - -func (c *ImageContext) NeedCrossAttention(inputs ...input) bool { - if c == nil || c.mllama == nil { - return false - } - - return slices.ContainsFunc(inputs, func(input input) bool { - return input.embed != nil - }) + return llamaContext.Model().NEmbd() } type imageCache struct { diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index 73e50ee0b..7aa9b96a2 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -57,10 +57,6 @@ type Sequence struct { // input cache being used by this sequence cache *InputCacheSlot - // does this sequence require cross-attention layers to be processed? - if we have seen - // an image for certain multi-modal models - crossAttention bool - // channel to send responses over responses chan string @@ -205,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error) return nil, fmt.Errorf("invalid image index: %d", n) } - embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID) + embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data) if err != nil { return nil, err } @@ -368,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) defer s.mu.Unlock() var batch *llama.Batch - crossAttention := false seqIdx := s.nextSeq - 1 for range s.seqs { @@ -416,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) batch = tokenBatch } else { batch = embedBatch - seq.crossAttention = s.image.NeedCrossAttention(input) } - } else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention { + } else if embedding != batch.IsEmbedding() { s.nextSeq = seqIdx break } @@ -427,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) break } - crossAttention = seq.crossAttention batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id) seq.pendingInputs = append(seq.pendingInputs, input) seq.iBatch = batch.NumTokens() - 1 @@ -440,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) return nil } - s.lc.SetCrossAttention(crossAttention) - err := s.lc.Decode(batch) if err != nil { return fmt.Errorf("failed to decode batch: %w", err) } - if crossAttention { - // synchronize state to ensure the cross attention batch is complete. - // needed specifically for multi-GPU systems otherwise an inflight - // task may be incorrectly invalidated causing a crash - s.lc.Synchronize() - } - for i, seq := range s.seqs { if seq == nil { continue @@ -622,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return } - seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...) 
- s.seqs[i] = seq s.cond.Signal() found = true diff --git a/server/prompt.go b/server/prompt.go index 5b5b958f1..147a02b69 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -3,47 +3,32 @@ package server import ( "bytes" "context" - "encoding/binary" "errors" "fmt" "log/slog" + "slices" "strings" "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/template" ) type tokenizeFunc func(context.Context, string) ([]int, error) -var errTooManyImages = errors.New("vision model only supports a single image per message") - // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn. // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the // latest message and 2) system messages func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) { var system []api.Message - isMllama := checkMllamaModelFamily(m) - - var imageNumTokens int // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent - if isMllama { - // Our mllama implementation packs all of the embeddings into a single token - imageNumTokens = 1 - } else { - // Clip images are represented as 768 tokens, each an embedding - imageNumTokens = 768 - } + // Clip images are represented as 768 tokens, each an embedding + imageNumTokens := 768 n := len(msgs) - 1 // in reverse, find all messages that fit into context window for i := n; i >= 0; i-- { - if isMllama && len(msgs[i].Images) > 1 { - return "", nil, errTooManyImages - } - // always include the last message if i == n { continue @@ -84,48 +69,17 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. currMsgIdx := n for cnt, msg := range msgs[currMsgIdx:] { - prefix := "" - imgPrompt := "" + if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 { + return "", nil, errors.New("this model only supports one image while more than one image requested") + } + + var prefix string prompt := msg.Content for _, i := range msg.Images { - var imgData llm.ImageData - - if isMllama { - if len(m.ProjectorPaths) == 0 { - imgData = llm.ImageData{ - ID: len(images), - Data: i, - } - } else { - data, opts, err := mllama.Preprocess(bytes.NewReader(i)) - if err != nil { - return "", nil, err - } - - buf := new(bytes.Buffer) - err = binary.Write(buf, binary.LittleEndian, data) - if err != nil { - return "", nil, err - } - - ar, ok := opts["aspectRatioIndex"].(int) - if !ok { - return "", nil, fmt.Errorf("missing aspect ratio for image") - } - - imgData = llm.ImageData{ - ID: len(images), - Data: buf.Bytes(), - AspectRatioID: ar, - } - } - imgPrompt = "<|image|>" - } else { - imgData = llm.ImageData{ - ID: len(images), - Data: i, - } + imgData := llm.ImageData{ + ID: len(images), + Data: i, } imgTag := fmt.Sprintf("[img-%d]", imgData.ID) @@ -137,7 +91,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. images = append(images, imgData) } - msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt + msgs[currMsgIdx+cnt].Content = prefix + prompt } // truncate any messages that do not fit into the context window @@ -148,12 +102,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. 
return b.String(), images, nil } - -func checkMllamaModelFamily(m *Model) bool { - for _, arch := range m.Config.ModelFamilies { - if arch == "mllama" { - return true - } - } - return false -} diff --git a/server/prompt_test.go b/server/prompt_test.go index b81c01eef..fb6c96c0c 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -2,8 +2,6 @@ package server import ( "bytes" - "image" - "image/png" "testing" "github.com/google/go-cmp/cmp" @@ -14,10 +12,9 @@ import ( func TestChatPrompt(t *testing.T) { type expect struct { - prompt string - images [][]byte - aspectRatioID int - error error + prompt string + images [][]byte + error error } tmpl, err := template.Parse(` @@ -28,28 +25,6 @@ func TestChatPrompt(t *testing.T) { t.Fatal(err) } visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}} - mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}} - - createImg := func(width, height int) ([]byte, error) { - img := image.NewRGBA(image.Rect(0, 0, width, height)) - var buf bytes.Buffer - - if err := png.Encode(&buf, img); err != nil { - return nil, err - } - - return buf.Bytes(), nil - } - - imgBuf, err := createImg(5, 5) - if err != nil { - t.Fatal(err) - } - - imgBuf2, err := createImg(6, 6) - if err != nil { - t.Fatal(err) - } cases := []struct { name string @@ -227,90 +202,6 @@ func TestChatPrompt(t *testing.T) { images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")}, }, }, - { - name: "messages with mllama (no images)", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."}, - }, - expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ", - }, - }, - { - name: "messages with mllama single prompt", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}}, - }, - expect: expect{ - prompt: "[img-0]<|image|>How many hotdogs are in this image? ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "messages with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}}, - }, - expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "multiple messages with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}}, - }, - expect: expect{ - prompt: "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. 
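// Sketch (not part of the patch): with the mllama-specific preprocessing removed, chatPrompt above
// handles every image the same way -- assign it the next sequential ID and prefix the owning
// message with an "[img-N]" tag, which is the format the remaining prompt tests expect. A
// simplified stand-alone version with hypothetical types (the elided parts of the real loop, such
// as handling tags already present in the text, are skipped here):
package main

import "fmt"

type message struct {
	content string
	images  [][]byte
}

func main() {
	msgs := []message{
		{content: "How many hotdogs are in this image?", images: [][]byte{[]byte("png bytes")}},
	}

	var images [][]byte
	for i := range msgs {
		var prefix string
		for _, img := range msgs[i].images {
			prefix += fmt.Sprintf("[img-%d]", len(images))
			images = append(images, img)
		}
		msgs[i].content = prefix + msgs[i].content
	}

	fmt.Println(msgs[0].content) // [img-0]How many hotdogs are in this image?
}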
", - images: [][]byte{imgBuf, imgBuf2}, - aspectRatioID: 1, - }, - }, - { - name: "earlier image with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}}, - {Role: "assistant", Content: "There are four hotdogs."}, - {Role: "user", Content: "Which ones have mustard?"}, - }, - expect: expect{ - prompt: "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "too many images with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}}, - }, - expect: expect{ - error: errTooManyImages, - }, - }, } for _, tt := range cases { @@ -341,10 +232,6 @@ func TestChatPrompt(t *testing.T) { if !bytes.Equal(images[i].Data, tt.images[i]) { t.Errorf("expected %q, got %q", tt.images[i], images[i].Data) } - } else { - if images[i].AspectRatioID != tt.aspectRatioID { - t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID) - } } } }) diff --git a/server/routes.go b/server/routes.go index fd65669a2..d0b8f487e 100644 --- a/server/routes.go +++ b/server/routes.go @@ -4,7 +4,6 @@ import ( "bytes" "cmp" "context" - "encoding/binary" "encoding/json" "errors" "fmt" @@ -35,7 +34,6 @@ import ( "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" - "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" @@ -100,6 +98,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C return nil, nil, nil, err } + if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 { + return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. 
To re-download, run 'ollama pull llama3.2-vision'") + } + if err := model.CheckCapabilities(caps...); err != nil { return nil, nil, nil, fmt.Errorf("%s %w", name, err) } @@ -206,38 +208,14 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } - isMllama := checkMllamaModelFamily(m) - if isMllama && len(req.Images) > 1 { - c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"}) + if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"}) return } images := make([]llm.ImageData, len(req.Images)) for i := range req.Images { - if isMllama && len(m.ProjectorPaths) > 0 { - data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i])) - if err != nil { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - ar, ok := opts["aspectRatioIndex"].(int) - if !ok { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - buf := new(bytes.Buffer) - err = binary.Write(buf, binary.LittleEndian, data) - if err != nil { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar} - } else { - images[i] = llm.ImageData{ID: i, Data: req.Images[i]} - } + images[i] = llm.ImageData{ID: i, Data: req.Images[i]} } prompt := req.Prompt @@ -269,9 +247,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { for _, i := range images { imgPrompt := "" - if isMllama { - imgPrompt = "<|image|>" - } msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)}) } diff --git a/server/sched.go b/server/sched.go index 43da138e2..3fc54e55a 100644 --- a/server/sched.go +++ b/server/sched.go @@ -8,6 +8,7 @@ import ( "os" "reflect" "runtime" + "slices" "sort" "strconv" "strings" @@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) { continue } numParallel := int(envconfig.NumParallel()) - // TODO (jmorganca): mllama doesn't support parallel yet - // see https://github.com/ollama/ollama/issues/4165 - if checkMllamaModelFamily(pending.model) && numParallel != 1 { + // `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1 + // ref: https://github.com/ollama/ollama/issues/4165 + if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 { numParallel = 1 - slog.Warn("mllama doesn't support parallel requests yet") + slog.Warn("mllama does not currently support parallel requests") } for {
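// Sketch (not part of the patch): both routes.go and sched.go above now branch on the "mllama"
// entry in the model's family list via slices.Contains -- once to reject old projector-based
// llama3.2-vision blobs, and once to clamp num_parallel to 1 because of the shared encoder cache.
// A minimal stand-alone illustration with a hypothetical config struct (not the real Model type):
package main

import (
	"fmt"
	"slices"
)

type modelConfig struct {
	ModelFamilies  []string
	ProjectorPaths []string
}

func main() {
	m := modelConfig{ModelFamilies: []string{"mllama"}, ProjectorPaths: nil}
	numParallel := 4

	if slices.Contains(m.ModelFamilies, "mllama") && len(m.ProjectorPaths) > 0 {
		fmt.Println("legacy mllama model: ask the user to re-pull it")
		return
	}

	if slices.Contains(m.ModelFamilies, "mllama") && numParallel != 1 {
		numParallel = 1 // mllama's encoder cache cannot be shared across parallel sequences
	}

	fmt.Println("num_parallel:", numParallel)
}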