From ed14ce2db87ba5e34b293a312eb5237efa9b6761 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 18 Mar 2025 14:10:16 -0700 Subject: [PATCH] convert mistral-3.1-2503 --- convert/convert.go | 4 +- convert/convert_mistral.go | 188 ++++++++++++++++++++-------------- convert/reader.go | 5 +- model/models/llama/model.go | 25 +---- model/models/mistral/model.go | 40 +++++--- parser/parser.go | 8 +- 6 files changed, 152 insertions(+), 118 deletions(-) diff --git a/convert/convert.go b/convert/convert.go index 695f5598a..b0a44d5aa 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -182,9 +182,9 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { var conv ModelConverter switch p.Architectures[0] { - case "LlamaForCausalLM": + case "LlamaForCausalLM", "MistralForCausalLM": conv = &llamaModel{} - case "MistralForCausalLM": + case "Mistral3ForConditionalGeneration": conv = &mistralModel{} case "MixtralForCausalLM": conv = &mixtralModel{} diff --git a/convert/convert_mistral.go b/convert/convert_mistral.go index 8e881974d..dc3c80fce 100644 --- a/convert/convert_mistral.go +++ b/convert/convert_mistral.go @@ -14,20 +14,39 @@ import ( type mistralModel struct { ModelParameters - NLayers uint32 `json:"n_layers"` - NumHiddenLayers uint32 `json:"num_hidden_layers"` - NLayer uint32 `json:"n_layer"` - MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` - NCtx uint32 `json:"n_ctx"` - HiddenSize uint32 `json:"hidden_size"` - NEmbd uint32 `json:"n_embd"` - IntermediateSize uint32 `json:"intermediate_size"` - NInner uint32 `json:"n_inner"` - NumAttentionHeads uint32 `json:"num_attention_heads"` - NHead uint32 `json:"n_head"` - NumKeyValueHeads uint32 `json:"num_key_value_heads"` - RopeTheta float32 `json:"rope_theta"` - RopeScaling struct { + // Text model parameters + TextConfig struct { + NumHiddenLayers uint32 `json:"num_hidden_layers"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + HiddenSize uint32 `json:"hidden_size"` + IntermediateSize uint32 `json:"intermediate_size"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + RopeTheta float32 `json:"rope_theta"` + RMSNormEPS float32 `json:"rms_norm_eps"` + HeadDim uint32 `json:"head_dim"` + } `json:"text_config"` + + // Vision model parameters + VisionConfig struct { + NumHiddenLayers uint32 `json:"num_hidden_layers"` + HiddenSize uint32 `json:"hidden_size"` + IntermediateSize uint32 `json:"intermediate_size"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + ImageSize uint32 `json:"image_size"` + PatchSize uint32 `json:"patch_size"` + RopeTheta float32 `json:"rope_theta"` + } `json:"vision_config"` + + // Multimodal specific parameters + ImageTokenIndex uint32 `json:"image_token_index"` + MultimodalProjectorBias bool `json:"multimodal_projector_bias"` + ProjectorHiddenAct string `json:"projector_hidden_act"` + SpatialMergeSize uint32 `json:"spatial_merge_size"` + VisionFeatureLayer int32 `json:"vision_feature_layer"` + + // For RoPE scaling if needed + RopeScaling struct { Type string `json:"type"` RopeType string `json:"rope_type"` Factor float32 `json:"factor"` @@ -37,44 +56,46 @@ type mistralModel struct { factors ropeFactor } `json:"rope_scaling"` - RMSNormEPS float32 `json:"rms_norm_eps"` - LayerNormEPS float32 `json:"layer_norm_eps"` - LayerNormEpsilon float32 `json:"layer_norm_epsilon"` - NormEpsilon float32 `json:"norm_epsilon"` - HeadDim uint32 `json:"head_dim"` } func (p *mistralModel) KV(t *Tokenizer) ggml.KV { kv := 
p.ModelParameters.KV(t) kv["general.architecture"] = "mistral" kv["mistral.vocab_size"] = p.VocabSize + kv["mistral.image_token_index"] = p.ImageTokenIndex + kv["mistral.multimodal_projector_bias"] = p.MultimodalProjectorBias + kv["mistral.projector_hidden_act"] = p.ProjectorHiddenAct + kv["mistral.spatial_merge_size"] = p.SpatialMergeSize + // kv["mistral.vision_feature_layer"] = p.VisionFeatureLayer - kv["mistral.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) + // Text model config + kv["mistral.block_count"] = p.TextConfig.NumHiddenLayers + kv["mistral.context_length"] = p.TextConfig.MaxPositionEmbeddings + kv["mistral.embedding_length"] = p.TextConfig.HiddenSize + kv["mistral.feed_forward_length"] = p.TextConfig.IntermediateSize + kv["mistral.attention.head_count"] = p.TextConfig.NumAttentionHeads + kv["mistral.attention.head_count_kv"] = p.TextConfig.NumKeyValueHeads + kv["mistral.rope.dimension_count"] = p.TextConfig.HiddenSize / p.TextConfig.NumAttentionHeads + kv["mistral.rope.freq_base"] = p.TextConfig.RopeTheta + kv["mistral.attention.layer_norm_rms_epsilon"] = p.TextConfig.RMSNormEPS + kv["mistral.attention.key_length"] = p.TextConfig.HeadDim + kv["mistral.attention.value_length"] = p.TextConfig.HeadDim - if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 { - kv["mistral.context_length"] = contextLength - } - - if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 { - kv["mistral.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) - } - - if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 { - kv["mistral.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner) - } - - kv["mistral.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) - kv["mistral.rope.dimension_count"] = p.HiddenSize / cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) - - if p.RopeTheta > 0 { - kv["mistral.rope.freq_base"] = p.RopeTheta - } + // Vision model config + kv["mistral.vision.block_count"] = p.VisionConfig.NumHiddenLayers + kv["mistral.vision.embedding_length"] = p.VisionConfig.HiddenSize + kv["mistral.vision.feed_forward_length"] = p.VisionConfig.IntermediateSize + kv["mistral.vision.attention.head_count"] = p.VisionConfig.NumAttentionHeads + kv["mistral.vision.image_size"] = p.VisionConfig.ImageSize + kv["mistral.vision.patch_size"] = p.VisionConfig.PatchSize + kv["mistral.vision.rope.freq_base"] = p.VisionConfig.RopeTheta + // If RoPE scaling is present if p.RopeScaling.Type == "linear" { kv["mistral.rope.scaling.type"] = p.RopeScaling.Type kv["mistral.rope.scaling.factor"] = p.RopeScaling.Factor } else if p.RopeScaling.RopeType == "llama3" { - dim := p.HiddenSize / p.NumAttentionHeads + dim := p.TextConfig.HiddenSize / p.TextConfig.NumAttentionHeads for i := uint32(0); i < dim; i += 2 { factor := cmp.Or(p.RopeScaling.Factor, 8.0) factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0) @@ -84,7 +105,7 @@ func (p *mistralModel) KV(t *Tokenizer) ggml.KV { lambdaLow := float32(original) / factorLow lambdaHigh := float32(original) / factorHigh - lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim)) + lambda := 2 * math.Pi * math.Pow(float64(p.TextConfig.RopeTheta), float64(i)/float64(dim)) if lambda < float64(lambdaHigh) { p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0) } else if lambda > float64(lambdaLow) { @@ -96,23 +117,6 @@ func (p *mistralModel) KV(t *Tokenizer) ggml.KV { } } - if p.NumKeyValueHeads > 0 { - kv["mistral.attention.head_count_kv"] = 
p.NumKeyValueHeads - } - - if p.RMSNormEPS > 0 { - kv["mistral.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS - } - - if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 { - kv["mistral.attention.layer_norm_epsilon"] = layerNormEpsilon - } - - if p.HeadDim > 0 { - kv["mistral.attention.key_length"] = p.HeadDim - kv["mistral.attention.value_length"] = p.HeadDim - } - return kv } @@ -129,18 +133,13 @@ func (p *mistralModel) Tensors(ts []Tensor) []ggml.Tensor { } for _, t := range ts { + // Process tensors that require repacking if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") { t.SetRepacker(p.repack) } - if strings.HasPrefix(t.Name(), "patch_merger.") || - strings.HasPrefix(t.Name(), "pre_mm_projector_output_norm.") || - strings.HasPrefix(t.Name(), "vision_encoder.") || - strings.HasPrefix(t.Name(), "vision_language_adapter.") { - continue - } - + // Add all tensors to output out = append(out, ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), @@ -154,19 +153,42 @@ func (p *mistralModel) Tensors(ts []Tensor) []ggml.Tensor { func (p *mistralModel) Replacements() []string { return []string{ - "tok_embeddings", "token_embd", - "norm", "output_norm", - "layers", "blk", - "attention_norm", "attn_norm", - "attention.wq", "attn_q", - "attention.wk", "attn_k", - "attention.wv", "attn_v", - "attention.wo", "attn_output", - "feed_forward.w1", "ffn_gate", - "feed_forward.w2", "ffn_down", - "feed_forward.w3", "ffn_up", - "ffn_norm", "ffn_norm", - "output", "output", + // Language model replacements + "language_model.model.embed_tokens", "token_embd", + "language_model.model.norm", "output_norm", + "language_model.model.layers", "blk", + "language_model.model.layers.*.input_layernorm", "input_layernorm", + "language_model.model.layers.*.self_attn.q_proj", "self_attn.q_proj", + "language_model.model.layers.*.self_attn.k_proj", "self_attn.k_proj", + "language_model.model.layers.*.self_attn.v_proj", "self_attn.v_proj", + "language_model.model.layers.*.self_attn.o_proj", "self_attn.o_proj", + "language_model.model.layers.*.mlp.gate_proj", "mlp.gate_proj", + "language_model.model.layers.*.mlp.down_proj", "mlp.down_proj", + "language_model.model.layers.*.mlp.up_proj", "mlp.up_proj", + "language_model.model.layers.*.post_attention_layernorm", "post_attention_layernorm", + "language_model.lm_head", "output", + + // Vision model replacements - map to shorter prefixes + "vision_tower", "v", + "multi_modal_projector", "mm", + + // Vision transformer blocks - these should be updated accordingly + "vision_tower.transformer.layers", "v.blk", + "vision_tower.transformer.layers.*.attention_norm", "v.attn_norm", + "vision_tower.transformer.layers.*.attention.q_proj", "v.attn_q", + "vision_tower.transformer.layers.*.attention.k_proj", "v.attn_k", + "vision_tower.transformer.layers.*.attention.v_proj", "v.attn_v", + "vision_tower.transformer.layers.*.attention.o_proj", "v.attn_output", + "vision_tower.transformer.layers.*.feed_forward.gate_proj", "v.ffn_gate", + "vision_tower.transformer.layers.*.feed_forward.down_proj", "v.ffn_down", + "vision_tower.transformer.layers.*.feed_forward.up_proj", "v.ffn_up", + "vision_tower.transformer.layers.*.ffn_norm", "v.ffn_norm", + "vision_tower.ln_pre", "v.encoder_norm", + "vision_tower.patch_conv", "v.patch_conv", + + // Multimodal projector components + "multi_modal_projector.patch_merger", "mm.patch_merger", + "multi_modal_projector.norm", "mm.norm", } } @@ -178,9 +200,17 @@ func (p 
*mistralModel) repack(name string, data []float32, shape []uint64) ([]fl var heads uint32 if strings.HasSuffix(name, "attn_q.weight") { - heads = p.NumAttentionHeads + if strings.Contains(name, "vision") { + heads = p.VisionConfig.NumAttentionHeads + } else { + heads = p.TextConfig.NumAttentionHeads + } } else if strings.HasSuffix(name, "attn_k.weight") { - heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) + if strings.Contains(name, "vision") { + heads = p.VisionConfig.NumAttentionHeads + } else { + heads = cmp.Or(p.TextConfig.NumKeyValueHeads, p.TextConfig.NumAttentionHeads) + } } else { return nil, fmt.Errorf("unknown tensor for repack: %s", name) } diff --git a/convert/reader.go b/convert/reader.go index 904b13a42..c1218e66d 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -62,7 +62,10 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { Pattern string Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error) }{ - {"*.safetensors", parseSafetensors}, + {"model-*-of-*.safetensors", parseSafetensors}, + {"model.safetensors", parseSafetensors}, + {"adapters.safetensors", parseSafetensors}, + {"adapter_model.safetensors", parseSafetensors}, {"pytorch_model-*-of-*.bin", parseTorch}, {"pytorch_model.bin", parseTorch}, {"consolidated.*.pth", parseTorch}, diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 47a88043e..19a2ab8c4 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -13,9 +13,9 @@ import ( ) type Options struct { - hiddenSize, numHeads, numKVHeads, headDim int - eps, ropeBase, ropeScale float32 - ropeDim uint32 + hiddenSize, numHeads, numKVHeads int + eps, ropeBase, ropeScale float32 + ropeDim uint32 } type Model struct { @@ -37,8 +37,6 @@ func New(c ml.Config) (model.Model, error) { m := Model{ BytePairEncoding: model.NewBytePairEncoding( - // TODO: need to set this in the conversion for mistral: - // tokenizer.ggml.pretokenizer = [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+ c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), &model.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), @@ -55,7 +53,6 @@ func New(c ml.Config) (model.Model, error) { hiddenSize: int(c.Uint("embedding_length")), numHeads: int(c.Uint("attention.head_count")), numKVHeads: int(c.Uint("attention.head_count_kv")), - headDim: int(c.Uint("attention.key_length")), eps: c.Float("attention.layer_norm_rms_epsilon"), ropeBase: c.Float("rope.freq_base"), ropeScale: c.Float("rope.freq_scale", 1), @@ -78,36 +75,24 @@ type SelfAttention struct { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { batchSize := hiddenState.Dim(1) + headDim := opts.hiddenSize / opts.numHeads ropeType := uint32(0) - // Get head dimension - use explicit value if available, otherwise calculate - headDim := opts.headDim - if headDim == 0 { - headDim = opts.hiddenSize / opts.numHeads - } - // Query projection and reshape q := sa.Query.Forward(ctx, hiddenState) q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) - // Key projection and reshape k := sa.Key.Forward(ctx, hiddenState) k = 
k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) - // Value projection and reshape v := sa.Value.Forward(ctx, hiddenState) v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) - // Attention computation scaleFactor := 1.0 / math.Sqrt(float64(headDim)) kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache) + kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize) - // Reshape attention output for final projection - outputDim := headDim * opts.numHeads - kqv = kqv.Reshape(ctx, outputDim, batchSize) - - // Apply output projection return sa.Output.Forward(ctx, kqv) } diff --git a/model/models/mistral/model.go b/model/models/mistral/model.go index 9ebcff3c0..8bfd1a798 100644 --- a/model/models/mistral/model.go +++ b/model/models/mistral/model.go @@ -37,10 +37,7 @@ func New(c ml.Config) (model.Model, error) { m := Model{ BytePairEncoding: model.NewBytePairEncoding( - // TODO: need to set this in the conversion for mistral: - // tokenizer.ggml.pretokenizer = [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+ - c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), - // c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), &model.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Uints("tokenizer.ggml.token_type"), @@ -64,16 +61,29 @@ func New(c ml.Config) (model.Model, error) { }, } + fmt.Println("Model Parameters:") + fmt.Printf(" model_type: %q\n", "gpt2") + fmt.Printf(" vocab_size: %d\n", len(c.Strings("tokenizer.ggml.tokens"))) + fmt.Printf(" hidden_size: %d\n", m.Options.hiddenSize) + fmt.Printf(" num_hidden_layers: %d\n", c.Uint("block_count")) + fmt.Printf(" num_attention_heads: %d\n", m.Options.numHeads) + fmt.Printf(" num_key_value_heads: %d\n", m.Options.numKVHeads) + fmt.Printf(" rms_norm_eps: %g\n", m.Options.eps) + fmt.Printf(" rope_theta: %g\n", m.Options.ropeBase) + fmt.Printf(" bos_token_id: %d\n", c.Uint("tokenizer.ggml.bos_token_id")) + fmt.Printf(" eos_token_id: %d\n", c.Uint("tokenizer.ggml.eos_token_id")) + fmt.Printf(" pad_token_id: %d\n", c.Uint("tokenizer.ggml.pad_token_id", 0)) + m.Cache = kvcache.NewCausalCache(m.Shift) return &m, nil } type SelfAttention struct { - Query *nn.Linear `gguf:"attn_q"` - Key *nn.Linear `gguf:"attn_k"` - Value *nn.Linear `gguf:"attn_v"` - Output *nn.Linear `gguf:"attn_output"` + Query *nn.Linear `gguf:"self_attn.q_proj"` + Key *nn.Linear `gguf:"self_attn.k_proj"` + Value *nn.Linear `gguf:"self_attn.v_proj"` + Output *nn.Linear `gguf:"self_attn.o_proj"` RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` } @@ -117,9 +127,9 @@ func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tenso } type MLP struct { - Up *nn.Linear `gguf:"ffn_up"` - Down *nn.Linear `gguf:"ffn_down"` - Gate *nn.Linear 
`gguf:"ffn_gate"` + Up *nn.Linear `gguf:"mlp.up_proj"` + Down *nn.Linear `gguf:"mlp.down_proj"` + Gate *nn.Linear `gguf:"mlp.gate_proj"` } func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor { @@ -128,9 +138,9 @@ func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml } type Layer struct { - AttentionNorm *nn.RMSNorm `gguf:"attn_norm"` + AttentionNorm *nn.RMSNorm `gguf:"input_layernorm"` SelfAttention *SelfAttention - MLPNorm *nn.RMSNorm `gguf:"ffn_norm"` + MLPNorm *nn.RMSNorm `gguf:"post_attention_layernorm"` MLP *MLP } @@ -171,6 +181,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { return nil, err } + // Get token embeddings hiddenState := m.TokenEmbedding.Forward(ctx, inputs) for i, layer := range m.Layers { @@ -184,7 +195,10 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options) } + // Apply output normalization hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) + + // Apply output projection return m.Output.Forward(ctx, hiddenState), nil } diff --git a/parser/parser.go b/parser/parser.go index eb916cbfb..6832351fb 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -211,9 +211,7 @@ func filesForModel(path string) ([]string, error) { } var files []string - if st, _ := glob(filepath.Join(path, "consolidated.safetensors"), "application/octet-stream"); len(st) > 0 { - files = append(files, st...) - } else if st, _ := glob(filepath.Join(path, "model*.safetensors"), "application/octet-stream"); len(st) > 0 { + if st, _ := glob(filepath.Join(path, "model*.safetensors"), "application/octet-stream"); len(st) > 0 { // safetensors files might be unresolved git lfs references; skip if they are // covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors files = append(files, st...) @@ -224,6 +222,10 @@ func filesForModel(path string) ([]string, error) { // covers adapter_model.safetensors files = append(files, st...) } else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 { + // pytorch files might also be unresolved git lfs references; skip if they are + // covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin + files = append(files, pt...) + } else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/zip"); len(pt) > 0 { // pytorch files might also be unresolved git lfs references; skip if they are // covers consolidated.x.pth, consolidated.pth files = append(files, pt...)