From 7ba9fa9c7d0bc73abacca88d6827d973d7ba92cf Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Mon, 21 Apr 2025 10:45:56 -0700
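Subject: [PATCH] fixes for maverick

* convert: write the feed-forward sizes as llama4.feed_forward_length and
  llama4.expert_feed_forward_length instead of llama4.intermediate_size
  and llama4.intermediate_size_moe
* replace the fallback tokenizer.ggml.pretokenizer pattern
* pass 8192 as the default for attention.chunk_size
* apply QK norm whenever use_qk_norm is set, not only on layers that use
  RoPE
* add attention temperature tuning: scale queries per position on layers
  that do not use RoPE (attention.temperature_tuning, attention.scale,
  attention.floor_scale)
* read no_rope_interval (default 4) instead of hard-coding 4
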
---
 convert/convert_llama4.go         |  4 +-
 model/models/llama4/model.go      |  5 ++-
 model/models/llama4/model_text.go | 70 +++++++++++++++++++++----------
 3 files changed, 53 insertions(+), 26 deletions(-)

diff --git a/convert/convert_llama4.go b/convert/convert_llama4.go
index 9aa0382ff..26a230b33 100644
--- a/convert/convert_llama4.go
+++ b/convert/convert_llama4.go
@@ -45,8 +45,8 @@ func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
 		}
 	}
 
-	kv["llama4.intermediate_size"] = p.TextModel.IntermediateSizeMLP
-	kv["llama4.intermediate_size_moe"] = p.TextModel.IntermediateSize
+	kv["llama4.feed_forward_length"] = p.TextModel.IntermediateSizeMLP
+	kv["llama4.expert_feed_forward_length"] = p.TextModel.IntermediateSize
 
 	kv["llama4.expert_count"] = p.TextModel.NumLocalExperts
 	kv["llama4.expert_used_count"] = p.TextModel.NumExpertsPerToken
diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go
index 8f80c1dd4..53dc986a0 100644
--- a/model/models/llama4/model.go
+++ b/model/models/llama4/model.go
@@ -35,7 +35,8 @@ func (p *Projector) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
 func New(c fs.Config) (model.Model, error) {
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			c.String("tokenizer.ggml.pretokenizer",
+				`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Uints("tokenizer.ggml.token_type"),
@@ -52,7 +53,7 @@ func New(c fs.Config) (model.Model, error) {
 	}
 
 	m.Cache = kvcache.NewWrapperCache(
-		kvcache.NewChunkedAttentionCache(int32(c.Uint("attention.chunk_size")), m.Shift),
+		kvcache.NewChunkedAttentionCache(int32(c.Uint("attention.chunk_size", 8192)), m.Shift),
 		kvcache.NewCausalCache(m.Shift),
 	)
 
diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go
index c7ceceec5..3f9f578f1 100644
--- a/model/models/llama4/model_text.go
+++ b/model/models/llama4/model_text.go
@@ -19,7 +19,7 @@ type TextAttention struct {
 	RopeFactors ml.Tensor  `gguf:"rope_factors"`
 }
 
-func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor {
+func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attentionScales ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor {
 	batchSize, headDim := hiddenStates.Dim(1), cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
 
 	query := sa.Query.Forward(ctx, hiddenStates)
@@ -33,11 +33,15 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tens
 	if useRope {
 		query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
 		key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
+	}
 
-		if opts.useQKNorm {
-			query = query.RMSNorm(ctx, nil, opts.eps)
-			key = key.RMSNorm(ctx, nil, opts.eps)
-		}
+	if opts.useQKNorm {
+		query = query.RMSNorm(ctx, nil, opts.eps)
+		key = key.RMSNorm(ctx, nil, opts.eps)
+	}
+
+	if attentionScales != nil && !useRope {
+		query = query.Mul(ctx, attentionScales)
 	}
 
 	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), cache)
@@ -82,7 +86,7 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
 	return nextStates
 }
 
-// TextSharedExpert is TextMLP with different names
+// TextSharedExpert is TextMLP with different tensor names
 type TextSharedExpert struct {
 	Gate *nn.Linear `gguf:"ffn_gate_shexp"`
 	Up   *nn.Linear `gguf:"ffn_up_shexp"`
@@ -122,12 +126,12 @@ type TextLayer struct {
 	FeedForward TextFeedForward
 }
 
-func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor {
+func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, attentionScales, outputs ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor {
 	residual := hiddenStates
 
 	// self attention
 	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, useRope, opts)
+	hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, attentionScales, cache, useRope, opts)
 
 	if outputs != nil {
 		hiddenStates = hiddenStates.Rows(ctx, outputs)
@@ -151,7 +155,11 @@ type TextOptions struct {
 	ropeBase, ropeScale           float32
 	eps                           float32
 	interleaveLayerStep           int
+	noRopeInterval                int
 	useQKNorm                     bool
+	attentionTemperatureTuning    bool
+	attentionScale                float64
+	attentionFloorScale           float64
 }
 
 type TextModel struct {
@@ -178,18 +186,22 @@ func newTextModel(c fs.Config) *TextModel {
 	return &TextModel{
 		Layers: layers,
 		TextOptions: &TextOptions{
-			hiddenSize:          int(c.Uint("embedding_length")),
-			numHeads:            int(c.Uint("attention.head_count")),
-			numKVHeads:          int(c.Uint("attention.head_count_kv")),
-			headDim:             int(c.Uint("attention.head_dim", 128)),
-			numExperts:          int(c.Uint("expert_count")),
-			numExpertsUsed:      int(c.Uint("expert_used_count")),
-			ropeDim:             int(c.Uint("rope.dimension_count")),
-			ropeBase:            c.Float("rope.freq_base"),
-			ropeScale:           c.Float("rope.freq_scale", 1),
-			eps:                 c.Float("attention.layer_norm_rms_epsilon"),
-			interleaveLayerStep: int(c.Uint("interleave_moe_layer_step", 1)),
-			useQKNorm:           c.Bool("use_qk_norm", true),
+			hiddenSize:                 int(c.Uint("embedding_length")),
+			numHeads:                   int(c.Uint("attention.head_count")),
+			numKVHeads:                 int(c.Uint("attention.head_count_kv")),
+			headDim:                    int(c.Uint("attention.head_dim", 128)),
+			numExperts:                 int(c.Uint("expert_count")),
+			numExpertsUsed:             int(c.Uint("expert_used_count")),
+			ropeDim:                    int(c.Uint("rope.dimension_count")),
+			ropeBase:                   c.Float("rope.freq_base"),
+			ropeScale:                  c.Float("rope.freq_scale", 1),
+			eps:                        c.Float("attention.layer_norm_rms_epsilon"),
+			interleaveLayerStep:        int(c.Uint("interleave_moe_layer_step", 1)),
+			noRopeInterval:             int(c.Uint("no_rope_interval", 4)),
+			useQKNorm:                  c.Bool("use_qk_norm", true),
+			attentionTemperatureTuning: c.Bool("attention.temperature_tuning", true),
+			attentionScale:             float64(c.Float("attention.scale", 0.1)),
+			attentionFloorScale:        float64(c.Float("attention.floor_scale", 8192)),
 		},
 	}
 }
@@ -207,11 +219,25 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
 	}
 
+	var attentionScales ml.Tensor
+	if m.attentionTemperatureTuning {
+		scales := make([]float32, len(batch.Positions))
+		for i, p := range batch.Positions {
+			scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
+		}
+
+		var err error
+		attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
+		if err != nil {
+			panic(err)
+		}
+	}
+
 	for i, layer := range m.Layers {
 		cache.SetLayer(i)
 		wc := cache.(*kvcache.WrapperCache)
 		wc.SetLayerType(1)
-		useChunkedAttention := (i+1)%4 != 0
+		useChunkedAttention := (i+1)%m.noRopeInterval != 0
 		if useChunkedAttention {
 			wc.SetLayerType(0)
 		}
@@ -221,7 +247,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 			lastLayerOutputs = outputs
 		}
 
-		hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, useChunkedAttention, m.TextOptions)
+		hiddenStates = layer.Forward(ctx, hiddenStates, positions, attentionScales, lastLayerOutputs, cache, useChunkedAttention, m.TextOptions)
 	}
 
 	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)