From 7ba9fa9c7d0bc73abacca88d6827d973d7ba92cf Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 21 Apr 2025 10:45:56 -0700 Subject: [PATCH] fixes for maverick --- convert/convert_llama4.go | 4 +- model/models/llama4/model.go | 5 ++- model/models/llama4/model_text.go | 70 +++++++++++++++++++++---------- 3 files changed, 53 insertions(+), 26 deletions(-) diff --git a/convert/convert_llama4.go b/convert/convert_llama4.go index 9aa0382ff..26a230b33 100644 --- a/convert/convert_llama4.go +++ b/convert/convert_llama4.go @@ -45,8 +45,8 @@ func (p *llama4Model) KV(t *Tokenizer) ggml.KV { } } - kv["llama4.intermediate_size"] = p.TextModel.IntermediateSizeMLP - kv["llama4.intermediate_size_moe"] = p.TextModel.IntermediateSize + kv["llama4.feed_forward_length"] = p.TextModel.IntermediateSizeMLP + kv["llama4.expert_feed_forward_length"] = p.TextModel.IntermediateSize kv["llama4.expert_count"] = p.TextModel.NumLocalExperts kv["llama4.expert_used_count"] = p.TextModel.NumExpertsPerToken diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go index 8f80c1dd4..53dc986a0 100644 --- a/model/models/llama4/model.go +++ b/model/models/llama4/model.go @@ -35,7 +35,8 @@ func (p *Projector) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor { func New(c fs.Config) (model.Model, error) { m := Model{ BytePairEncoding: model.NewBytePairEncoding( - c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + c.String("tokenizer.ggml.pretokenizer", + `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), &model.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Uints("tokenizer.ggml.token_type"), @@ -52,7 +53,7 @@ func New(c fs.Config) (model.Model, error) { } m.Cache = kvcache.NewWrapperCache( - kvcache.NewChunkedAttentionCache(int32(c.Uint("attention.chunk_size")), m.Shift), + kvcache.NewChunkedAttentionCache(int32(c.Uint("attention.chunk_size", 8192)), m.Shift), kvcache.NewCausalCache(m.Shift), ) diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go index c7ceceec5..3f9f578f1 100644 --- a/model/models/llama4/model_text.go +++ b/model/models/llama4/model_text.go @@ -19,7 +19,7 @@ type TextAttention struct { RopeFactors ml.Tensor `gguf:"rope_factors"` } -func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor { +func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attentionScales ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor { batchSize, headDim := hiddenStates.Dim(1), cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads) query := sa.Query.Forward(ctx, hiddenStates) @@ -33,11 +33,15 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tens if useRope { query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale) key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale) + } - if opts.useQKNorm { - query = query.RMSNorm(ctx, nil, opts.eps) - key = key.RMSNorm(ctx, nil, opts.eps) - } + if opts.useQKNorm { + query = query.RMSNorm(ctx, nil, opts.eps) + key = key.RMSNorm(ctx, nil, opts.eps) + } + + if attentionScales != nil && !useRope { + query = query.Mul(ctx, attentionScales) } attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), cache) @@ -82,7 +86,7 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens return nextStates } -// TextSharedExpert is TextMLP with different names +// TextSharedExpert is TextMLP with different tensor names type TextSharedExpert struct { Gate *nn.Linear `gguf:"ffn_gate_shexp"` Up *nn.Linear `gguf:"ffn_up_shexp"` @@ -122,12 +126,12 @@ type TextLayer struct { FeedForward TextFeedForward } -func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor { +func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, attentionScales, outputs ml.Tensor, cache kvcache.Cache, useRope bool, opts *TextOptions) ml.Tensor { residual := hiddenStates // self attention hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps) - hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, useRope, opts) + hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, attentionScales, cache, useRope, opts) if outputs != nil { hiddenStates = hiddenStates.Rows(ctx, outputs) @@ -151,7 +155,11 @@ type TextOptions struct { ropeBase, ropeScale float32 eps float32 interleaveLayerStep int + noRopeInterval int useQKNorm bool + attentionTemperatureTuning bool + attentionScale float64 + attentionFloorScale float64 } type TextModel struct { @@ -178,18 +186,22 @@ func newTextModel(c fs.Config) *TextModel { return &TextModel{ Layers: layers, TextOptions: &TextOptions{ - hiddenSize: int(c.Uint("embedding_length")), - numHeads: int(c.Uint("attention.head_count")), - numKVHeads: int(c.Uint("attention.head_count_kv")), - headDim: int(c.Uint("attention.head_dim", 128)), - numExperts: int(c.Uint("expert_count")), - numExpertsUsed: int(c.Uint("expert_used_count")), - ropeDim: int(c.Uint("rope.dimension_count")), - ropeBase: c.Float("rope.freq_base"), - ropeScale: c.Float("rope.freq_scale", 1), - eps: c.Float("attention.layer_norm_rms_epsilon"), - interleaveLayerStep: int(c.Uint("interleave_moe_layer_step", 1)), - useQKNorm: c.Bool("use_qk_norm", true), + hiddenSize: int(c.Uint("embedding_length")), + numHeads: int(c.Uint("attention.head_count")), + numKVHeads: int(c.Uint("attention.head_count_kv")), + headDim: int(c.Uint("attention.head_dim", 128)), + numExperts: int(c.Uint("expert_count")), + numExpertsUsed: int(c.Uint("expert_used_count")), + ropeDim: int(c.Uint("rope.dimension_count")), + ropeBase: c.Float("rope.freq_base"), + ropeScale: c.Float("rope.freq_scale", 1), + eps: c.Float("attention.layer_norm_rms_epsilon"), + interleaveLayerStep: int(c.Uint("interleave_moe_layer_step", 1)), + noRopeInterval: int(c.Uint("no_rope_interval", 4)), + useQKNorm: c.Bool("use_qk_norm", true), + attentionTemperatureTuning: c.Bool("attention.temperature_tuning", true), + attentionScale: float64(c.Float("attention.scale", 0.1)), + attentionFloorScale: float64(c.Float("attention.floor_scale", 8192)), }, } } @@ -207,11 +219,25 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1)))) } + var attentionScales ml.Tensor + if m.attentionTemperatureTuning { + scales := make([]float32, len(batch.Positions)) + for i, p := range batch.Positions { + scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0) + } + + var err error + attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales)) + if err != nil { + panic(err) + } + } + for i, layer := range m.Layers { cache.SetLayer(i) wc := cache.(*kvcache.WrapperCache) wc.SetLayerType(1) - useChunkedAttention := (i+1)%4 != 0 + useChunkedAttention := (i+1)%m.noRopeInterval != 0 if useChunkedAttention { wc.SetLayerType(0) } @@ -221,7 +247,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor lastLayerOutputs = outputs } - hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, useChunkedAttention, m.TextOptions) + hiddenStates = layer.Forward(ctx, hiddenStates, positions, attentionScales, lastLayerOutputs, cache, useChunkedAttention, m.TextOptions) } hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)