From 3241b457904a91c709b4c77075a50917fab43aaa Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 13 Feb 2025 14:22:22 -0800
Subject: [PATCH] add test

---
 fs/ggml/ggml.go      |  42 ++++++------
 fs/ggml/ggml_test.go | 155 +++++++++++++++++++++++++++++++++++++++++++
 fs/ggml/type.go      |  11 ++-
 llm/memory.go        |   6 +-
 4 files changed, 187 insertions(+), 27 deletions(-)
 create mode 100644 fs/ggml/ggml_test.go

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 698caebcb..dd8a01d95 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -153,7 +153,7 @@ func (s Tensors) Items(prefix ...string) []*Tensor {
 	return items
 }
 
-func (ts Tensors) Layers() map[string]Layer {
+func (ts Tensors) GroupLayers() map[string]Layer {
 	layers := make(map[string]Layer)
 	for _, t := range ts.items {
 		parts := strings.Split(t.Name, ".")
@@ -377,22 +377,22 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
-	embedding := llm.KV().EmbeddingLength()
-	heads := llm.KV().HeadCount()
-	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
+func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
+	embedding := f.KV().EmbeddingLength()
+	heads := f.KV().HeadCount()
+	headsKV := f.KV().HeadCountKV()
+	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
 
-	embeddingHeads := llm.KV().EmbeddingHeadCount()
-	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
-	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
+	embeddingHeads := f.KV().EmbeddingHeadCount()
+	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
 
-	layers := llm.Tensors().Layers()
+	layers := f.Tensors().GroupLayers()
 
 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
-	kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+	kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
 
-	switch llm.KV().Architecture() {
+	switch f.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
 			4*batch*(1+4*embedding+context*(1+heads)),
@@ -407,7 +407,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 
 		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
 			// mixtral 8x22b
-			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
+			ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
 			partialOffload = max(
 				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
 				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
@@ -424,11 +424,11 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 	case "mllama":
 		var visionTokens, tiles uint64 = 1601, 4
 
-		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+		if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
 			kv = headsKV *
 				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
 				(2* // sizeof(float16)
-					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					(f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
 					context +
 					4* // sizeof(float32)
 						uint64(crossAttentionLayers.size)* // num cross attention layers
@@ -443,7 +443,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 			)
 
 			var ropeFreqsCount uint64
-			if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
+			if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
 				if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
 					ropeFreqsCount = ropeFreqsWeights.parameters()
 				}
@@ -547,20 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 }
 
 // SupportsKVCacheType checks if the requested cache type is supported
-func (llm GGML) SupportsKVCacheType(cacheType string) bool {
+func (f GGML) SupportsKVCacheType(cacheType string) bool {
 	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
-func (llm GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
+func (f GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}
 
 	// Check head counts match and are non-zero
-	headCountK := llm.KV().EmbeddingHeadCountK()
-	headCountV := llm.KV().EmbeddingHeadCountV()
+	headCountK := f.KV().EmbeddingHeadCountK()
+	headCountV := f.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }
 
diff --git a/fs/ggml/ggml_test.go b/fs/ggml/ggml_test.go
new file mode 100644
index 000000000..93aa95adb
--- /dev/null
+++ b/fs/ggml/ggml_test.go
@@ -0,0 +1,155 @@
+package ggml
+
+import (
+	"maps"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestTensorLayers(t *testing.T) {
+	tensors := make(map[string]*Tensor)
+	for _, name := range []string{
+		"token_embd.weight",
+		"blk.0.attn_k.weight",
+		"blk.0.attn_output.weight",
+		"blk.0.attn_q.weight",
+		"blk.0.attn_v.weight",
+		"blk.0.attn_norm.weight",
+		"blk.0.ffn_down.weight",
+		"blk.0.ffn_gate.weight",
+		"blk.0.ffn_up.weight",
+		"blk.0.ffn_norm.weight",
+		"output_norm.weight",
+		"mm.0.bias",
+		"mm.0.weight",
+		"v.blk.0.attn_k.weight",
+		"v.blk.0.attn_output.weight",
+		"v.blk.0.attn_q.weight",
+		"v.blk.0.attn_v.weight",
+		"v.blk.0.attn_norm.weight",
+		"v.blk.0.ffn_down.weight",
+		"v.blk.0.ffn_gate.weight",
+		"v.blk.0.ffn_up.weight",
+		"v.blk.0.ffn_norm.weight",
+		"v.patch_embd.weight",
+		"v.position_embd.gate",
+		"v.position_embd.weight",
+	} {
+		tensors[name] = &Tensor{Name: name}
+	}
+
+	cases := []struct {
+		name  string
+		items []*Tensor
+		want  map[string]Layer
+	}{
+		{
+			name: "text",
+			items: slices.Collect(func(yield func(*Tensor) bool) {
+				for k, v := range tensors {
+					if !strings.HasPrefix(k, "mm.") && !strings.HasPrefix(k, "v.") {
+						if !yield(v) {
+							return
+						}
+					}
+				}
+			}),
+			want: map[string]Layer{
+				"blk.0": {
+					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
+					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
+					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
+					"attn_output.weight": tensors["blk.0.attn_output.weight"],
+					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
+					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
+					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
+					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
+					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
+				},
+				"token_embd":  {"weight": tensors["token_embd.weight"]},
+				"output_norm": {"weight": tensors["output_norm.weight"]},
+			},
+		},
+		{
+			name: "vision",
+			items: slices.Collect(func(yield func(*Tensor) bool) {
+				for k, v := range tensors {
+					if strings.HasPrefix(k, "mm.") || strings.HasPrefix(k, "v.") {
+						if !yield(v) {
+							return
+						}
+					}
+				}
+			}),
+			want: map[string]Layer{
+				"mm": {
+					"0.bias":   tensors["mm.0.bias"],
+					"0.weight": tensors["mm.0.weight"],
+				},
+				"v": {
+					"blk.0.attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
+					"blk.0.attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
+					"blk.0.attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
+					"blk.0.attn_output.weight": tensors["v.blk.0.attn_output.weight"],
+					"blk.0.attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
+					"blk.0.ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
+					"blk.0.ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
+					"blk.0.ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
+					"blk.0.ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
+					"patch_embd.weight":        tensors["v.patch_embd.weight"],
+					"position_embd.gate":       tensors["v.position_embd.gate"],
+					"position_embd.weight":     tensors["v.position_embd.weight"],
+				},
+			},
+		},
+		{
+			name:  "vision and text",
+			items: slices.Collect(maps.Values(tensors)),
+			want: map[string]Layer{
+				"blk.0": {
+					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
+					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
+					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
+					"attn_output.weight": tensors["blk.0.attn_output.weight"],
+					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
+					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
+					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
+					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
+					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
+				},
+				"token_embd":  {"weight": tensors["token_embd.weight"]},
+				"output_norm": {"weight": tensors["output_norm.weight"]},
+				"mm": {
+					"0.bias":   tensors["mm.0.bias"],
+					"0.weight": tensors["mm.0.weight"],
+				},
+				"v": {
+					"blk.0.attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
+					"blk.0.attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
+					"blk.0.attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
+					"blk.0.attn_output.weight": tensors["v.blk.0.attn_output.weight"],
+					"blk.0.attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
+					"blk.0.ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
+					"blk.0.ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
+					"blk.0.ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
+					"blk.0.ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
+					"patch_embd.weight":        tensors["v.patch_embd.weight"],
+					"position_embd.gate":       tensors["v.position_embd.gate"],
+					"position_embd.weight":     tensors["v.position_embd.weight"],
+				},
+			},
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			got := Tensors{items: tt.items}.GroupLayers()
+			if diff := cmp.Diff(got, tt.want); diff != "" {
+				t.Errorf("unexpected layers (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
diff --git a/fs/ggml/type.go b/fs/ggml/type.go
index a24d8b34e..7265afbcd 100644
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -32,9 +32,10 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
+	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ4_XS
 	fileTypeIQ2_M
+	fileTypeIQ4_XS
 	fileTypeIQ1_M
 
 	fileTypeBF16
@@ -93,12 +94,14 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
+	case "IQ3_M":
+		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
-	case "IQ4_XS":
-		return fileTypeIQ4_XS, nil
 	case "IQ2_M":
 		return fileTypeIQ2_M, nil
+	case "IQ4_XS":
+		return fileTypeIQ4_XS, nil
 	case "IQ1_M":
 		return fileTypeIQ1_M, nil
 	case "BF16":
@@ -160,6 +163,8 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
+	case fileTypeIQ3_M:
+		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:
diff --git a/llm/memory.go b/llm/memory.go
index 82bb31dde..1da4d2c08 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -116,7 +116,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
-	layers := f.Tensors().Layers()
+	layers := f.Tensors().GroupLayers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
 		layerSize = blk0.Size()
@@ -410,7 +410,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 		return 0, 0
 	}
 
-	for _, layer := range ggml.Tensors().Layers() {
+	for _, layer := range ggml.Tensors().GroupLayers() {
 		weights += layer.Size()
 	}
 
@@ -431,7 +431,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 	headCount := kv("attention.head_count")
 
 	numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
-	if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
+	if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
 		numPatches++
 	}
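Note for reviewers: the grouping rule that TestTensorLayers pins down is easy to miss from the rename alone. Below is a minimal standalone sketch of the behavior, inferred from the test's expectations; it is illustrative only (the real GroupLayers maps each suffix to its *Tensor, not to strings): a "blk" prefix is joined with its block number to form the layer key, while every other name, including "v.*" and "mm.*", groups under its first dot-separated component.

package main

import (
	"fmt"
	"strings"
)

// groupLayers sketches the grouping rule exercised by TestTensorLayers.
func groupLayers(names []string) map[string][]string {
	layers := make(map[string][]string)
	for _, name := range names {
		parts := strings.Split(name, ".")
		key, rest := parts[0], parts[1:]
		if key == "blk" && len(parts) > 2 {
			// join "blk" with the block number, e.g. "blk.0"
			key = parts[0] + "." + parts[1]
			rest = parts[2:]
		}
		layers[key] = append(layers[key], strings.Join(rest, "."))
	}
	return layers
}

func main() {
	out := groupLayers([]string{
		"token_embd.weight",     // -> layers["token_embd"] = ["weight"]
		"blk.0.attn_q.weight",   // -> layers["blk.0"] = ["attn_q.weight"]
		"v.blk.0.attn_q.weight", // -> layers["v"] = ["blk.0.attn_q.weight"]
		"mm.0.weight",           // -> layers["mm"] = ["0.weight"]
	})
	fmt.Println(out)
}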
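Note on the fs/ggml/type.go hunks: fileType reads as an iota-style enum, so the numeric value of each constant is determined by its position in the const block, and those values appear to track the file-type IDs stored in GGUF headers (llama.cpp's LLAMA_FTYPE_MOSTLY_* values). That is why the patch inserts fileTypeIQ3_M in sequence and swaps fileTypeIQ2_M/fileTypeIQ4_XS instead of appending new names. A toy illustration of the hazard; the +26 offset and the comments are assumptions for this sketch, not code from the repo:

package main

import "fmt"

type fileType uint32

// In an iota block, inserting or reordering a name shifts every value
// after it, so declaration order is part of the on-disk contract.
const (
	fileTypeIQ3_S  fileType = iota + 26 // assumed to match ID 26
	fileTypeIQ3_M                       // 27
	fileTypeIQ2_S                       // 28
	fileTypeIQ2_M                       // 29
	fileTypeIQ4_XS                      // 30
	fileTypeIQ1_M                       // 31
	fileTypeBF16                        // 32
)

func main() {
	fmt.Println(fileTypeIQ3_M, fileTypeIQ4_XS) // prints: 27 30
}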