add test
parent 3c653195f4
commit 3241b45790
@@ -153,7 +153,7 @@ func (s Tensors) Items(prefix ...string) []*Tensor {
 	return items
 }
 
-func (ts Tensors) Layers() map[string]Layer {
+func (ts Tensors) GroupLayers() map[string]Layer {
 	layers := make(map[string]Layer)
 	for _, t := range ts.items {
 		parts := strings.Split(t.Name, ".")
@@ -377,22 +377,22 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
-	embedding := llm.KV().EmbeddingLength()
-	heads := llm.KV().HeadCount()
-	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
+func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
+	embedding := f.KV().EmbeddingLength()
+	heads := f.KV().HeadCount()
+	headsKV := f.KV().HeadCountKV()
+	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
 
-	embeddingHeads := llm.KV().EmbeddingHeadCount()
-	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
-	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
+	embeddingHeads := f.KV().EmbeddingHeadCount()
+	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
 
-	layers := llm.Tensors().Layers()
+	layers := f.Tensors().GroupLayers()
 
 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
-	kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+	kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
 
-	switch llm.KV().Architecture() {
+	switch f.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
 			4*batch*(1+4*embedding+context*(1+heads)),
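As a quick sanity check on the kv formula above (this hunk only renames the receiver; the arithmetic is unchanged), here is a small self-contained example with purely illustrative numbers — a hypothetical 7B-style configuration, not values taken from this commit:

package main

import "fmt"

func main() {
	// Illustrative numbers only: 2048-token context, 32 blocks,
	// 128-dim K and V heads, 32 KV heads, f16 cache (2 bytes per element).
	var context, blocks, embeddingHeadsK, embeddingHeadsV, headsKV uint64 = 2048, 32, 128, 128, 32
	bytesPerElement := 2.0 // f16
	kv := uint64(float64(context*blocks*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
	fmt.Println(kv) // 1073741824 bytes, i.e. 1 GiB for the full-context KV cache
}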
@@ -407,7 +407,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 
 		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
 			// mixtral 8x22b
-			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
+			ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
 			partialOffload = max(
 				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
 				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
@@ -424,11 +424,11 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 	case "mllama":
 		var visionTokens, tiles uint64 = 1601, 4
 
-		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+		if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
 			kv = headsKV *
 				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
 				(2* // sizeof(float16)
-					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					(f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
 					context +
 					4* // sizeof(float32)
 						uint64(crossAttentionLayers.size)* // num cross attention layers
@@ -443,7 +443,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 			)
 
 		var ropeFreqsCount uint64
-		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
+		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
 			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
 				ropeFreqsCount = ropeFreqsWeights.parameters()
 			}
@@ -547,20 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 }
 
 // SupportsKVCacheType checks if the requested cache type is supported
-func (llm GGML) SupportsKVCacheType(cacheType string) bool {
+func (f GGML) SupportsKVCacheType(cacheType string) bool {
 	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
-func (llm GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
+func (f GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}
 
 	// Check head counts match and are non-zero
-	headCountK := llm.KV().EmbeddingHeadCountK()
-	headCountV := llm.KV().EmbeddingHeadCountV()
+	headCountK := f.KV().EmbeddingHeadCountK()
+	headCountV := f.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }
 
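The Layers → GroupLayers rename above does not change behaviour; the grouping it names (split each tensor name on "." and bucket it under a layer key such as "blk.0", "mm", or "v") is exactly what the new test below checks. For orientation, a minimal standalone sketch of that grouping, using a hypothetical groupByPrefix helper and plain string slices instead of the real Tensors and Layer types:

package main

import (
	"fmt"
	"strings"
)

// groupByPrefix is a hypothetical stand-in for Tensors.GroupLayers: it splits
// each tensor name on "." and groups the remainder under a layer key such as
// "blk.0", "mm", or "v", mirroring the expectations in TestTensorLayers below.
func groupByPrefix(names []string) map[string][]string {
	groups := make(map[string][]string)
	for _, name := range names {
		parts := strings.Split(name, ".")
		key, rest := parts[0], parts[1:]
		// "blk.N" names keep the block index in the group key.
		if parts[0] == "blk" && len(parts) > 2 {
			key, rest = parts[0]+"."+parts[1], parts[2:]
		}
		groups[key] = append(groups[key], strings.Join(rest, "."))
	}
	return groups
}

func main() {
	fmt.Println(groupByPrefix([]string{
		"token_embd.weight",
		"blk.0.attn_q.weight",
		"v.patch_embd.weight",
	}))
	// map[blk.0:[attn_q.weight] token_embd:[weight] v:[patch_embd.weight]]
}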
fs/ggml/ggml_test.go (new file, 155 lines)
@@ -0,0 +1,155 @@
+package ggml
+
+import (
+	"maps"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestTensorLayers(t *testing.T) {
+	tensors := make(map[string]*Tensor)
+	for _, name := range []string{
+		"token_embd.weight",
+		"blk.0.attn_k.weight",
+		"blk.0.attn_output.weight",
+		"blk.0.attn_q.weight",
+		"blk.0.attn_v.weight",
+		"blk.0.attn_norm.weight",
+		"blk.0.ffn_down.weight",
+		"blk.0.ffn_gate.weight",
+		"blk.0.ffn_up.weight",
+		"blk.0.ffn_norm.weight",
+		"output_norm.weight",
+		"mm.0.bias",
+		"mm.0.weight",
+		"v.blk.0.attn_k.weight",
+		"v.blk.0.attn_output.weight",
+		"v.blk.0.attn_q.weight",
+		"v.blk.0.attn_v.weight",
+		"v.blk.0.attn_norm.weight",
+		"v.blk.0.ffn_down.weight",
+		"v.blk.0.ffn_gate.weight",
+		"v.blk.0.ffn_up.weight",
+		"v.blk.0.ffn_norm.weight",
+		"v.patch_embd.weight",
+		"v.position_embd.gate",
+		"v.position_embd.weight",
+	} {
+		tensors[name] = &Tensor{Name: name}
+	}
+
+	cases := []struct {
+		name  string
+		items []*Tensor
+		want  map[string]Layer
+	}{
+		{
+			name: "text",
+			items: slices.Collect(func(yield func(*Tensor) bool) {
+				for k, v := range tensors {
+					if !strings.HasPrefix(k, "mm.") && !strings.HasPrefix(k, "v.") {
+						if !yield(v) {
+							return
+						}
+					}
+				}
+			}),
+			want: map[string]Layer{
+				"blk.0": {
+					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
+					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
+					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
+					"attn_output.weight": tensors["blk.0.attn_output.weight"],
+					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
+					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
+					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
+					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
+					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
+				},
+				"token_embd":  {"weight": tensors["token_embd.weight"]},
+				"output_norm": {"weight": tensors["output_norm.weight"]},
+			},
+		},
+		{
+			name: "vision",
+			items: slices.Collect(func(yield func(*Tensor) bool) {
+				for k, v := range tensors {
+					if strings.HasPrefix(k, "mm.") || strings.HasPrefix(k, "v.") {
+						if !yield(v) {
+							return
+						}
+					}
+				}
+			}),
+			want: map[string]Layer{
+				"mm": {
+					"0.bias":   tensors["mm.0.bias"],
+					"0.weight": tensors["mm.0.weight"],
+				},
+				"v": {
+					"blk.0.attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
+					"blk.0.attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
+					"blk.0.attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
+					"blk.0.attn_output.weight": tensors["v.blk.0.attn_output.weight"],
+					"blk.0.attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
+					"blk.0.ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
+					"blk.0.ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
+					"blk.0.ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
+					"blk.0.ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
+					"patch_embd.weight":        tensors["v.patch_embd.weight"],
+					"position_embd.gate":       tensors["v.position_embd.gate"],
+					"position_embd.weight":     tensors["v.position_embd.weight"],
+				},
+			},
+		},
+		{
+			name:  "vision and text",
+			items: slices.Collect(maps.Values(tensors)),
+			want: map[string]Layer{
+				"blk.0": {
+					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
+					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
+					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
+					"attn_output.weight": tensors["blk.0.attn_output.weight"],
+					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
+					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
+					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
+					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
+					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
+				},
+				"token_embd":  {"weight": tensors["token_embd.weight"]},
+				"output_norm": {"weight": tensors["output_norm.weight"]},
+				"mm": {
+					"0.bias":   tensors["mm.0.bias"],
+					"0.weight": tensors["mm.0.weight"],
+				},
+				"v": {
+					"blk.0.attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
+					"blk.0.attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
+					"blk.0.attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
+					"blk.0.attn_output.weight": tensors["v.blk.0.attn_output.weight"],
+					"blk.0.attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
+					"blk.0.ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
+					"blk.0.ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
+					"blk.0.ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
+					"blk.0.ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
+					"patch_embd.weight":        tensors["v.patch_embd.weight"],
+					"position_embd.gate":       tensors["v.position_embd.gate"],
+					"position_embd.weight":     tensors["v.position_embd.weight"],
+				},
+			},
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			got := Tensors{items: tt.items}.GroupLayers()
+			if diff := cmp.Diff(got, tt.want); diff != "" {
+				t.Errorf("unexpected layers (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
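The new test can be run in isolation with Go's test runner; assuming the repository root as the working directory, something like the following should work (the -run flag filters tests by name):

	go test ./fs/ggml -run TestTensorLayers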
@@ -32,9 +32,10 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
+	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ4_XS
 	fileTypeIQ2_M
+	fileTypeIQ4_XS
 	fileTypeIQ1_M
 	fileTypeBF16
 
@@ -93,12 +94,14 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
+	case "IQ3_M":
+		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
-	case "IQ4_XS":
-		return fileTypeIQ4_XS, nil
 	case "IQ2_M":
 		return fileTypeIQ2_M, nil
+	case "IQ4_XS":
+		return fileTypeIQ4_XS, nil
 	case "IQ1_M":
 		return fileTypeIQ1_M, nil
 	case "BF16":
@@ -160,6 +163,8 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
+	case fileTypeIQ3_M:
+		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:
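One thing to note about the const-block hunk above: assuming the fileType constants are declared with iota (the start of the block is not shown in this diff), position in the block determines each constant's numeric value, so inserting fileTypeIQ3_M and moving fileTypeIQ4_XS after fileTypeIQ2_M shifts the values of the identifiers that follow. A small sketch with hypothetical values only:

package main

import "fmt"

type fileType uint32

// Hypothetical excerpt mirroring the new ordering; the real block starts
// earlier in the file, so the absolute numbers here are illustrative.
const (
	fileTypeIQ3_S fileType = iota // 0 in this sketch
	fileTypeIQ3_M                 // 1 (newly inserted)
	fileTypeIQ2_S                 // 2
	fileTypeIQ2_M                 // 3
	fileTypeIQ4_XS                // 4 (moved after IQ2_M)
	fileTypeIQ1_M                 // 5
	fileTypeBF16                  // 6
)

func main() {
	fmt.Println(fileTypeIQ4_XS) // 4
}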
@@ -116,7 +116,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
-	layers := f.Tensors().Layers()
+	layers := f.Tensors().GroupLayers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
 		layerSize = blk0.Size()
@@ -410,7 +410,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 		return 0, 0
 	}
 
-	for _, layer := range ggml.Tensors().Layers() {
+	for _, layer := range ggml.Tensors().GroupLayers() {
 		weights += layer.Size()
 	}
 
@@ -431,7 +431,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 	headCount := kv("attention.head_count")
 
 	numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
-	if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
+	if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
 		numPatches++
 	}
 