From acbffa59e9f5ea20d0183c10c4896d297e47da10 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Sun, 23 Jun 2024 13:55:48 -0700 Subject: [PATCH] llm: suppress large allocations for GGUF arrays This introduces a little array type for holding GGUF arrays that prevents the array from growing too large. It preserves the total size of the array, but limits the number of elements that are actually allocated. GGUF arrays that are extremely large, such as tokens, etc, are generally uninteresting to users, and are not worth the memory overhead, and the time spent allocating and freeing them. They are necessary for inference, but not for inspection. The size of these arrays is, however, important in Ollama, so it is preserved in a separate field on array. --- llm/ggml.go | 2 +- llm/gguf.go | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/llm/ggml.go b/llm/ggml.go index f02f0ff60..121eb532c 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -321,7 +321,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui embedding := llm.KV().EmbeddingLength() heads := llm.KV().HeadCount() headsKV := llm.KV().HeadCountKV() - vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any))) + vocab := llm.KV()["tokenizer.ggml.tokens"].(*array).size embeddingHeads := llm.KV().EmbeddingHeadCount() embeddingHeadsK := llm.KV().EmbeddingHeadCountK() diff --git a/llm/gguf.go b/llm/gguf.go index 234efe574..b3c0e2eaa 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -316,7 +316,7 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error { return err } -func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { +func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) { t, err := readGGUF[uint32](llm, r) if err != nil { return nil, err @@ -327,6 +327,8 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } + a := &array{size: uint64(n)} + for i := 0; uint32(i) < n; i++ { 
var e any switch t { @@ -361,13 +363,27 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - a = append(a, e) + if len(a.values) < arrayMaxSize { + a.values = append(a.values, e) + } } - return + return a, nil } -func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { +const arrayMaxSize = 1000 + +type array struct { + size uint64 + + // values is the slice of values in the array. + // + // Its length may be less than size if the array is too big to reasonably + // fit in memory. The current limit is arrayMaxSize. + values []any +} + +func readGGUFArray(llm *gguf, r io.Reader) (*array, error) { if llm.Version == 1 { return readGGUFV1Array(llm, r) } @@ -382,6 +398,8 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } + a := &array{size: n} + for i := 0; uint64(i) < n; i++ { var e any switch t { @@ -416,10 +434,16 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) { return nil, err } - a = append(a, e) + // TODO(bmizerany): We may want to only enforce this limit + // on certain fields, however, as of now, I (bmizerany) do + // not know of any array fields that are needed by Ollama that + // exceed this limit. + if len(a.values) < arrayMaxSize { + a.values = append(a.values, e) + } } - return + return a, nil } func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {