refactor process text tests

2025-01-29 15:29:30 -08:00 · 2025-01-29 15:29:30 -08:00 · 109ad1da0f
commit 109ad1da0f
parent 624bfb0b11
12 changed files with 472282 additions and 731 deletions
--- a/model/llama/model.go
+++ b/model/llama/model.go
@ -29,16 +29,16 @@ type Model struct {
 func New(c ml.Config) (model.Model, error) {
 	return &Model{
-		BytePairEncoding: model.BytePairEncoding{
+		BytePairEncoding: model.NewBytePairEncoding(
-			Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			Vocabulary: &model.Vocabulary{
+			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Uints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
 				BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
 				EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
 			},
-		},
+		),
 		Layers: make([]Layer, c.Uint("block_count")),
 		Options: &Options{
 			hiddenSize: int64(c.Uint("embedding_length")),
--- a/model/mllama/model.go
+++ b/model/mllama/model.go
@ -8,6 +8,7 @@ import (
 type Model struct {
 	model.Base
 	model.BytePairEncoding
 	*VisionModel `gguf:"v,vision"`
 	*TextModel
@ -15,14 +16,22 @@ type Model struct {
 	Projector *nn.Linear `gguf:"mm.0"`
 	ImageProcessor
 	TextProcessor
 }
 // New assembles an mllama Model from the configuration c. The tokenizer is
 // built with model.NewBytePairEncoding from the tokenizer.ggml.* entries of c;
 // the image processor, vision model, text processor and text model are each
 // created by their own constructor from the same c. The returned error is
 // always nil in this implementation.
 func New(c ml.Config) (model.Model, error) {
 	return &Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			// NOTE(review): the pattern literal is presumably the fallback used when
 			// "tokenizer.ggml.pretokenizer" is absent from c; confirm against ml.Config.
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Uints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
 				BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
 				EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
 		VisionModel:    newVisionModel(c),
 		TextProcessor:  newTextProcessor(c),
 		TextModel:      newTextModel(c),
 	}, nil
 }
--- a/model/mllama/process_text.go
+++ b/model/mllama/process_text.go
@ -1,25 +0,0 @@
 package mllama
 import (
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
 )
 // TextProcessor is the mllama text tokenizer. It embeds
 // model.BytePairEncoding and adds no fields of its own.
 type TextProcessor struct {
 	model.BytePairEncoding
 }
 // newTextProcessor returns a TextProcessor whose byte-pair encoder is populated
 // from the tokenizer.ggml.* entries of c: the pretokenizer pattern, the token
 // values, types and merges, and the BOS/EOS token ids.
 func newTextProcessor(c ml.Config) TextProcessor {
 	// NOTE(review): the literal is presumably c.String's fallback value; confirm.
 	pretokenizer := c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`)
 	vocabulary := &model.Vocabulary{
 		Values: c.Strings("tokenizer.ggml.tokens"),
 		Types:  c.Uints("tokenizer.ggml.token_type"),
 		Merges: c.Strings("tokenizer.ggml.merges"),
 		BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
 		EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
 	}
 	encoder := model.BytePairEncoding{
 		Pretokenizer: pretokenizer,
 		Vocabulary:   vocabulary,
 	}
 	return TextProcessor{BytePairEncoding: encoder}
 }
--- a/model/mllama/process_text_test.go
+++ b/model/mllama/process_text_test.go
@ -1,87 +0,0 @@
 package mllama
 import (
 	"encoding/json"
 	"errors"
 	"os"
 	"path/filepath"
 	"strconv"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/ollama/ollama/model"
 )
 // TestProcessText compares the tokenizer of the model loaded from
 // testdata/model.bin with reference data.
 //
 // The whole test is skipped when model.bin does not exist. The "decode"
 // subtest decodes every single-token id and compares it with the entry at the
 // same index in testdata/theirs.json; the "encode" subtest encodes each input
 // from ../testdata/inputs.json and compares the ids with the recorded ones.
 // Each subtest is skipped when its reference file is missing.
 func TestProcessText(t *testing.T) {
 	ours, err := model.New(filepath.Join("testdata", "model.bin"))
 	if errors.Is(err, os.ErrNotExist) {
 		t.Skip("no model.bin")
 	} else if err != nil {
 		t.Fatal(err)
 	}
 	// Check the assertion once so a model without a tokenizer fails the test
 	// with a message instead of panicking inside a subtest.
 	processor, ok := ours.(model.TextProcessor)
 	if !ok {
 		t.Fatalf("%T does not implement model.TextProcessor", ours)
 	}
 	t.Run("decode", func(t *testing.T) {
 		f, err := os.Open(filepath.Join("testdata", "theirs.json"))
 		if errors.Is(err, os.ErrNotExist) {
 			t.Skip("no theirs.json")
 		} else if err != nil {
 			t.Fatal(err)
 		}
 		defer f.Close()
 		var theirs [][]byte
 		if err := json.NewDecoder(f).Decode(&theirs); err != nil {
 			t.Fatal(err)
 		}
 		// The index into theirs is the token id being decoded.
 		for id := range theirs {
 			ids := []int32{int32(id)}
 			s, err := processor.Decode(ids)
 			if err != nil {
 				t.Fatal(err)
 			}
 			if diff := cmp.Diff(string(theirs[id]), s); diff != "" {
 				t.Errorf("%d no match (-theirs +ours):\n%s", id, diff)
 			}
 		}
 	})
 	t.Run("encode", func(t *testing.T) {
 		f, err := os.Open(filepath.Join("..", "testdata", "inputs.json"))
 		if errors.Is(err, os.ErrNotExist) {
 			t.Skip("no inputs.json")
 		} else if err != nil {
 			t.Fatal(err)
 		}
 		defer f.Close()
 		var inputs []struct {
 			Values []byte  `json:"base64"`
 			IDs    []int32 `json:"ids"`
 		}
 		if err := json.NewDecoder(f).Decode(&inputs); err != nil {
 			t.Fatal(err)
 		}
 		for i, input := range inputs {
 			t.Run(strconv.Itoa(i), func(t *testing.T) {
 				// Skip inside the per-input subtest: t.Skip on the parent would
 				// stop the loop and silently drop every input after this one.
 				if i == 45 {
 					t.Skip("skip 45")
 				}
 				ids, err := processor.Encode(string(input.Values))
 				if err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(input.IDs, ids, cmpopts.EquateEmpty()); diff != "" {
 					t.Errorf("%s: no match (-theirs +ours):\n%s", input.Values, diff)
 				}
 			})
 		}
 	})
 }
--- a/model/mllama/testdata/model.bin
+++ b/model/mllama/testdata/model.bin
@ -1 +0,0 @@
 /Users/michaelyang/git/ollama/library/nltpt/Llama-3.2-11B-Vision-Instruct/merged.gguf
--- a/model/mllama/testdata/theirs.json
+++ b/model/mllama/testdata/theirs.json
--- a/model/process_text.go
+++ b/model/process_text.go
@ -2,6 +2,7 @@ package model
 import (
 	"cmp"
 	"iter"
 	"log/slog"
 	"strings"
 	"sync"
@ -99,23 +100,29 @@ func (v *Vocabulary) Merge(left, right string) int {
 }
 type BytePairEncoding struct {
-	Pretokenizer string
+	pre   *regexp2.Regexp
-
+	vocab *Vocabulary
 	*Vocabulary
 }
-func (bpe BytePairEncoding) split(s string) ([]string, error) {
+func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
-	re, err := regexp2.Compile(bpe.Pretokenizer, regexp2.Unicode|regexp2.RE2)
+	return BytePairEncoding{
-	if err != nil {
+		pre:   regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
-		return nil, err
+		vocab: vocab,
 	}
 }
-	var matches []string
+func (bpe BytePairEncoding) Is(id uint32, special Special) bool {
-	for m, _ := re.FindStringMatch(s); m != nil; m, _ = re.FindNextMatch(m) {
+	return bpe.vocab.Is(id, special)
 		matches = append(matches, m.String())
 }
-	return matches, nil
+func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
 	return func(yield func(string) bool) {
 		for m, _ := bpe.pre.FindStringMatch(s); m != nil; m, _ = bpe.pre.FindNextMatch(m) {
 			if !yield(m.String()) {
 				break
 			}
 		}
 	}
 }
 // fragment is a string fragment and their corresponding token IDs
@ -138,9 +145,9 @@ type merge struct {
 func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
 	fragments := []fragment{{value: s}}
-	for _, special := range bpe.Vocabulary.SpecialVocabulary() {
+	for _, special := range bpe.vocab.SpecialVocabulary() {
 		// TODO: process special tokens concurrently
-		id := bpe.Vocabulary.Encode(special)
+		id := bpe.vocab.Encode(special)
 		for i := 0; i < len(fragments); i++ {
 			frag := fragments[i]
 			if len(frag.ids) > 0 {
@ -173,13 +180,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
 			continue
 		}
-		// split fragment using pretokenizer
+		for split := range bpe.split(frag.value) {
 		splits, err := bpe.split(frag.value)
 		if err != nil {
 			return nil, err
 		}
 		for _, split := range splits {
 			// TODO: process splits concurrently
 			var sb strings.Builder
 			for _, b := range []byte(split) {
@ -197,7 +198,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
 			}
 			// short circuit if the fragment is in the vocabulary
-			if id := bpe.Vocabulary.Encode(sb.String()); id >= 0 {
+			if id := bpe.vocab.Encode(sb.String()); id >= 0 {
 				ids = append(ids, id)
 				slog.Debug("encoded", "text", sb.String(), "ids", []int32{id})
 				continue
@ -219,7 +220,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
 				}
 				left, right := string(merges[a].runes), string(merges[b].runes)
-				rank := bpe.Vocabulary.Merge(left, right)
+				rank := bpe.vocab.Merge(left, right)
 				if rank < 0 {
 					return nil
 				}
@ -271,7 +272,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
 			for _, merge := range merges {
 				if len(merge.runes) > 0 {
 					// TODO: handle the edge case where the rune isn't in the vocabulary
-					if id := bpe.Vocabulary.Encode(string(merge.runes)); id >= 0 {
+					if id := bpe.vocab.Encode(string(merge.runes)); id >= 0 {
 						ids = append(ids, id)
 						slog.Debug("encoded", "text", string(merge.runes), "ids", []int32{id})
 					}
@ -286,7 +287,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
 func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
 	var sb strings.Builder
 	for _, id := range ids {
-		for _, r := range bpe.Vocabulary.Decode(id) {
+		for _, r := range bpe.vocab.Decode(id) {
 			switch {
 			case r == 0x0100:
 				// this produces 0x00 aka NULL
--- a/model/process_text_test.go
+++ b/model/process_text_test.go
@ -0,0 +1,247 @@
 package model
 import (
 	"bufio"
 	"encoding/json"
 	"math"
 	"os"
 	"path/filepath"
 	"slices"
 	"strconv"
 	"strings"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 )
 // llama builds the BytePairEncoding used by the tests and benchmarks in this
 // file from the fixtures under testdata/llama3.2.
 //
 // encoder.json is decoded as a token -> id map; every token from it is given
 // type 1. "<|begin_of_text|>" and "<|end_of_text|>" are appended with type 3
 // when the map does not already contain them. vocab.bpe supplies the merges,
 // one per line, ignoring lines that start with "#". Any failure to open, decode
 // or read a fixture fails t immediately.
 func llama(t testing.TB) BytePairEncoding {
 	t.Helper()
 	f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json"))
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer f.Close()
 	vocab := make(map[string]int32)
 	if err := json.NewDecoder(f).Decode(&vocab); err != nil {
 		t.Fatal(err)
 	}
 	// Invert the map into id-indexed slices.
 	types := make([]uint32, len(vocab))
 	tokens := make([]string, len(vocab))
 	for token, id := range vocab {
 		tokens[id] = token
 		types[id] = 1
 	}
 	for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
 		if _, ok := vocab[token]; !ok {
 			tokens = append(tokens, token) //nolint:makezero
 			types = append(types, 3)       //nolint:makezero
 			vocab[token] = int32(len(vocab))
 		}
 	}
 	// The earlier deferred Close already captured the first file, so reusing f
 	// here does not leak it.
 	f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe"))
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer f.Close()
 	merges := make([]string, 0, 50000)
 	scanner := bufio.NewScanner(f)
 	for scanner.Scan() {
 		if !strings.HasPrefix(scanner.Text(), "#") {
 			merges = append(merges, scanner.Text())
 		}
 	}
 	// Scan returns false on read errors as well as at EOF; without this check
 	// a failed read would yield a silently truncated merge list.
 	if err := scanner.Err(); err != nil {
 		t.Fatal(err)
 	}
 	return NewBytePairEncoding(
 		`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
 		&Vocabulary{
 			Values: tokens,
 			Types:  types,
 			Merges: merges,
 		},
 	)
 }
 // TestLlama exercises the tokenizer returned by llama with fixed expectations:
 // encoding and decoding of short strings, runs of repeated digits, encode/decode
 // round trips, strings containing special tokens, and the pretokenizer split.
 // All subtests share one tokenizer and run in parallel.
 func TestLlama(t *testing.T) {
 	tokenizer := llama(t)
 	t.Run("simple", func(t *testing.T) {
 		t.Parallel()
 		ids, err := tokenizer.Encode("hello world")
 		if err != nil {
 			t.Error(err)
 		}
 		if diff := cmp.Diff([]int32{15339, 1917}, ids); diff != "" {
 			t.Errorf("no match (-theirs +ours):\n%s", diff)
 		}
 		s, err := tokenizer.Decode([]int32{15339, 1917})
 		if err != nil {
 			t.Fatal(err)
 		}
 		if s != "hello world" {
 			t.Errorf("got %q, want hello world", s)
 		}
 		ids, err = tokenizer.Encode("hello <|end_of_text|>")
 		if err != nil {
 			t.Error(err)
 		}
 		if diff := cmp.Diff([]int32{15339, 220, 128001}, ids); diff != "" {
 			t.Errorf("no match (-theirs +ours):\n%s", diff)
 		}
 	})
 	t.Run("simple repeated", func(t *testing.T) {
 		t.Parallel()
 		cases := map[string][]int32{
 			strings.Repeat("0", 1):  {15},
 			strings.Repeat("0", 2):  {410},
 			strings.Repeat("0", 3):  {931},
 			strings.Repeat("0", 4):  {931, 15},
 			strings.Repeat("0", 5):  {931, 410},
 			strings.Repeat("0", 6):  {931, 931},
 			strings.Repeat("0", 7):  {931, 931, 15},
 			strings.Repeat("0", 8):  {931, 931, 410},
 			strings.Repeat("0", 9):  {931, 931, 931},
 			strings.Repeat("0", 10): {931, 931, 931, 15},
 			strings.Repeat("0", 11): {931, 931, 931, 410},
 			strings.Repeat("0", 12): {931, 931, 931, 931},
 			strings.Repeat("0", 13): {931, 931, 931, 931, 15},
 			strings.Repeat("0", 14): {931, 931, 931, 931, 410},
 			strings.Repeat("0", 15): {931, 931, 931, 931, 931},
 			strings.Repeat("0", 16): {931, 931, 931, 931, 931, 15},
 			strings.Repeat("0", 17): {931, 931, 931, 931, 931, 410},
 		}
 		for s, want := range cases {
 			ids, err := tokenizer.Encode(s)
 			if err != nil {
 				t.Error(err)
 			}
 			if diff := cmp.Diff(want, ids); diff != "" {
 				t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
 			}
 		}
 	})
 	t.Run("basic roundtrip", func(t *testing.T) {
 		t.Parallel()
 		cases := []string{
 			"hello",
 			"hello ",
 			"hello  ",
 			" hello",
 			" hello ",
 			" hello  ",
 			"hello world",
 			"请考试我的软件！12345",
 		}
 		for _, want := range cases {
 			ids, err := tokenizer.Encode(want)
 			if err != nil {
 				t.Error(err)
 			}
 			if got, err := tokenizer.Decode(ids); err != nil {
 				t.Fatal(err)
 			} else if got != want {
 				t.Errorf("got %q, want %q", got, want)
 			}
 		}
 	})
 	t.Run("special", func(t *testing.T) {
 		t.Parallel()
 		cases := map[string][]int32{
 			"<|begin_of_text|>A B!":                                               {128000, 32, 426, 0},
 			"<|begin_of_text|>A<|end_of_text|>B!":                                 {128000, 32, 128001, 33, 0},
 			"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!":                {128000, 32, 128001, 33, 128000, 0},
 			"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
 		}
 		for s, want := range cases {
 			ids, err := tokenizer.Encode(s)
 			if err != nil {
 				t.Fatal(err)
 			}
 			// Map iteration order is random, so name the failing input.
 			if diff := cmp.Diff(want, ids); diff != "" {
 				t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
 			}
 		}
 	})
 	t.Run("split", func(t *testing.T) {
 		t.Parallel()
 		cases := map[string][]string{
 			"Hello World!":                   {"Hello", " World", "!"},
 			"I'm don't won't":                {"I", "'m", " don", "'t", " won", "'t"},
 			"In 2024 there are 366 days":     {"In", " ", "202", "4", " there", " are", " ", "366", " days"},
 			"Hello!! ...world":               {"Hello", "!!", " ...", "world"},
 			"Hello    World":                 {"Hello", "   ", " World"},
 			"Hello\nWorld":                   {"Hello", "\n", "World"},
 			"Hello, WORLD!! How's it going?": {"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
 		}
 		for s, want := range cases {
 			got := slices.Collect(tokenizer.split(s))
 			// Map iteration order is random, so name the failing input.
 			if diff := cmp.Diff(want, got); diff != "" {
 				t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
 			}
 		}
 	})
 }
 // Benchmark times Encode and Decode on prefixes of testdata/war-and-peace.txt.
 // Prefix lengths are the powers of ten from 10^0 through 10^7 bytes, each capped
 // at the file size. Sub-benchmarks are named "encode<n>" and "decode<n>", where
 // n is the prefix length.
 func Benchmark(b *testing.B) {
 	tokenizer := llama(b)
 	corpus, err := os.ReadFile(filepath.Join("testdata", "war-and-peace.txt"))
 	if err != nil {
 		b.Fatal(err)
 	}
 	for exp := 0; exp < 8; exp++ {
 		size := min(int(math.Pow10(exp)), len(corpus))
 		prefix := corpus[:size]
 		label := strconv.Itoa(size)
 		b.Run("encode"+label, func(b *testing.B) {
 			b.ResetTimer()
 			for range b.N {
 				// The []byte -> string conversion stays inside the timed loop.
 				if _, err := tokenizer.Encode(string(prefix)); err != nil {
 					b.Fatal(err)
 				}
 			}
 		})
 		b.Run("decode"+label, func(b *testing.B) {
 			// Encoding is setup for this benchmark, so the timer is reset after it.
 			ids, err := tokenizer.Encode(string(prefix))
 			if err != nil {
 				b.Fatal(err)
 			}
 			b.ResetTimer()
 			for range b.N {
 				if _, err := tokenizer.Decode(ids); err != nil {
 					b.Fatal(err)
 				}
 			}
 		})
 	}
 }
--- a/model/testdata/inputs.json
+++ b/model/testdata/inputs.json
@ -1,586 +0,0 @@
 [
    {
        "base64": "aWVkIDQgwr0gbW9udGhz",
        "ids": [
            1142,
            220,
            19,
            220,
            27154,
            4038
        ]
    },
    {
        "base64": "RsO8aHJlcg==",
        "ids": [
            37,
            51853,
            261
        ]
    },
    {
        "base64": "",
        "ids": []
    },
    {
        "base64": "IA==",
        "ids": [
            220
        ]
    },
    {
        "base64": "ICA=",
        "ids": [
            256
        ]
    },
    {
        "base64": "ICAg",
        "ids": [
            262
        ]
    },
    {
        "base64": "CQ==",
        "ids": [
            197
        ]
    },
    {
        "base64": "Cg==",
        "ids": [
            198
        ]
    },
    {
        "base64": "Cgo=",
        "ids": [
            271
        ]
    },
    {
        "base64": "CgoK",
        "ids": [
            1432
        ]
    },
    {
        "base64": "CQo=",
        "ids": [
            1602
        ]
    },
    {
        "base64": "SGVsbG8gd29ybGQ=",
        "ids": [
            9906,
            1917
        ]
    },
    {
        "base64": "IEhlbGxvIHdvcmxk",
        "ids": [
            22691,
            1917
        ]
    },
    {
        "base64": "SGVsbG8gV29ybGQ=",
        "ids": [
            9906,
            4435
        ]
    },
    {
        "base64": "IEhlbGxvIFdvcmxk",
        "ids": [
            22691,
            4435
        ]
    },
    {
        "base64": "IEhlbGxvIFdvcmxkIQ==",
        "ids": [
            22691,
            4435,
            0
        ]
    },
    {
        "base64": "SGVsbG8sIHdvcmxkIQ==",
        "ids": [
            9906,
            11,
            1917,
            0
        ]
    },
    {
        "base64": "IEhlbGxvLCB3b3JsZCE=",
        "ids": [
            22691,
            11,
            1917,
            0
        ]
    },
    {
        "base64": "IHRoaXMgaXMg8J+mmS5jcHA=",
        "ids": [
            420,
            374,
            11410,
            99,
            247,
            13,
            11055
        ]
    },
    {
        "base64": "dzA0OCA3dHVpamsgZHNkZmh1",
        "ids": [
            86,
            23904,
            220,
            22,
            83,
            2005,
            42908,
            11729,
            3013,
            17156
        ]
    },
    {
        "base64": "0L3QtdGJ0L4g0L3QsCDQkdGK0LvQs9Cw0YDRgdC60Lg=",
        "ids": [
            79862,
            102118,
            13373,
            64571,
            34694,
            3114,
            112203,
            80112
        ]
    },
    {
        "base64": "4Z6A4Z624Z6T4Z+L4Z6P4Z+C4Z6W4Z634Z6f4Z+B4Z6f4Z6i4Z624Z6F4Z6B4Z6b4Z6F4Z+B4Z6J",
        "ids": [
            21549,
            222,
            98629,
            241,
            45358,
            233,
            21549,
            237,
            45358,
            224,
            21549,
            244,
            21549,
            115,
            21549,
            253,
            45358,
            223,
            21549,
            253,
            21549,
            95,
            98629,
            227,
            21549,
            223,
            21549,
            249,
            21549,
            227,
            45358,
            223,
            21549,
            231
        ]
    },
    {
        "base64": "8J+agCAobm9ybWFsKSDwn5i24oCN8J+Mq++4jyAobXVsdGlwbGUgZW1vamlzIGNvbmNhdGVuYXRlZCkg4pyFIChvbmx5IGVtb2ppIHRoYXQgaGFzIGl0cyBvd24gdG9rZW4p",
        "ids": [
            9468,
            248,
            222,
            320,
            8416,
            8,
            27623,
            114,
            102470,
            9468,
            234,
            104,
            31643,
            320,
            36773,
            100166,
            98634,
            8,
            26602,
            227,
            320,
            3323,
            43465,
            430,
            706,
            1202,
            1866,
            4037,
            8
        ]
    },
    {
        "base64": "SGVsbG8=",
        "ids": [
            9906
        ]
    },
    {
        "base64": "IEhlbGxv",
        "ids": [
            22691
        ]
    },
    {
        "base64": "ICBIZWxsbw==",
        "ids": [
            220,
            22691
        ]
    },
    {
        "base64": "ICAgSGVsbG8=",
        "ids": [
            256,
            22691
        ]
    },
    {
        "base64": "ICAgIEhlbGxv",
        "ids": [
            262,
            22691
        ]
    },
    {
        "base64": "ICAgIEhlbGxvCiAgICBIZWxsbw==",
        "ids": [
            262,
            22691,
            198,
            262,
            22691
        ]
    },
    {
        "base64": "ICg=",
        "ids": [
            320
        ]
    },
    {
        "base64": "CiA9",
        "ids": [
            198,
            284
        ]
    },
    {
        "base64": "JyBlcmE=",
        "ids": [
            6,
            11639
        ]
    },
    {
        "base64": "SGVsbG8sIHknYWxsISBIb3cgYXJlIHlvdSDwn5iBID/miJHmg7PlnKhhcHBsZeW3peS9nDEzMTQxNTHlpKnvvZ4=",
        "ids": [
            9906,
            11,
            379,
            65948,
            0,
            2650,
            527,
            499,
            27623,
            223,
            949,
            37046,
            101067,
            19000,
            23182,
            102301,
            9263,
            18136,
            16,
            36827,
            21909
        ]
    },
    {
        "base64": "ISEhISEh",
        "ids": [
            17523,
            3001
        ]
    },
    {
        "base64": "Mw==",
        "ids": [
            18
        ]
    },
    {
        "base64": "MzM=",
        "ids": [
            1644
        ]
    },
    {
        "base64": "MzMz",
        "ids": [
            8765
        ]
    },
    {
        "base64": "MzMzMw==",
        "ids": [
            8765,
            18
        ]
    },
    {
        "base64": "MzMzMzM=",
        "ids": [
            8765,
            1644
        ]
    },
    {
        "base64": "MzMzMzMz",
        "ids": [
            8765,
            8765
        ]
    },
    {
        "base64": "MzMzMzMzMw==",
        "ids": [
            8765,
            8765,
            18
        ]
    },
    {
        "base64": "MzMzMzMzMzM=",
        "ids": [
            8765,
            8765,
            1644
        ]
    },
    {
        "base64": "MzMzMzMzMzMz",
        "ids": [
            8765,
            8765,
            8765
        ]
    },
    {
        "base64": "Q+G7rWEgVmnhu4d0",
        "ids": [
            34,
            91163,
            101798
        ]
    },
    {
        "base64": "IGRpc2NhcmRz",
        "ids": [
            2624,
            2402
        ]
    },
    {
        "base64": "CiAKCiAKCgogCSAJCSAJCiAgCiAgIAogICAgCiAgICAgCvCfmoAgKG5vcm1hbCkg8J+YtuKAjfCfjKvvuI8gKG11bHRpcGxlIGVtb2ppcyBjb25jYXRlbmF0ZWQpIOKchSDwn6aZ8J+mmSAzIDMzIDMzMyAzMzMzIDMzMzMzIDMzMzMzMyAzMzMzMzMzIDMzMzMzMzMzIDMuMyAzLi4zIDMuLi4zIOGegOGetuGek+Gfi+Gej+GfguGeluGet+Gen+GfgeGen+GeouGetuGehfCfmIEgP+aIkeaDs+WcqGFwcGxl5bel5L2cMTMxNDE1MeWkqe+9niAtLS0tLS09PT09PT09INC90LXRidC+INC90LAg0JHRitC70LPQsNGA0YHQutC4ICcnJycnJ2BgYGBgYGAiIiIiLi4uLi4uISEhISEhPz8/Pz8/IEkndmUgYmVlbiAndG9sZCBoZSdzIHRoZXJlLCAnUkUgeW91IHN1cmU/ICdNIG5vdCBzdXJlIEknbGwgbWFrZSBpdCwgJ0QgeW91IGxpa2Ugc29tZSB0ZWE/IFdlJ1ZlIGEnbEw=",
        "ids": [
            198,
            4815,
            15073,
            66597,
            8004,
            1602,
            2355,
            79772,
            11187,
            9468,
            248,
            222,
            320,
            8416,
            8,
            27623,
            114,
            102470,
            9468,
            234,
            104,
            31643,
            320,
            36773,
            100166,
            98634,
            8,
            26602,
            227,
            11410,
            99,
            247,
            9468,
            99,
            247,
            220,
            18,
            220,
            1644,
            220,
            8765,
            220,
            8765,
            18,
            220,
            8765,
            1644,
            220,
            8765,
            8765,
            220,
            8765,
            8765,
            18,
            220,
            8765,
            8765,
            1644,
            220,
            18,
            13,
            18,
            220,
            18,
            497,
            18,
            220,
            18,
            1131,
            18,
            220,
            21549,
            222,
            98629,
            241,
            45358,
            233,
            21549,
            237,
            45358,
            224,
            21549,
            244,
            21549,
            115,
            21549,
            253,
            45358,
            223,
            21549,
            253,
            21549,
            95,
            98629,
            227,
            76460,
            223,
            949,
            37046,
            101067,
            19000,
            23182,
            102301,
            9263,
            18136,
            16,
            36827,
            21909,
            56560,
            54337,
            19175,
            102118,
            13373,
            64571,
            34694,
            3114,
            112203,
            80112,
            3436,
            106451,
            14196,
            14196,
            74694,
            3089,
            3089,
            29249,
            17523,
            3001,
            27708,
            7801,
            358,
            3077,
            1027,
            364,
            83,
            820,
            568,
            596,
            1070,
            11,
            364,
            793,
            499,
            2771,
            30,
            364,
            44,
            539,
            2771,
            358,
            3358,
            1304,
            433,
            11,
            364,
            35,
            499,
            1093,
            1063,
            15600,
            30,
            1226,
            6,
            43712,
            264,
            64966,
            43
        ]
    }
 ]
--- a/model/testdata/llama3.2/encoder.json
+++ b/model/testdata/llama3.2/encoder.json
--- a/model/testdata/llama3.2/vocab.bpe
+++ b/model/testdata/llama3.2/vocab.bpe
--- a/model/testdata/war-and-peace.txt
+++ b/model/testdata/war-and-peace.txt
		`@ -1 +0,0 @@`
			`/Users/michaelyang/git/ollama/library/nltpt/Llama-3.2-11B-Vision-Instruct/merged.gguf`