From 526b2ed10296cc3d1ae89121eedcbbbe257741a3 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 12 May 2025 17:29:46 -0700
Subject: [PATCH] fix vocabulary (#10679)

---
 model/models/gemma2/model.go        |  2 ++
 model/models/gemma3/model_text.go   | 13 -------------
 model/models/llama/model.go         |  3 +++
 model/models/llama4/model.go        |  3 +++
 model/models/mistral3/model.go      | 17 +++++++++++++++++
 model/models/mistral3/model_text.go | 13 -------------
 model/models/mllama/model.go        |  3 +++
 7 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go
index d418f6827..3156b0068 100644
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -45,6 +45,8 @@ func New(c fs.Config) (model.Model, error) {
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go
index c1e843d8f..741818a29 100644
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -7,7 +7,6 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )
 
@@ -20,9 +19,6 @@ type TextConfig struct {
 }
 
 type TextModel struct {
-	model.Base
-	model.SentencePieceModel
-
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []TextLayer   `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
@@ -45,15 +41,6 @@ func newTextModel(c fs.Config) *TextModel {
 	numBlocks := int(c.Uint("block_count"))
 
 	m := TextModel{
-		SentencePieceModel: model.NewSentencePieceModel(
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Scores: c.Floats("tokenizer.ggml.scores"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-			},
-		),
 		Layers: make([]TextLayer, numBlocks),
 		TextConfig: &TextConfig{
 			hiddenSize:     int(c.Uint("embedding_length")),
diff --git a/model/models/llama/model.go b/model/models/llama/model.go
index 3e5a54278..c75d7eb2f 100644
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -47,6 +47,9 @@ func New(c fs.Config) (model.Model, error) {
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go
index 632d313ec..798f0d162 100644
--- a/model/models/llama4/model.go
+++ b/model/models/llama4/model.go
@@ -45,6 +45,9 @@ func New(c fs.Config) (model.Model, error) {
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go
index f749fdcd2..c96852441 100644
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -16,6 +16,8 @@ import (
 
 type Model struct {
 	model.Base
+	model.BytePairEncoding
+
 	*TextModel
 	*VisionModel         `gguf:"v,vision"`
 	*MultiModalProjector `gguf:"mm"`
@@ -40,6 +42,21 @@ func New(c fs.Config) (model.Model, error) {
 		VisionModel:         newVisionModel(c),
 		ImageProcessor:      newImageProcessor(c),
 		MultiModalProjector: newMultiModalProjector(c),
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				// TODO: EOT mirrors EOS (same key, same default of 2); an unset EOT is 0 and token 0 would stop generation
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+			},
+		),
 	}
 
 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
diff --git a/model/models/mistral3/model_text.go b/model/models/mistral3/model_text.go
index 1bf72acd8..565b001a7 100644
--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@@ -21,7 +21,6 @@ type TextOptions struct {
 
 type TextModel struct {
 	model.Base
-	model.BytePairEncoding
 
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
@@ -148,18 +147,6 @@ func NewTextModel(c fs.Config) (*TextModel, error) {
 	}
 
 	textModel := &TextModel{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-			},
-		),
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
 			hiddenSize: int(c.Uint("embedding_length")),
diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go
index 149876c9c..3fa26dedb 100644
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -49,6 +49,9 @@ func New(c fs.Config) (model.Model, error) {
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),