diff --git a/model/process_text_benchmark_test.go b/model/process_text_benchmark_test.go
new file mode 100644
index 000000000..aa7185094
--- /dev/null
+++ b/model/process_text_benchmark_test.go
@@ -0,0 +1,160 @@
+package model
+
+import (
+ "testing"
+)
+
+// BenchmarkVocabulary is a reusable test vocabulary shared by the benchmarks in this file.
+var BenchmarkVocabulary = &Vocabulary{
+ Values: []string{ // 26 entries; Types below has 26 entries as well
+ "Hello",
+ "World",
+ "!",
+ "How",
+ "are",
+ "you",
+ "t",
+ "o",
+ "d",
+ "a",
+ "y",
+ "to",
+ "tod",
+ "toda",
+ "today",
+ " ", // position 15
+ "", // position 16 (BOS below is 16). NOTE(review): empty string; a special-token literal may have been lost, confirm
+ "", // position 17 (EOS below is 17). NOTE(review): empty string, same concern as position 16
+ "", // position 18. NOTE(review): empty string, same concern as position 16
+ "'s",
+ "'t",
+ "'re",
+ "'ve",
+ "'m",
+ "'ll",
+ "'d",
+ },
+ Types: []uint32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1}, // 3 for special tokens; the 3s occupy positions 15-18. NOTE(review): position 15 in Values is " ", confirm it is meant to be special
+ Merges: []string{
+ "to",
+ "tod",
+ "toda",
+ "today",
+ },
+ BOS: 16, // NOTE(review): presumably a position in Values (Values[16] is "" here); confirm
+ EOS: 17, // NOTE(review): presumably a position in Values (Values[17] is "" here); confirm
+}
+
+// BenchmarkBytePairEncoding measures BytePairEncoding.Encode and Decode over a
+// fixed set of inputs. Each input yields an Encode_<name> and a Decode_<name>
+// sub-benchmark; Decode runs on the tokens Encode produced for that input.
+//
+// Throughput is reported in bytes of text per operation: the input length for
+// Encode and the decoded length for Decode. Both are set once, outside the
+// timed loop.
+func BenchmarkBytePairEncoding(b *testing.B) {
+	bpe := BytePairEncoding{
+		Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+		Vocabulary:   BenchmarkVocabulary,
+	}
+
+	benchmarks := []struct {
+		name  string
+		input string
+	}{
+		{name: "simple_hello_world", input: "Hello World!"},
+		// NOTE(review): this input is identical to simple_hello_world and contains
+		// no special-token text; it looks like markup was lost. Confirm the
+		// intended input.
+		{name: "with_special_tokens", input: "Hello World!"},
+		{name: "with_merges", input: "today is today and today"},
+		{name: "with_contractions", input: "I'm don't won't can't they're we've you'll he'd"},
+		// NOTE(review): "special ones like and and" also reads as if token text
+		// was dropped from the sentence. Confirm.
+		{name: "long_text", input: "Hello World! How are you today? I'm doing great! This is a longer text to test the performance of the encoding and decoding process with multiple sentences and various tokens including special ones like and and contractions like don't and won't."},
+	}
+
+	for _, bm := range benchmarks {
+		// Benchmark Encoding
+		b.Run("Encode_"+bm.name, func(b *testing.B) {
+			b.ReportAllocs()
+			b.SetBytes(int64(len(bm.input)))
+			for i := 0; i < b.N; i++ {
+				if _, err := bpe.Encode(bm.input); err != nil {
+					b.Fatal(err)
+				}
+			}
+		})
+
+		// Encode and decode once up front: the tokens feed the Decode benchmark
+		// and the decoded length is its per-operation byte count.
+		tokens, err := bpe.Encode(bm.input)
+		if err != nil {
+			b.Fatal(err)
+		}
+		decoded, err := bpe.Decode(tokens)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		// Benchmark Decoding
+		b.Run("Decode_"+bm.name, func(b *testing.B) {
+			b.ReportAllocs()
+			b.SetBytes(int64(len(decoded)))
+			for i := 0; i < b.N; i++ {
+				if _, err := bpe.Decode(tokens); err != nil {
+					b.Fatal(err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkBytePairEncodingSplit measures the split method alone, running one
+// Split_<name> sub-benchmark per input; only Pretokenizer is set, no Vocabulary.
+func BenchmarkBytePairEncodingSplit(b *testing.B) {
+	bpe := BytePairEncoding{Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`}
+
+	benchmarks := []struct {
+		name  string
+		input string
+	}{
+		{
+			name:  "simple_text",
+			input: "Hello World!",
+		},
+		{
+			name:  "with_contractions",
+			input: "I'm don't won't",
+		},
+		{
+			name:  "with_numbers",
+			input: "In 2024 there are 365 days",
+		},
+		{
+			name:  "with_special_chars",
+			input: "Hello!! ...world",
+		},
+		{
+			name:  "with_spaces",
+			input: "Hello World", // NOTE(review): only a single space here; confirm a longer run was not collapsed
+		},
+		{
+			name:  "with_newlines",
+			input: "Hello\nWorld\nHow\nAre\nYou",
+		},
+	}
+
+	for _, bm := range benchmarks {
+		b.Run("Split_"+bm.name, func(b *testing.B) {
+			b.ReportAllocs()
+			// Bytes per op is the input text length, set once before the timed loop.
+			b.SetBytes(int64(len(bm.input)))
+			for i := 0; i < b.N; i++ {
+				if _, err := bpe.split(bm.input); err != nil {
+					b.Fatal(err)
+				}
+			}
+		})
+	}
+}