From fb4664fcec8c205b48ea7edd0c6f760fc2c346b0 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 11 Mar 2025 18:54:00 +0100
Subject: [PATCH] model: add more spm tokenizer tests

---
 model/process_text_spm_test.go | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/model/process_text_spm_test.go b/model/process_text_spm_test.go
index 13e28cc5f..a43004db1 100644
--- a/model/process_text_spm_test.go
+++ b/model/process_text_spm_test.go
@@ -70,6 +70,14 @@ func TestSentencePieceEncode(t *testing.T) {
 			"请考试我的软件！12345",
 			"你好",
 			"Hello 你好 world!",
+			"Special characters: !@#$%^&*()_+-=[]{}|;':\",./<>?",
+			"Multilingual: 你好 こんにちは Привет Hola مرحبا",
+			"Numbers and symbols: 123456789 +- */",
+			"Special tokens: <bos> text <eos>",
+			"Code snippets: func main() { fmt.Println(\"Hello World\") }",
+			"Long text: " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " +
+				"Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " +
+				"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.",
 		}
 
 		for _, want := range cases {