From fb4664fcec8c205b48ea7edd0c6f760fc2c346b0 Mon Sep 17 00:00:00 2001
From: jmorganca
Date: Tue, 11 Mar 2025 18:54:00 +0100
Subject: [PATCH] model: add more spm tokenizer tests

---
 model/process_text_spm_test.go | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/model/process_text_spm_test.go b/model/process_text_spm_test.go
index 13e28cc5f..a43004db1 100644
--- a/model/process_text_spm_test.go
+++ b/model/process_text_spm_test.go
@@ -70,6 +70,14 @@ func TestSentencePieceEncode(t *testing.T) {
 		"请考试我的软件!12345",
 		"你好",
 		"Hello 你好 world!",
+		"Special characters: !@#$%^&*()_+-=[]{}|;':\",./<>?",
+		"Multilingual: 你好 こんにちは Привет Hola مرحبا",
+		"Numbers and symbols: 123456789 +- */",
+		"Special tokens: text ",
+		"Code snippets: func main() { fmt.Println(\"Hello World\") }",
+		"Long text: " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " +
+			"Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " +
+			"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.",
 	}
 
 	for _, want := range cases {