add picture prefix

2025-05-08 11:51:00 -07:00 · 2025-05-08 11:51:00 -07:00 · 661bf04696
commit 661bf04696
parent 2521a55ae6
2 changed files with 13 additions and 1 deletions
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@ -49,7 +49,7 @@ func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
 	kv["qwen25vl.vision.window_size"] = q.VisionModel.WindowSize
 	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
 	// Fall back to a RoPE theta of 1e4 when the vision config does not specify one.
-	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e5)
+	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
 	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
 	kv["qwen25vl.vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize

--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@ -84,11 +84,23 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 		visionEndToken   int32 = 151653
 	)

+	nImg := 0
 	for _, inp := range inputs {
 		if inp.Multimodal == nil {
 			// If not a multimodal input, add it to the result unchanged
 			result = append(result, inp)
 		} else {
+			// Prefixing each image with " Picture N: " is a hack: at the time of writing there is
+			// no way to attach a prompt prefix to the image tokens, so the prefix is injected here.
+			nImg++
+			pre, err := m.TextModel.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
+			if err != nil {
+				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
+			}
+			for i := range pre {
+				result = append(result, input.Input{Token: pre[i]})
+			}
+
 			// This is an image token with multimodal data
 			visionOutputs := inp.Multimodal.(ml.Tensor)