diff --git a/convert/convert_qwen25vl.go b/convert/convert_qwen25vl.go
index 3237ab78e..ff9becd8a 100644
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -49,7 +49,7 @@ func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
 	kv["qwen25vl.vision.window_size"] = q.VisionModel.WindowSize
 	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
 	// RoPE theta increased from 1e4 to 1e5 to compensate for numerical differences between tensor operations; empirically produces better results.
-	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e5)
+	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
 	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
 	kv["qwen25vl.vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize
diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go
index 546e68b13..31911fa98 100644
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -84,11 +84,23 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 		visionEndToken int32 = 151653
 	)
 
+	nImg := 0
 	for _, inp := range inputs {
 		if inp.Multimodal == nil {
 			// If not a multimodal input, add it to the result unchanged
 			result = append(result, inp)
 		} else {
+			// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
+			// the image tokens with a prompt, so we add a prefix here
+			nImg++
+			pre, err := m.TextModel.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
+			if err != nil {
+				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
+			}
+			for i := range pre {
+				result = append(result, input.Input{Token: pre[i]})
+			}
+
 			// This is an image token with multimodal data
 			visionOutputs := inp.Multimodal.(ml.Tensor)