add picture prefix
This commit is contained in:
parent
2521a55ae6
commit
661bf04696
@ -49,7 +49,7 @@ func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
|
|||||||
kv["qwen25vl.vision.window_size"] = q.VisionModel.WindowSize
|
kv["qwen25vl.vision.window_size"] = q.VisionModel.WindowSize
|
||||||
kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
|
kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
|
||||||
// RoPE theta increased from 1e4 to 1e5 to compensate for numerical differences between tensor operations; empirically produces better results.
|
// RoPE theta falls back to 1e4 when the model config leaves it unset (a previous revision used 1e5 here to compensate for numerical differences between tensor operations).
|
||||||
kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e5)
|
kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
|
||||||
kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
|
kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
|
||||||
kv["qwen25vl.vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize
|
kv["qwen25vl.vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize
|
||||||
|
|
||||||
|
@ -84,11 +84,23 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
|||||||
visionEndToken int32 = 151653
|
visionEndToken int32 = 151653
|
||||||
)
|
)
|
||||||
|
|
||||||
|
nImg := 0
|
||||||
for _, inp := range inputs {
|
for _, inp := range inputs {
|
||||||
if inp.Multimodal == nil {
|
if inp.Multimodal == nil {
|
||||||
// If not a multimodal input, add it to the result unchanged
|
// If not a multimodal input, add it to the result unchanged
|
||||||
result = append(result, inp)
|
result = append(result, inp)
|
||||||
} else {
|
} else {
|
||||||
|
// Adding the 'Picture' prefix is a hack: at the time of writing there is no way to prefix
|
||||||
|
// the image tokens with a prompt, so we add a prefix here
|
||||||
|
nImg++
|
||||||
|
pre, err := m.TextModel.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to encode image prompt: %w", err)
|
||||||
|
}
|
||||||
|
for i := range pre {
|
||||||
|
result = append(result, input.Input{Token: pre[i]})
|
||||||
|
}
|
||||||
|
|
||||||
// This is an image token with multimodal data
|
// This is an image token with multimodal data
|
||||||
visionOutputs := inp.Multimodal.(ml.Tensor)
|
visionOutputs := inp.Multimodal.(ml.Tensor)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user