diff --git a/convert/convert_qwen25vl.go b/convert/convert_qwen25vl.go index 4fc64cf0c..c71a096aa 100644 --- a/convert/convert_qwen25vl.go +++ b/convert/convert_qwen25vl.go @@ -12,18 +12,18 @@ type qwen25VLModel struct { qwen2Model VisionModel struct { - Depth uint32 `json:"depth"` - HiddenSize uint32 `json:"hidden_size"` - NumHeads uint32 `json:"num_heads"` - InChannels uint32 `json:"in_chans"` - PatchSize uint32 `json:"patch_size"` - SpatialMergeSize uint32 `json:"spatial_merge_size"` - SpatialPatchSize uint32 `json:"spatial_patch_size"` - WindowSize uint32 `json:"window_size"` - RMSNormEps float32 `json:"layer_norm_epsilon"` - RopeTheta float32 `json:"rope_theta"` - FullAttentionBlocks []uint32 `json:"fullatt_block_indexes"` - TemporalPatchSize uint32 `json:"temporal_patch_size"` + Depth uint32 `json:"depth"` + HiddenSize uint32 `json:"hidden_size"` + NumHeads uint32 `json:"num_heads"` + InChannels uint32 `json:"in_chans"` + PatchSize uint32 `json:"patch_size"` + SpatialMergeSize uint32 `json:"spatial_merge_size"` + SpatialPatchSize uint32 `json:"spatial_patch_size"` + WindowSize uint32 `json:"window_size"` + RMSNormEps float32 `json:"layer_norm_epsilon"` + RopeTheta float32 `json:"rope_theta"` + FullAttentionBlocks []int32 `json:"fullatt_block_indexes"` + TemporalPatchSize uint32 `json:"temporal_patch_size"` } `json:"vision_config"` } diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go index 2d938b707..552c38cc0 100644 --- a/model/models/qwen25vl/model.go +++ b/model/models/qwen25vl/model.go @@ -69,7 +69,6 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { var result []input.Input - // Get image token IDs from config var ( imageToken int32 = 151655 visionStartToken int32 = 151652