diff --git a/convert/convert_qwen25vl.go b/convert/convert_qwen25vl.go
index 412520b0d..4fc64cf0c 100644
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -12,17 +12,18 @@ type qwen25VLModel struct {
 	qwen2Model
 
 	VisionModel struct {
-		Depth             uint32  `json:"depth"`
-		HiddenSize        uint32  `json:"hidden_size"`
-		NumHeads          uint32  `json:"num_heads"`
-		InChannels        uint32  `json:"in_chans"`
-		PatchSize         uint32  `json:"patch_size"`
-		SpatialMergeSize  uint32  `json:"spatial_merge_size"`
-		SpatialPatchSize  uint32  `json:"spatial_patch_size"`
-		WindowSize        uint32  `json:"window_size"`
-		RMSNormEps        float32 `json:"layer_norm_epsilon"`
-		RopeTheta         float32 `json:"rope_theta"`
-		TemporalPatchSize uint32  `json:"temporal_patch_size"`
+		Depth               uint32   `json:"depth"`
+		HiddenSize          uint32   `json:"hidden_size"`
+		NumHeads            uint32   `json:"num_heads"`
+		InChannels          uint32   `json:"in_chans"`
+		PatchSize           uint32   `json:"patch_size"`
+		SpatialMergeSize    uint32   `json:"spatial_merge_size"`
+		SpatialPatchSize    uint32   `json:"spatial_patch_size"`
+		WindowSize          uint32   `json:"window_size"`
+		RMSNormEps          float32  `json:"layer_norm_epsilon"`
+		RopeTheta           float32  `json:"rope_theta"`
+		FullAttentionBlocks []uint32 `json:"fullatt_block_indexes"`
+		TemporalPatchSize   uint32   `json:"temporal_patch_size"`
 	} `json:"vision_config"`
 }
 
@@ -48,6 +49,7 @@ func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
 	kv["qwen25vl.vision.window_size"] = q.VisionModel.WindowSize
 	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
 	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e5)
+	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
 	kv["qwen25vl.vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize
 
 	return kv
diff --git a/model/models/qwen25vl/model_vision.go b/model/models/qwen25vl/model_vision.go
index d1f13eab5..b3db140e9 100644
--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@@ -3,6 +3,7 @@ package qwen25vl
 import (
 	"fmt"
 	"math"
+	"slices"
 
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/ml"
@@ -74,7 +75,10 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml
 	// Scale factor for scaled dot-product attention
 	scale := 1.0 / math.Sqrt(float64(opts.headDim))
 
-	mask := blockDiagonalMask(ctx, query.Dim(2), bounds, opts.numHeads)
+	var mask ml.Tensor
+	if bounds != nil {
+		mask = blockDiagonalMask(ctx, query.Dim(2), bounds, opts.numHeads)
+	}
 	attention := nn.Attention(ctx, query, key, value, scale, nil, nn.WithMask(mask))
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
 
@@ -128,6 +132,7 @@ type VisionModelOptions struct {
 	ropeTheta         float32
 	spatialMergeSize  int
 	windowSize        int
+	fullAttnBlocks    []int
 	temporalPatchSize int
 }
 
@@ -221,8 +226,14 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid)
 	sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1))
 
 	// Apply encoder layers
-	for _, layer := range m.Layers {
-		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, bounds, m.VisionModelOptions)
+	for i, layer := range m.Layers {
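+		// Blocks listed in fullAttnBlocks attend over the full sequence;
+		// passing nil bounds skips the block-diagonal window mask.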
c.Float("vision.rope.freq_base", 10000.0) spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2)) windowSize := int(c.Uint("vision.window_size", 112)) + fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31}) temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2)) - return &VisionModel{ + model := &VisionModel{ Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)), VisionModelOptions: &VisionModelOptions{ hiddenSize: hiddenSize, @@ -359,4 +369,11 @@ func newVisionModel(c fs.Config) *VisionModel { temporalPatchSize: temporalPatchSize, }, } + + for i := range fullAttnBlocks { + // full attention block indexes have to be converted to int for use with the slices package + model.fullAttnBlocks = append(model.fullAttnBlocks, int(fullAttnBlocks[i])) + } + + return model }