full attention layers

This commit is contained in:
parent bde6b46ce9
commit 359e1d5b19
@@ -12,17 +12,18 @@ type qwen25VLModel struct {
 	qwen2Model
 
 	VisionModel struct {
 		Depth             uint32  `json:"depth"`
 		HiddenSize        uint32  `json:"hidden_size"`
 		NumHeads          uint32  `json:"num_heads"`
 		InChannels        uint32  `json:"in_chans"`
 		PatchSize         uint32  `json:"patch_size"`
 		SpatialMergeSize  uint32  `json:"spatial_merge_size"`
 		SpatialPatchSize  uint32  `json:"spatial_patch_size"`
 		WindowSize        uint32  `json:"window_size"`
 		RMSNormEps        float32 `json:"layer_norm_epsilon"`
 		RopeTheta         float32 `json:"rope_theta"`
+		FullAttentionBlocks []uint32 `json:"fullatt_block_indexes"`
 		TemporalPatchSize uint32  `json:"temporal_patch_size"`
 	} `json:"vision_config"`
 }
 
@@ -48,6 +49,7 @@ func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
 	kv["qwen25vl.vision.window_size"] = q.VisionModel.WindowSize
 	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
 	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e5)
+	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
 	kv["qwen25vl.vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize
 
 	return kv
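For context on the two converter hunks above: the new FullAttentionBlocks field reads the HF vision config's fullatt_block_indexes array, and KV() re-emits it as the qwen25vl.vision.fullatt_block_indexes metadata key. Below is a minimal, self-contained sketch of the JSON side of that mapping; the visionConfig type and the sample document are stand-ins for illustration, with {7, 15, 23, 31} mirroring the fallback used later in newVisionModel.

// Sketch only: a stand-in struct showing how the json tag maps the HF config's
// "fullatt_block_indexes" array onto a []uint32 field, as the converter's
// VisionModel struct now does. Not the converter's actual code.
package main

import (
	"encoding/json"
	"fmt"
)

type visionConfig struct {
	Depth               uint32   `json:"depth"`
	FullAttentionBlocks []uint32 `json:"fullatt_block_indexes"`
}

func main() {
	// Sample values; {7, 15, 23, 31} matches the default in newVisionModel.
	raw := []byte(`{"depth": 32, "fullatt_block_indexes": [7, 15, 23, 31]}`)

	var cfg visionConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.FullAttentionBlocks) // [7 15 23 31]
}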
@@ -3,6 +3,7 @@ package qwen25vl
 import (
 	"fmt"
 	"math"
+	"slices"
 
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/ml"
@@ -74,7 +75,10 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml
 
 	// Scale factor for scaled dot-product attention
 	scale := 1.0 / math.Sqrt(float64(opts.headDim))
-	mask := blockDiagonalMask(ctx, query.Dim(2), bounds, opts.numHeads)
+	var mask ml.Tensor
+	if bounds != nil {
+		mask = blockDiagonalMask(ctx, query.Dim(2), bounds, opts.numHeads)
+	}
 
 	attention := nn.Attention(ctx, query, key, value, scale, nil, nn.WithMask(mask))
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
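Background for this hunk, as a hedged sketch rather than ollama's actual blockDiagonalMask: window attention is realized as a block-diagonal additive mask, where a token may only attend to tokens in the same window, so passing a nil mask to nn.Attention yields unrestricted (full) attention. That is why the mask is now built only when bounds are present. The bounds layout below (cumulative window end offsets) is an assumption made for illustration.

// Illustrative sketch only (not ollama's blockDiagonalMask): builds an additive
// attention mask that is 0 inside each window block and -Inf everywhere else.
// bounds is assumed to hold cumulative window end offsets, e.g. []int{0, 2, 6}.
package main

import (
	"fmt"
	"math"
)

func blockDiagonalMaskSketch(seqLen int, bounds []int) [][]float32 {
	mask := make([][]float32, seqLen)
	for i := range mask {
		mask[i] = make([]float32, seqLen)
		for j := range mask[i] {
			mask[i][j] = float32(math.Inf(-1)) // blocked by default
		}
	}
	// Tokens may attend only to tokens inside the same window.
	for w := 1; w < len(bounds); w++ {
		for i := bounds[w-1]; i < bounds[w]; i++ {
			for j := bounds[w-1]; j < bounds[w]; j++ {
				mask[i][j] = 0
			}
		}
	}
	return mask
}

func main() {
	m := blockDiagonalMaskSketch(6, []int{0, 2, 6})
	for _, row := range m {
		fmt.Println(row) // a 2x2 and a 4x4 zero block on the diagonal
	}
}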
@@ -128,6 +132,7 @@ type VisionModelOptions struct {
 	ropeTheta         float32
 	spatialMergeSize  int
 	windowSize        int
+	fullAttnBlocks    []int
 	temporalPatchSize int
 }
 
@@ -221,8 +226,12 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid)
 	sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1))
 
 	// Apply encoder layers
-	for _, layer := range m.Layers {
-		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, bounds, m.VisionModelOptions)
+	for i, layer := range m.Layers {
+		if slices.Contains(m.fullAttnBlocks, i) {
+			hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions)
+		} else {
+			hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, bounds, m.VisionModelOptions)
+		}
 	}
 
 	return m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions)
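To make the new control flow concrete, here is a minimal standalone sketch (not ollama code; it assumes the default 32 encoder blocks and the default full-attention indexes from newVisionModel) of which layers end up with full versus windowed attention:

// Standalone sketch: mirrors the slices.Contains decision in VisionModel.Forward
// using the default full-attention indexes {7, 15, 23, 31}.
package main

import (
	"fmt"
	"slices"
)

func main() {
	fullAttnBlocks := []int{7, 15, 23, 31} // defaults in newVisionModel
	const blockCount = 32                  // default vision.block_count

	for i := 0; i < blockCount; i++ {
		if slices.Contains(fullAttnBlocks, i) {
			fmt.Printf("layer %2d: full attention (nil bounds)\n", i)
		} else {
			fmt.Printf("layer %2d: windowed attention (block-diagonal mask)\n", i)
		}
	}
}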
@@ -342,9 +351,10 @@ func newVisionModel(c fs.Config) *VisionModel {
 	ropeTheta := c.Float("vision.rope.freq_base", 10000.0)
 	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
 	windowSize := int(c.Uint("vision.window_size", 112))
+	fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31})
 	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
 
-	return &VisionModel{
+	model := &VisionModel{
 		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
 		VisionModelOptions: &VisionModelOptions{
 			hiddenSize: hiddenSize,
@@ -359,4 +369,11 @@ func newVisionModel(c fs.Config) *VisionModel {
 			temporalPatchSize: temporalPatchSize,
 		},
 	}
+
+	for i := range fullAttnBlocks {
+		// full attention block indexes have to be converted to int for use with the slices package
+		model.fullAttnBlocks = append(model.fullAttnBlocks, int(fullAttnBlocks[i]))
+	}
+
+	return model
 }
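A short aside on the conversion loop above (a standalone sketch, not part of the commit): slices.Contains is generic over the slice's element type, so searching the raw []int32 config values with the encoder loop's int index would not compile; converting to []int once at construction keeps the per-layer check simple.

// Standalone sketch: why the []int32 config values are converted to []int.
package main

import (
	"fmt"
	"slices"
)

func main() {
	fromConfig := []int32{7, 15, 23, 31}
	i := 7 // an int loop index, as in VisionModel.Forward

	// slices.Contains(fromConfig, i) would fail to compile: int vs int32.
	asInt := make([]int, 0, len(fromConfig))
	for _, v := range fromConfig {
		asInt = append(asInt, int(v))
	}

	fmt.Println(slices.Contains(asInt, i)) // true
}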