diff --git a/convert/convert_qwen2.go b/convert/convert_qwen2.go
index edcb82e29..48b92bbdd 100644
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@@ -15,6 +15,7 @@ type qwen2Model struct {
 		Type                          string     `json:"type"`
 		Factor                        ropeFactor `json:"factor"`
 		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
+		MropeSection                  []int32    `json:"mrope_section"`
 	} `json:"rope_scaling"`
 	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@@ -39,6 +40,8 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	case "yarn":
 		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
 		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
+	case "mrope":
+		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
 	default:
 		panic("unknown rope scaling type")
 	}
diff --git a/convert/convert_qwen25vl.go b/convert/convert_qwen25vl.go
index ab1e27890..4d8d248ca 100644
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -1,31 +1,26 @@
 package convert
 
 import (
-	"bytes"
-	"encoding/binary"
-	"io"
-	"log/slog"
+	"cmp"
+	"slices"
 	"strings"
 
 	"github.com/ollama/ollama/fs/ggml"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-	"github.com/x448/float16"
 )
 
 type qwen25VLModel struct {
-	ModelParameters
-	HiddenSize            uint32  `json:"hidden_size"`
-	IntermediateSize      uint32  `json:"intermediate_size"`
-	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
-	NumAttentionHeads     uint32  `json:"num_attention_heads"`
-	HiddenLayers          uint32  `json:"num_hidden_layers"`
-	RopeTheta             float32 `json:"rope_theta"`
-	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
-	RMSNormEPS            float32 `json:"rms_norm_eps"`
+	qwen2Model
 
 	VisionModel struct {
-		SpatialMergeSize uint32 `json:"spatial_merge_size"` // TODO: is this set?
+		Depth            uint32  `json:"depth"`
+		HiddenSize       uint32  `json:"hidden_size"`
+		IntermediateSize uint32  `json:"intermediate_size"`
+		InChannels       uint32  `json:"in_chans"`
+		NumHeads         uint32  `json:"num_heads"`
+		PatchSize        uint32  `json:"patch_size"`
+		SpatialMergeSize uint32  `json:"spatial_merge_size"` // TODO: is this set?
+		SpatialPatchSize uint32  `json:"spatial_patch_size"`
+		RopeTheta        float32 `json:"rope_theta"`
 	} `json:"vision_config"`
 }
@@ -34,14 +29,22 @@ var _ ModelConverter = (*qwen25VLModel)(nil)
 func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen25vl"
-	kv["qwen25vl.block_count"] = q.HiddenLayers
-	kv["qwen25vl.context_length"] = q.MaxPositionEmbeddings
-	kv["qwen25vl.embedding_length"] = q.HiddenSize
-	kv["qwen25vl.feed_forward_length"] = q.IntermediateSize
-	kv["qwen25vl.attention.head_count"] = q.NumAttentionHeads
-	kv["qwen25vl.attention.head_count_kv"] = q.NumKeyValueHeads
-	kv["qwen25vl.rope.freq_base"] = q.RopeTheta
-	kv["qwen25vl.attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
+
+	for k, v := range q.qwen2Model.KV(t) {
+		if strings.HasPrefix(k, "qwen2.") {
+			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
+		}
+	}
+
+	kv["qwen25vl.vision.block_count"] = q.VisionModel.Depth
+	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
+	kv["qwen25vl.vision.feed_forward_length"] = q.VisionModel.IntermediateSize
+	kv["qwen25vl.vision.attention.head_count"] = q.VisionModel.NumHeads
+	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
+	kv["qwen25vl.vision.patch_size"] = q.VisionModel.PatchSize
+	kv["qwen25vl.vision.spatial_merge_size"] = q.VisionModel.SpatialMergeSize
+	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
+	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e5)
 
 	return kv
 }
@@ -50,11 +53,20 @@ func (q *qwen25VLModel) Tensors(ts []Tensor) []ggml.Tensor {
 	var out []ggml.Tensor
 
 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "patch_embed.proj.weight") {
-			var buf bytes.Buffer
-			t.WriteTo(&buf)
-			newTensors := splitPatchEmbed(buf, t.Kind(), t.Shape())
-			out = append(out, newTensors...)
+		if strings.Contains(t.Name(), "patch_embed.proj") {
+			for t := range splitDim(t, 2,
+				strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
+				strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
+			) {
+				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
+				out = append(out, t)
+			}
+		} else if strings.Contains(t.Name(), "attn.qkv") {
+			out = append(out, slices.Collect(splitDim(t, 0,
+				strings.NewReplacer("attn.qkv", "attn_q"),
+				strings.NewReplacer("attn.qkv", "attn_k"),
+				strings.NewReplacer("attn.qkv", "attn_v"),
+			))...)
 		} else {
 			out = append(out, ggml.Tensor{
 				Name: t.Name(),
@@ -69,109 +81,12 @@ func (q *qwen25VLModel) Tensors(ts []Tensor) []ggml.Tensor {
 }
 
 func (p *qwen25VLModel) Replacements() []string {
-	return []string{
-		"lm_head", "output",
-		"model.embed_tokens", "token_embd",
-		"model.layers", "blk",
-		"visual.blocks", "v.blk",
-		"input_layernorm", "attn_norm",
-		"self_attn.k_proj", "attn_k",
-		"self_attn.v_proj", "attn_v",
-		"self_attn.q_proj", "attn_q",
-		"self_attn.o_proj", "attn_output",
-		"mlp.down_proj", "ffn_down",
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.up_proj", "ffn_up",
-		"post_attention_layernorm", "ffn_norm",
-		"model.norm", "output_norm",
-	}
-}
-
-func splitPatchEmbed(buf bytes.Buffer, kind uint32, shape []uint64) []ggml.Tensor {
-	slog.Debug("patch stuff", "kind", kind, "shape", shape)
-
-	if kind != tensorKindF16 {
-		panic("tensor is of wrong type")
-	}
-
-	if len(shape) != 5 || (len(shape) == 5 && shape[2] != 2) {
-		panic("wrong sized tensor")
-	}
-
-	// determine the size of the tensor based on its shape
-	shapeToSize := func(s []int) int {
-		r := 1
-		for _, n := range s {
-			r *= int(n)
-		}
-		return r
-	}
-
-	// tensor.WithShape() wants []int
-	intShape := make([]int, len(shape))
-	for i, v := range shape {
-		intShape[i] = int(v)
-	}
-
-	u16s := make([]uint16, shapeToSize(intShape))
-	if err := binary.Read(&buf, binary.LittleEndian, u16s); err != nil {
-		panic("bad read")
-	}
-
-	f32s := make([]float32, len(u16s))
-	for i := range u16s {
-		f32s[i] = float16.Frombits(u16s[i]).Float32()
-	}
-
-	newTensors := []ggml.Tensor{}
-
-	getDataFromSlice := func(f32s []float32, shape []int, s []tensor.Slice) patchEmbed {
-		slog.Debug("getDataFromSlice", "num f32s", len(f32s), "shape", shape)
-		n := tensor.New(tensor.WithShape(shape...), tensor.WithBacking(f32s))
-		t, err := n.Slice(s...)
-		if err != nil {
-			panic(err)
-		}
-
-		ts, err := native.SelectF32(t.Materialize().(*tensor.Dense), 0)
-		if err != nil {
-			panic(err)
-		}
-
-		slog.Debug("first vals", "val 1", ts[0][0], "val 2", ts[0][1], "val 3", ts[0][2])
-
-		f16s := make(patchEmbed, shapeToSize(shape))
-		for r, row := range ts {
-			for c, col := range row {
-				f16s[r+c] = float16.Fromfloat32(col).Bits()
-			}
-		}
-
-		return f16s
-	}
-
-	p := getDataFromSlice(f32s, intShape, []tensor.Slice{nil, nil, tensor.S(0, 1, 1), nil, nil})
-	newTensors = append(newTensors, ggml.Tensor{
-		Name:     "v.patch_embed_0.weight",
-		Kind:     kind,
-		Shape:    append(shape[:2], shape[3:]...),
-		WriterTo: p,
-	})
-
-	p = getDataFromSlice(f32s, intShape, []tensor.Slice{nil, nil, tensor.S(1, 2, 1), nil, nil})
-	newTensors = append(newTensors, ggml.Tensor{
-		Name:     "v.patch_embed_1.weight",
-		Kind:     kind,
-		Shape:    append(shape[:2], shape[3:]...),
-		WriterTo: p,
-	})
-
-	return newTensors
-}
-
-type patchEmbed []uint16
-
-func (t patchEmbed) WriteTo(w io.Writer) (int64, error) {
-	err := binary.Write(w, binary.LittleEndian, t)
-	return 0, err
+	return append(
+		p.qwen2Model.Replacements(),
+		"visual", "v",
+		"blocks", "blk",
+		"attn.proj", "attn_out",
+		"norm1", "ln1",
+		"norm2", "ln2",
+	)
 }
diff --git a/convert/tensor.go b/convert/tensor.go
new file mode 100644
index 000000000..258ffe1dd
--- /dev/null
+++ b/convert/tensor.go
@@ -0,0 +1,56 @@
+package convert
+
+import (
+	"iter"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+)
+
+// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
+// is split evenly based on the number of replacers provided.
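+//
+// For example, the fused attention weight handled in convert_qwen25vl.go above is
+// split along its first dimension into separate query, key and value tensors
+// (a sketch of that call):
+//
+//	out = append(out, slices.Collect(splitDim(t, 0,
+//		strings.NewReplacer("attn.qkv", "attn_q"),
+//		strings.NewReplacer("attn.qkv", "attn_k"),
+//		strings.NewReplacer("attn.qkv", "attn_v"),
+//	))...)
+//
+// Each yielded tensor shares the original data; the slice is only taken by the
+// repacker when the tensor is written out.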
+func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[ggml.Tensor] {
+	return func(yield func(ggml.Tensor) bool) {
+		for i, replacer := range replacers {
+			shape := slices.Clone(t.Shape())
+			shape[dim] = shape[dim] / uint64(len(replacers))
+
+			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
+			slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
+
+			tt := t.Clone()
+			tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
+				dims := make([]int, len(shape))
+				for i := range shape {
+					dims[i] = int(shape[i])
+				}
+
+				var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+				t, err := t.Slice(slice...)
+				if err != nil {
+					return nil, err
+				}
+
+				t = tensor.Materialize(t)
+				// flatten tensor so it can be written as a vector
+				if err := t.Reshape(t.Shape().TotalSize()); err != nil {
+					return nil, err
+				}
+
+				return native.VectorF32(t.(*tensor.Dense))
+			})
+
+			if !yield(ggml.Tensor{
+				Name:     replacer.Replace(t.Name()),
+				Kind:     t.Kind(),
+				Shape:    shape,
+				WriterTo: tt,
+			}) {
+				break
+			}
+		}
+	}
+}
diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go
index b8894616d..5812b08eb 100644
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -4,11 +4,11 @@ import (
 	"bytes"
 	"fmt"
 	"image"
+	"slices"
 
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )
@@ -17,7 +17,6 @@ type Model struct {
 	model.Base
 	*TextModel
 	*VisionModel `gguf:"v,vision"`
-	*PatchMerger `gguf:"mm"`
 
 	ImageProcessor
 }
@@ -25,31 +24,6 @@ type Model struct {
 // Implement MultimodalProcessor interface
 var _ model.MultimodalProcessor = (*Model)(nil)
 
-type PatchMerger struct {
-	MLPLayer1 *nn.Linear `gguf:"0"`
-	MLPLayer2 *nn.Linear `gguf:"2"`
-}
-
-// Forward computes patch merging for the vision model
-func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
-	// Get dimensions
-	hiddenSize := visionOutputs.Dim(0)
-	numPositions := visionOutputs.Dim(1)
-	batchSize := visionOutputs.Dim(2)
-
-	reshaped := visionOutputs.Reshape(ctx, hiddenSize*4, numPositions/4, batchSize)
-
-	// Apply first linear layer (mm_0_w, mm_0_b)
-	hidden := pm.MLPLayer1.Forward(ctx, reshaped)
-
-	activated := hidden.GELU(ctx)
-
-	// Apply second linear layer (mm_1_w, mm_1_b)
-	output := pm.MLPLayer2.Forward(ctx, activated)
-
-	return output
-}
-
 func New(c fs.Config) (model.Model, error) {
 	m := &Model{
 		TextModel:      NewTextModel(c),
@@ -62,11 +36,6 @@ func New(c fs.Config) (model.Model, error) {
 	return m, nil
 }
 
-type imageFeatures struct {
-	Tensor ml.Tensor
-	Grid   *Grid
-}
-
 func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
@@ -93,12 +62,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	}
 
 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues, grid)
-	visionOutputs = m.PatchMerger.Forward(ctx, visionOutputs, m.VisionModel.eps)
-
-	return &imageFeatures{
-		Tensor: visionOutputs,
-		Grid:   grid,
-	}, nil
+	return visionOutputs, nil
 }
 
 // PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
@@ -106,12 +70,11 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 
 	// Get image token IDs from config
-	imageToken := 151655
-	visionStartToken := 151652
-	visionEndToken := 151653
-
-	// Get merge size from config
-	mergeSize := m.ImageProcessor.mergeSize
+	var (
+		imageToken       int32 = 151655
+		visionStartToken int32 = 151652
+		visionEndToken   int32 = 151653
+	)
 
 	for _, inp := range inputs {
 		if inp.Multimodal == nil {
@@ -119,29 +82,20 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 			result = append(result, inp)
 		} else {
 			// This is an image token with multimodal data
-			features := inp.Multimodal.(*imageFeatures)
+			visionOutputs := inp.Multimodal.(ml.Tensor)
 
 			// Calculate tokens per grid based on grid dimensions
-			mergeLength := mergeSize * mergeSize
-			gridProduct := features.Grid.Temporal * features.Grid.Height * features.Grid.Width
-			tokensPerGrid := gridProduct / mergeLength
 
 			// First add the vision start token
-			result = append(result, input.Input{Token: int32(visionStartToken)})
+			result = append(result, input.Input{Token: visionStartToken, SameBatch: visionOutputs.Dim(1) + 2})
 
 			// Add the image token with the multimodal tensor data at the first position
-			result = append(result, input.Input{
-				Token:          int32(imageToken),
-				Multimodal:     features.Tensor,
-				MultimodalHash: inp.MultimodalHash,
-			})
+			result = append(result, input.Input{Token: imageToken, Multimodal: visionOutputs, MultimodalHash: inp.MultimodalHash})
 
 			// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
-			for range tokensPerGrid - 1 {
-				result = append(result, input.Input{Token: int32(imageToken)})
-			}
+			result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, visionOutputs.Dim(1)-1)...)
 
-			result = append(result, input.Input{Token: int32(visionEndToken)})
+			result = append(result, input.Input{Token: visionEndToken})
 		}
 	}
 
diff --git a/model/models/qwen25vl/model_text.go b/model/models/qwen25vl/model_text.go
index 549cc139b..18ba79e84 100644
--- a/model/models/qwen25vl/model_text.go
+++ b/model/models/qwen25vl/model_text.go
@@ -148,6 +148,11 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	// Initial token embedding
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
 
+	for _, image := range batch.Multimodal {
+		visionOutputs := image.Multimodal.(ml.Tensor)
+		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
+	}
+
 	// Process through transformer layers
 	for i, layer := range m.Layers {
 		cache.SetLayer(i)
diff --git a/model/models/qwen25vl/model_vision.go b/model/models/qwen25vl/model_vision.go
index e2ef9a222..ba615814e 100644
--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@@ -70,8 +70,8 @@ func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Visi
 
 // VisionEncoderLayer implements an encoder layer for the Qwen vision model
 type VisionEncoderLayer struct {
 	Norm1         *nn.RMSNorm `gguf:"ln1"`
-	Norm2         *nn.RMSNorm `gguf:"ln2"`
 	SelfAttention *VisionSelfAttention
+	Norm2         *nn.RMSNorm `gguf:"ln2"`
 	MLP           *VisionMLP
 }
@@ -138,30 +138,36 @@ func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, numChan
 
 // VisionPatchMerger implements patch merging for the Qwen vision model
 type VisionPatchMerger struct {
-	LNQ *nn.RMSNorm `gguf:"ln_q"`
-	MLP *nn.Linear  `gguf:"mlp"`
+	LNQ  *nn.RMSNorm `gguf:"ln_q"`
+	MLP0 *nn.Linear  `gguf:"mlp.0"`
+	MLP2 *nn.Linear  `gguf:"mlp.2"`
 }
 
 // Forward computes patch merging for the vision model
-func (pm *VisionPatchMerger) Forward(ctx ml.Context, x ml.Tensor, outDim, contextDim, spatialMergeSize int) ml.Tensor {
-	hiddenSize := contextDim * (spatialMergeSize * spatialMergeSize)
+func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
+	// Get dimensions
+	hiddenSize := visionOutputs.Dim(0)
+	numPositions := visionOutputs.Dim(1)
+	batchSize := visionOutputs.Dim(2)
 
-	// Normalize and reshape
-	x = pm.LNQ.Forward(ctx, x, 1e-6)
-	x = x.Reshape(ctx, -1, hiddenSize)
+	reshaped := pm.LNQ.Forward(ctx, visionOutputs, eps).Reshape(ctx, hiddenSize*4, numPositions/4, batchSize)
 
-	// Apply MLP for merging
-	x = pm.MLP.Forward(ctx, x)
+	// Apply first linear layer (mm_0_w, mm_0_b)
+	hidden := pm.MLP0.Forward(ctx, reshaped)
 
-	return x
+	activated := hidden.GELU(ctx)
+
+	// Apply second linear layer (mm_1_w, mm_1_b)
+	output := pm.MLP2.Forward(ctx, activated)
+
+	return output
 }
 
 // VisionModel implements the Qwen vision model
 type VisionModel struct {
 	PatchEmbedding *PatchEmbedding
 	Layers         []VisionEncoderLayer `gguf:"blk"`
-	PostLayerNorm  *nn.LayerNorm        `gguf:"post_ln"`
-	PatchMerger    *VisionPatchMerger   `gguf:"patch_merger"`
+	PatchMerger    *VisionPatchMerger   `gguf:"merger"`
 
 	*VisionModelOptions
 }
@@ -187,8 +193,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid)
 		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, m.VisionModelOptions)
 	}
 
-	// hiddenStates = m.PostLayerNorm.Forward(ctx, hiddenStates, m.eps)
-	return hiddenStates
+	return m.PatchMerger.Forward(ctx, hiddenStates, m.eps)
 }
 
 // positionalEmbedding generates rotary position embeddings for attention mechanisms
@@ -248,7 +253,7 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 
 func newVisionModel(c fs.Config) *VisionModel {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
-	ropeTheta := c.Float("vision.rope_theta", 10000.0) // not set
+	ropeTheta := c.Float("vision.rope.freq_base", 10000.0) // not set
 	outHiddenSize := int(c.Uint("vision.out_embedding_length", 0)) // not set
 	numHeads := int(c.Uint("vision.attention.head_count", 16))
diff --git a/model/models/qwen25vl/process_image.go b/model/models/qwen25vl/process_image.go
index a7bacf155..63c75038b 100644
--- a/model/models/qwen25vl/process_image.go
+++ b/model/models/qwen25vl/process_image.go
@@ -26,13 +26,13 @@ type ImageProcessor struct {
 
 // newImageProcessor creates a new image processor with default values
 func newImageProcessor(c fs.Config) ImageProcessor {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
 
 	return ImageProcessor{
 		imageSize:         int(c.Uint("vision.image_size", 560)),
-		numChannels:       3,
+		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
 		patchSize:         patchSize,
 		temporalPatchSize: 2,
 		mergeSize:         mergeSize,