From fcfad744ffc00e287bf5ad073e857a58efb1ec14 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 29 Apr 2025 15:43:10 -0700 Subject: [PATCH] fix patch merger --- model/models/qwen25vl/model_vision.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/model/models/qwen25vl/model_vision.go b/model/models/qwen25vl/model_vision.go index f25d3e1aa..b69ab143f 100644 --- a/model/models/qwen25vl/model_vision.go +++ b/model/models/qwen25vl/model_vision.go @@ -148,19 +148,16 @@ type VisionPatchMerger struct { // Forward computes patch merging for the vision model func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor { - // Get dimensions - hiddenSize := visionOutputs.Dim(0) - numPositions := visionOutputs.Dim(1) - batchSize := visionOutputs.Dim(2) + normalized := pm.LNQ.Forward(ctx, visionOutputs, eps) - reshaped := pm.LNQ.Forward(ctx, visionOutputs, 1e6).Reshape(ctx, hiddenSize*4, numPositions/4, batchSize) + spatialMergeSize := 2 // This should come from config? + hiddenSize := visionOutputs.Dim(0) * (spatialMergeSize * spatialMergeSize) - // Apply first linear layer (mm_0_w, mm_0_b) + // Reshape the normalized output to view the hidden size dimension + // Similar to .view(-1, self.hidden_size) in PyTorch + reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(spatialMergeSize*spatialMergeSize), batchSize) hidden := pm.MLP0.Forward(ctx, reshaped) - activated := hidden.GELU(ctx) - - // Apply second linear layer (mm_1_w, mm_1_b) output := pm.MLP2.Forward(ctx, activated) return output