From 7bf793a6007ca11fae0180ea6f2ebd7258428bd4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 12 Mar 2025 16:59:23 -0700 Subject: [PATCH] gemma3: Allow multiple images in a single input Previously, processing multiple images in a batch would trigger segfaults, so sending images together was disabled as a way to mitigate this. The trigger was processing one image on the CPU and one on the GPU. This can no longer happen: - The vision encoder is now on the GPU so both images would be processed on the GPU. - We require images to be fully contained in a batch and each image, including its special tokens, is over half the batch size. As a result, we will never get two images in the same batch. Fixes #9731 --- server/prompt.go | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/server/prompt.go b/server/prompt.go index d053f2a8d..5b5b958f1 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -26,7 +26,6 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. var system []api.Message isMllama := checkMllamaModelFamily(m) - isGemma3 := checkGemma3ModelFamily(m) var imageNumTokens int // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent @@ -41,7 +40,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. n := len(msgs) - 1 // in reverse, find all messages that fit into context window for i := n; i >= 0; i-- { - if (isMllama || isGemma3) && len(msgs[i].Images) > 1 { + if isMllama && len(msgs[i].Images) > 1 { return "", nil, errTooManyImages } @@ -158,12 +157,3 @@ func checkMllamaModelFamily(m *Model) bool { } return false } - -func checkGemma3ModelFamily(m *Model) bool { - for _, arch := range m.Config.ModelFamilies { - if arch == "gemma3" { - return true - } - } - return false -}