From 7bf793a6007ca11fae0180ea6f2ebd7258428bd4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 12 Mar 2025 16:59:23 -0700 Subject: [PATCH] gemma3: Allow multiple images in a single input Previously, processing multiple images in a batch would trigger segfaults, so sending images together was disabled as a way to mitigate this. The trigger was processing one image on the CPU and one on the GPU. This can no longer happen: - The vision encoder is now on the GPU so both images would be processed on the GPU. - We require images to be fully contained in a batch and each image, including its special tokens, is over half the batch size. As a result, we will never get two images in the same batch. Fixes #9731 --- server/prompt.go | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/server/prompt.go b/server/prompt.go index d053f2a8d..5b5b958f1 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -26,7 +26,6 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. var system []api.Message isMllama := checkMllamaModelFamily(m) - isGemma3 := checkGemma3ModelFamily(m) var imageNumTokens int // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent @@ -41,7 +40,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. n := len(msgs) - 1 // in reverse, find all messages that fit into context window for i := n; i >= 0; i-- { - if (isMllama || isGemma3) && len(msgs[i].Images) > 1 { + if isMllama && len(msgs[i].Images) > 1 { return "", nil, errTooManyImages } @@ -158,12 +157,3 @@ func checkMllamaModelFamily(m *Model) bool { } return false } - -func checkGemma3ModelFamily(m *Model) bool { - for _, arch := range m.Config.ModelFamilies { - if arch == "gemma3" { - return true - } - } - return false -}