diff --git a/llm/memory.go b/llm/memory.go
index a65096ae9..e78d46fd1 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -85,8 +85,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var graphOffload uint64
 
 	// Projectors loaded into GPU0 only
-	var projectorWeights uint64
-	var projectorGraph uint64
+	var llamaEngineProjectorWeights uint64
+	var llamaEngineProjectorGraph uint64
+
+	// Projectors loaded with output layer
+	var ollamaEngineProjectorWeights uint64
+	var ollamaEngineProjectorGraph uint64
 
 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
@@ -112,14 +116,15 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 
 	for _, projector := range projectors {
 		weight, graph := projectorMemoryRequirements(projector)
-		projectorWeights += weight
-		projectorGraph += graph
+		llamaEngineProjectorWeights += weight
+		llamaEngineProjectorGraph += graph
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
-	if projectorWeights == 0 && projectorGraph == 0 {
-		projectorWeights, projectorGraph = f.VisionGraphSize()
+	if llamaEngineProjectorWeights == 0 && llamaEngineProjectorGraph == 0 {
+		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
+		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
 	layers := f.Tensors().GroupLayers()
@@ -164,6 +169,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		graphFullOffload = graphPartialOffload
 	}
 
+	// Output layer handled at the end if we have space
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.Size()
 	}
@@ -173,8 +179,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput += layer.Size()
 	}
 
-	// Output layer handled at the end if we have space
-	gpuZeroOverhead := projectorWeights + projectorGraph
+	gpuZeroOverhead := llamaEngineProjectorWeights + llamaEngineProjectorGraph
 
 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -259,13 +264,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 
 	// Determine if we need to consider output then find where it fits
-	if memoryLayerOutput > 0 {
+	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
+	if memoryLastLayer > 0 {
 		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
 			for j := len(gpusWithSpace); j > 0; j-- {
 				g := gpusWithSpace[layerCount%j]
 				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-				if g.g.FreeMemory > overhead+used+memoryLayerOutput {
-					gpuAllocations[g.i] += memoryLayerOutput
+				if g.g.FreeMemory > overhead+used+memoryLastLayer {
+					gpuAllocations[g.i] += memoryLastLayer
 					layerCounts[g.i]++
 					layerCount++
 					break
@@ -275,7 +281,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 
 		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
-			overflow += memoryLayerOutput
+			overflow += memoryLastLayer
 		}
 	}
 
@@ -333,8 +339,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput:   memoryLayerOutput,
 		graphFullOffload:    graphFullOffload,
 		graphPartialOffload: graphPartialOffload,
-		projectorWeights:    projectorWeights,
-		projectorGraph:      projectorGraph,
+		projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
+		projectorGraph:      llamaEngineProjectorGraph + ollamaEngineProjectorGraph,
 	}
 
 	if gpus[0].Library == "cpu" {