From 00fa70bb269e021731e6a858d14db6bd1dae142b Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 13 May 2025 13:04:20 -0700
Subject: [PATCH] llm: Consistently track unassigned model data

In some cases, if we fail to assign a piece of the model to a GPU then
we lose track of this data. Although it doesn't change the memory
allocation, it does affect the total size of the model reported by
tools such as ollama ps (and also the percent offloaded).

This makes it look like setting num_gpu isn't reflected in ollama ps.
It is, but the offload percentage may appear not to change. Spreading
the model across more GPUs will continue to affect the reported total
size of the model.
---
 llm/memory.go | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/llm/memory.go b/llm/memory.go
index 76082bf7c..a65096ae9 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -217,6 +217,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	if len(gpusWithSpace) > 0 {
 		gpuZeroID = gpusWithSpace[0].i
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	} else {
+		overflow += gpuZeroOverhead
 	}
 
 	// For all the layers, find where they can fit on the GPU(s)
@@ -257,15 +259,17 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 
 	// Determine if we need to consider output then find where it fits
-	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
-		for j := len(gpusWithSpace); j > 0; j-- {
-			g := gpusWithSpace[layerCount%j]
-			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
-				gpuAllocations[g.i] += memoryLayerOutput
-				layerCounts[g.i]++
-				layerCount++
-				break
+	if memoryLayerOutput > 0 {
+		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
+			for j := len(gpusWithSpace); j > 0; j-- {
+				g := gpusWithSpace[layerCount%j]
+				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+				if g.g.FreeMemory > overhead+used+memoryLayerOutput {
+					gpuAllocations[g.i] += memoryLayerOutput
+					layerCounts[g.i]++
+					layerCount++
+					break
+				}
 			}
 		}
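
To make the control flow in the second hunk easier to follow, here is a
minimal, self-contained Go sketch of the placement loop. The gpu struct,
freeMemory field, and the concrete sizes below are hypothetical stand-ins
for ollama's discover.GpuInfo plumbing, and the final overflow accounting
is an assumption based on the commit message, since the tail of the hunk
is cut off above.

package main

import "fmt"

// gpu is a hypothetical stand-in for the (index, discover.GpuInfo)
// pairs that gpusWithSpace holds in llm/memory.go.
type gpu struct {
	i          int    // index into gpuAllocations
	freeMemory uint64 // bytes still free on this GPU
}

func main() {
	gpusWithSpace := []gpu{{0, 6 << 30}, {1, 2 << 30}}
	gpuAllocations := []uint64{3 << 30, 1 << 30}

	var (
		memoryLayerOutput uint64 = 1 << 30   // size of the output layer
		graphOverhead     uint64 = 512 << 20 // stand-in for max(graphPartialOffload, graphFullOffload)
		layerCount               = 10        // repeating layers already placed
		overflow          uint64
	)

	placed := false
	// As in the patch: shrink the candidate set from the back and pick
	// the starting GPU with layerCount%j, so the output layer rotates
	// across GPUs rather than always landing on the first one.
	for j := len(gpusWithSpace); j > 0; j-- {
		g := gpusWithSpace[layerCount%j]
		used := gpuAllocations[g.i] + graphOverhead
		if g.freeMemory > used+memoryLayerOutput {
			gpuAllocations[g.i] += memoryLayerOutput
			layerCount++
			placed = true
			break
		}
	}

	// The point of the patch: data that fits nowhere still counts
	// toward the model's reported size (assumed from the commit
	// message, mirroring the gpuZeroOverhead hunk above).
	if !placed {
		overflow += memoryLayerOutput
	}

	fmt.Println("allocations:", gpuAllocations, "overflow:", overflow)
}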
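
And a companion sketch of why the lost accounting showed up in ollama ps:
if unassigned data never reaches overflow, the reported total shrinks and
the offload percentage looks better than it really is. The formula below
(assigned plus overflow as the reported total) is an assumption drawn from
the commit message, not a quote of the actual estimate code.

package main

import "fmt"

func main() {
	gpuAllocations := []uint64{4 << 30, 3 << 30} // bytes assigned per GPU
	var overflow uint64 = 2 << 30                // data that fit on no GPU

	var assigned uint64
	for _, a := range gpuAllocations {
		assigned += a
	}

	// With overflow tracked: total 9 GiB, ~78% offloaded.
	// If overflow were silently dropped: total 7 GiB, a misleading 100%.
	totalSize := assigned + overflow
	fmt.Printf("total=%d offloaded=%.0f%%\n",
		totalSize, 100*float64(assigned)/float64(totalSize))
}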