From 00fa70bb269e021731e6a858d14db6bd1dae142b Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 13 May 2025 13:04:20 -0700
Subject: [PATCH] llm: Consistently track unassigned model data

In some cases, if we fail to assign a piece of the model to a GPU then
we lose track of this data. Although it doesn't change the memory
allocation, it does affect the total size of the model reported by
tools such as ollama ps (and also the percent offloaded).

This makes it look like setting num_gpu isn't reflected in ollama ps.
It is, but the offload percentage may appear not to change. Spreading
the model across more GPUs will continue to affect the reported total
size of the model.
---
 llm/memory.go | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/llm/memory.go b/llm/memory.go
index 76082bf7c..a65096ae9 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -217,6 +217,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	if len(gpusWithSpace) > 0 {
 		gpuZeroID = gpusWithSpace[0].i
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	} else {
+		overflow += gpuZeroOverhead
 	}
 
 	// For all the layers, find where they can fit on the GPU(s)
@@ -257,15 +259,17 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 
 	// Determine if we need to consider output then find where it fits
-	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
-		for j := len(gpusWithSpace); j > 0; j-- {
-			g := gpusWithSpace[layerCount%j]
-			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
-				gpuAllocations[g.i] += memoryLayerOutput
-				layerCounts[g.i]++
-				layerCount++
-				break
+	if memoryLayerOutput > 0 {
+		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
+			for j := len(gpusWithSpace); j > 0; j-- {
+				g := gpusWithSpace[layerCount%j]
+				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+				if g.g.FreeMemory > overhead+used+memoryLayerOutput {
+					gpuAllocations[g.i] += memoryLayerOutput
+					layerCounts[g.i]++
+					layerCount++
+					break
+				}
 			}
 		}
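
To make the control flow in the second hunk easier to follow, here is a
minimal, self-contained Go sketch of the placement loop. The gpu struct,
freeMemory field, and the concrete sizes below are hypothetical stand-ins
for ollama's discover.GpuInfo plumbing, and the final overflow accounting
is an assumption based on the commit message, since the tail of the hunk
is cut off above.

package main

import "fmt"

// gpu is a hypothetical stand-in for the (index, discover.GpuInfo)
// pairs that gpusWithSpace holds in llm/memory.go.
type gpu struct {
	i          int    // index into gpuAllocations
	freeMemory uint64 // bytes still free on this GPU
}

func main() {
	gpusWithSpace := []gpu{{0, 6 << 30}, {1, 2 << 30}}
	gpuAllocations := []uint64{3 << 30, 1 << 30}

	var (
		memoryLayerOutput uint64 = 1 << 30   // size of the output layer
		graphOverhead     uint64 = 512 << 20 // stand-in for max(graphPartialOffload, graphFullOffload)
		layerCount               = 10        // repeating layers already placed
		overflow          uint64
	)

	placed := false
	// As in the patch: shrink the candidate set from the back and pick
	// the starting GPU with layerCount%j, so the output layer rotates
	// across GPUs rather than always landing on the first one.
	for j := len(gpusWithSpace); j > 0; j-- {
		g := gpusWithSpace[layerCount%j]
		used := gpuAllocations[g.i] + graphOverhead
		if g.freeMemory > used+memoryLayerOutput {
			gpuAllocations[g.i] += memoryLayerOutput
			layerCount++
			placed = true
			break
		}
	}

	// The point of the patch: data that fits nowhere still counts
	// toward the model's reported size (assumed from the commit
	// message, mirroring the gpuZeroOverhead hunk above).
	if !placed {
		overflow += memoryLayerOutput
	}

	fmt.Println("allocations:", gpuAllocations, "overflow:", overflow)
}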
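
And a companion sketch of why the lost accounting showed up in ollama ps:
if unassigned data never reaches overflow, the reported total shrinks and
the offload percentage looks better than it really is. The formula below
(assigned plus overflow as the reported total) is an assumption drawn from
the commit message, not a quote of the actual estimate code.

package main

import "fmt"

func main() {
	gpuAllocations := []uint64{4 << 30, 3 << 30} // bytes assigned per GPU
	var overflow uint64 = 2 << 30                // data that fit on no GPU

	var assigned uint64
	for _, a := range gpuAllocations {
		assigned += a
	}

	// With overflow tracked: total 9 GiB, ~78% offloaded.
	// If overflow were silently dropped: total 7 GiB, a misleading 100%.
	totalSize := assigned + overflow
	fmt.Printf("total=%d offloaded=%.0f%%\n",
		totalSize, 100*float64(assigned)/float64(totalSize))
}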