Compare commits

...

2 Commits

Jesse Gross
1089c2a074 llm: Estimate projector memory correctly for Ollama engine
The Llama engine always places vision projectors on the first GPU
if one exists. However, the Ollama engine groups the projector with the
output layer, which means the projector is only offloaded if all other
layers are offloaded. The memory estimation code always assumes the former
layout; this change makes it use the correct layout based on the engine.

This addresses two impacts of the current behavior:
 - In multi-GPU setups, we can crash with OOM errors when we try to
   allocate memory on a full GPU while another still has space.
 - If the vision projector is large, it may prevent us from offloading
   anything when we could have fit some of the text layers.
2025-05-13 16:44:27 -07:00
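
A rough sketch of the layout difference described in this commit message, in the spirit of the estimator but not its actual code; the names and sizes (projectorSize, outputLayerSize, the ollamaEngine flag) are hypothetical:

    package main

    import "fmt"

    // Sketch only: contrasts where projector memory is charged under the two
    // engines. The sizes and the boolean flag are made-up values, not the real
    // fields used by the memory estimator.
    func main() {
    	const (
    		projectorSize   uint64 = 1 << 30   // hypothetical 1 GiB vision projector
    		outputLayerSize uint64 = 512 << 20 // hypothetical 512 MiB output layer
    	)

    	for _, ollamaEngine := range []bool{false, true} {
    		var gpuZeroOverhead uint64   // charged to GPU 0 before any layers are placed
    		lastLayer := outputLayerSize // must fit wherever the output layer lands

    		if ollamaEngine {
    			// Ollama engine: the projector loads with the output layer, so it is
    			// only offloaded if every other layer is offloaded too.
    			lastLayer += projectorSize
    		} else {
    			// Llama engine: the projector always sits on the first GPU.
    			gpuZeroOverhead += projectorSize
    		}

    		fmt.Printf("ollamaEngine=%v gpuZeroOverhead=%d lastLayer=%d\n",
    			ollamaEngine, gpuZeroOverhead, lastLayer)
    	}
    }

The practical consequence is the one the message calls out: under the Ollama engine layout, a large projector inflates the cost of placing the final layer instead of being charged up front to GPU 0.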
Jesse Gross
00fa70bb26 llm: Consistently track unassigned model data
In some cases, if we fail to assign a piece of the model to a GPU,
we lose track of that data. Although this doesn't change the memory
allocation, it does affect the total model size reported by tools
such as ollama ps (and also the percent offloaded).

This can make it look like setting num_gpu isn't reflected in ollama ps.
That isn't true, but the offload percentage may appear not to change.

Spreading the model across more GPUs will continue to impact the
reported total size of the model.
2025-05-13 16:44:26 -07:00
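
A minimal sketch of the bookkeeping this commit describes, assuming a single GPU with a made-up amount of free memory: whatever cannot be placed is still added to an overflow counter, so the reported total size (and therefore the percent offloaded) stays consistent. Names and numbers are illustrative, not the actual memory.go values.

    package main

    import "fmt"

    // Sketch only: layers that do not fit on the GPU are still counted via
    // "overflow", so the total reported by tools like ollama ps is stable.
    func main() {
    	const (
    		layerSize uint64 = 300 << 20 // hypothetical 300 MiB per layer
    		numLayers        = 10
    		gpuFree   uint64 = 2 << 30 // hypothetical 2 GiB of free VRAM
    	)

    	var offloaded, overflow uint64
    	for i := 0; i < numLayers; i++ {
    		if offloaded+layerSize <= gpuFree {
    			offloaded += layerSize // placed on the GPU
    		} else {
    			overflow += layerSize // not placed, but still tracked
    		}
    	}

    	total := offloaded + overflow
    	fmt.Printf("total=%d offloaded=%d (%.0f%%)\n",
    		total, offloaded, 100*float64(offloaded)/float64(total))
    }

With these made-up numbers, 6 of the 10 layers fit and the total stays at 10 layers' worth of data regardless of placement, which is the property the commit restores in the real estimator.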

View File

@@ -85,8 +85,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var graphOffload uint64

 	// Projectors loaded into GPU0 only
-	var projectorWeights uint64
-	var projectorGraph uint64
+	var llamaEngineProjectorWeights uint64
+	var llamaEngineProjectorGraph uint64
+
+	// Projectors loaded with output layer
+	var ollamaEngineProjectorWeights uint64
+	var ollamaEngineProjectorGraph uint64

 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
@@ -112,14 +116,15 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	for _, projector := range projectors {
 		weight, graph := projectorMemoryRequirements(projector)
-		projectorWeights += weight
-		projectorGraph += graph
+		llamaEngineProjectorWeights += weight
+		llamaEngineProjectorGraph += graph

 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
-	if projectorWeights == 0 && projectorGraph == 0 {
-		projectorWeights, projectorGraph = f.VisionGraphSize()
+	if llamaEngineProjectorWeights == 0 && llamaEngineProjectorGraph == 0 {
+		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
+		opts.NumCtx = max(opts.NumCtx, 2048)
 	}

 	layers := f.Tensors().GroupLayers()
@@ -164,6 +169,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		graphFullOffload = graphPartialOffload
 	}

+	// Output layer handled at the end if we have space
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.Size()
 	}
@@ -173,8 +179,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput += layer.Size()
 	}

-	// Output layer handled at the end if we have space
-	gpuZeroOverhead := projectorWeights + projectorGraph
+	gpuZeroOverhead := llamaEngineProjectorWeights + llamaEngineProjectorGraph

 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -217,6 +222,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	if len(gpusWithSpace) > 0 {
 		gpuZeroID = gpusWithSpace[0].i
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	} else {
+		overflow += gpuZeroOverhead
 	}

 	// For all the layers, find where they can fit on the GPU(s)
@@ -257,21 +264,24 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	// Determine if we need to consider output then find where it fits
-	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
-		for j := len(gpusWithSpace); j > 0; j-- {
-			g := gpusWithSpace[layerCount%j]
-			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
-				gpuAllocations[g.i] += memoryLayerOutput
-				layerCounts[g.i]++
-				layerCount++
-				break
+	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
+	if memoryLastLayer > 0 {
+		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
+			for j := len(gpusWithSpace); j > 0; j-- {
+				g := gpusWithSpace[layerCount%j]
+				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+				if g.g.FreeMemory > overhead+used+memoryLastLayer {
+					gpuAllocations[g.i] += memoryLastLayer
+					layerCounts[g.i]++
+					layerCount++
+					break
+				}
 			}
 		}

 		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
-			overflow += memoryLayerOutput
+			overflow += memoryLastLayer
 		}
 	}
@@ -329,8 +339,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput:   memoryLayerOutput,
 		graphFullOffload:    graphFullOffload,
 		graphPartialOffload: graphPartialOffload,
-		projectorWeights:    projectorWeights,
-		projectorGraph:      projectorGraph,
+		projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
+		projectorGraph:      llamaEngineProjectorGraph + ollamaEngineProjectorGraph,
 	}

 	if gpus[0].Library == "cpu" {