llm: Estimate projector memory correctly for Ollama engine
The Llama engine always places vision projectors on the first GPU if one exists. The Ollama engine, however, groups the projector with the output layer, which means the projector is only offloaded if all other layers are offloaded. The memory estimation code always assumes the former layout; this change makes it use the correct layout based on the engine.

This addresses two impacts of the current behavior:
- In multi-GPU setups, we can crash with OOM errors when we try to allocate memory on a full GPU while another still has space.
- If the vision projector is large, it may prevent us from offloading anything when we could have fit some of the text layers.
parent 00fa70bb26
commit 1089c2a074
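To make the distinction concrete, the following is a minimal, self-contained Go sketch of the attribution idea described in the commit message; it is illustrative only and not the actual EstimateGPULayers code. The engine type, the estimate struct, the attributeProjector helper, and the sizes in main are all hypothetical names chosen for the example: with the llama engine the projector memory is charged to GPU 0 up front, while with the Ollama engine it is counted as part of the final output-layer allocation.

package main

import "fmt"

// engine distinguishes the two runtimes whose projector placement differs.
type engine int

const (
	llamaEngine engine = iota
	ollamaEngine
)

// estimate holds the two buckets the estimator cares about in this sketch.
type estimate struct {
	gpuZeroOverhead uint64 // memory charged against GPU 0 before placing layers
	lastLayerSize   uint64 // output layer plus anything grouped with it
}

// attributeProjector is a hypothetical helper: it adds projector weight and
// graph memory to whichever bucket matches where the engine actually loads
// the projector.
func attributeProjector(e engine, outputLayer, projWeights, projGraph uint64) estimate {
	est := estimate{lastLayerSize: outputLayer}
	switch e {
	case llamaEngine:
		// llama engine: the projector always lands on the first GPU.
		est.gpuZeroOverhead += projWeights + projGraph
	case ollamaEngine:
		// Ollama engine: the projector is only offloaded together with the
		// output layer, so count it as part of that last allocation.
		est.lastLayerSize += projWeights + projGraph
	}
	return est
}

func main() {
	// Example sizes in bytes (300 MiB output layer, 800 MiB projector weights,
	// 50 MiB projector graph); purely illustrative.
	const outputLayer, projWeights, projGraph = 300 << 20, 800 << 20, 50 << 20
	fmt.Printf("llama engine:  %+v\n", attributeProjector(llamaEngine, outputLayer, projWeights, projGraph))
	fmt.Printf("ollama engine: %+v\n", attributeProjector(ollamaEngine, outputLayer, projWeights, projGraph))
}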
@@ -85,8 +85,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     var graphOffload uint64
 
     // Projectors loaded into GPU0 only
-    var projectorWeights uint64
-    var projectorGraph uint64
+    var llamaEngineProjectorWeights uint64
+    var llamaEngineProjectorGraph uint64
+
+    // Projectors loaded with output layer
+    var ollamaEngineProjectorWeights uint64
+    var ollamaEngineProjectorGraph uint64
 
     // Conditional output size on GPU 0
     var memoryLayerOutput uint64
@@ -112,14 +116,15 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     for _, projector := range projectors {
         weight, graph := projectorMemoryRequirements(projector)
-        projectorWeights += weight
-        projectorGraph += graph
+        llamaEngineProjectorWeights += weight
+        llamaEngineProjectorGraph += graph
 
         // multimodal models require at least 2048 context
         opts.NumCtx = max(opts.NumCtx, 2048)
     }
 
-    if projectorWeights == 0 && projectorGraph == 0 {
-        projectorWeights, projectorGraph = f.VisionGraphSize()
+    if llamaEngineProjectorWeights == 0 && llamaEngineProjectorGraph == 0 {
+        ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
+        opts.NumCtx = max(opts.NumCtx, 2048)
     }
 
     layers := f.Tensors().GroupLayers()
@@ -164,6 +169,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
         graphFullOffload = graphPartialOffload
     }
 
+    // Output layer handled at the end if we have space
     if layer, ok := layers["output_norm"]; ok {
         memoryLayerOutput += layer.Size()
     }
@@ -173,8 +179,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
         memoryLayerOutput += layer.Size()
     }
 
-    // Output layer handled at the end if we have space
-    gpuZeroOverhead := projectorWeights + projectorGraph
+    gpuZeroOverhead := llamaEngineProjectorWeights + llamaEngineProjectorGraph
 
     // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
     var layerCount int
@@ -259,13 +264,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     }
 
     // Determine if we need to consider output then find where it fits
-    if memoryLayerOutput > 0 {
+    memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
+    if memoryLastLayer > 0 {
         if opts.NumGPU < 0 || layerCount < opts.NumGPU {
             for j := len(gpusWithSpace); j > 0; j-- {
                 g := gpusWithSpace[layerCount%j]
                 used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-                if g.g.FreeMemory > overhead+used+memoryLayerOutput {
-                    gpuAllocations[g.i] += memoryLayerOutput
+                if g.g.FreeMemory > overhead+used+memoryLastLayer {
+                    gpuAllocations[g.i] += memoryLastLayer
                     layerCounts[g.i]++
                     layerCount++
                     break
@@ -275,7 +281,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 
         if layerCount < int(f.KV().BlockCount())+1 {
             fullyLoaded = false
-            overflow += memoryLayerOutput
+            overflow += memoryLastLayer
         }
     }
 
@@ -333,8 +339,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
         memoryLayerOutput: memoryLayerOutput,
         graphFullOffload: graphFullOffload,
         graphPartialOffload: graphPartialOffload,
-        projectorWeights: projectorWeights,
-        projectorGraph: projectorGraph,
+        projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
+        projectorGraph: llamaEngineProjectorGraph + ollamaEngineProjectorGraph,
     }
 
     if gpus[0].Library == "cpu" {