diff --git a/llm/memory.go b/llm/memory.go
index a65096ae9..e78d46fd1 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -85,8 +85,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var graphOffload uint64
 
 	// Projectors loaded into GPU0 only
-	var projectorWeights uint64
-	var projectorGraph uint64
+	var llamaEngineProjectorWeights uint64
+	var llamaEngineProjectorGraph uint64
+
+	// Projectors loaded with output layer
+	var ollamaEngineProjectorWeights uint64
+	var ollamaEngineProjectorGraph uint64
 
 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
@@ -112,14 +116,15 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 
 	for _, projector := range projectors {
 		weight, graph := projectorMemoryRequirements(projector)
-		projectorWeights += weight
-		projectorGraph += graph
+		llamaEngineProjectorWeights += weight
+		llamaEngineProjectorGraph += graph
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
-	if projectorWeights == 0 && projectorGraph == 0 {
-		projectorWeights, projectorGraph = f.VisionGraphSize()
+	if llamaEngineProjectorWeights == 0 && llamaEngineProjectorGraph == 0 {
+		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
+		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
 	layers := f.Tensors().GroupLayers()
@@ -164,6 +169,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		graphFullOffload = graphPartialOffload
 	}
 
+	// Output layer handled at the end if we have space
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.Size()
 	}
@@ -173,8 +179,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput += layer.Size()
 	}
 
-	// Output layer handled at the end if we have space
-	gpuZeroOverhead := projectorWeights + projectorGraph
+	gpuZeroOverhead := llamaEngineProjectorWeights + llamaEngineProjectorGraph
 
 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -259,13 +264,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 
 	// Determine if we need to consider output then find where it fits
-	if memoryLayerOutput > 0 {
+	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
+	if memoryLastLayer > 0 {
 		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
 			for j := len(gpusWithSpace); j > 0; j-- {
 				g := gpusWithSpace[layerCount%j]
 				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-				if g.g.FreeMemory > overhead+used+memoryLayerOutput {
-					gpuAllocations[g.i] += memoryLayerOutput
+				if g.g.FreeMemory > overhead+used+memoryLastLayer {
+					gpuAllocations[g.i] += memoryLastLayer
 					layerCounts[g.i]++
 					layerCount++
 					break
@@ -275,7 +281,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 
 		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
-			overflow += memoryLayerOutput
+			overflow += memoryLastLayer
 		}
 	}
 
@@ -333,8 +339,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput:   memoryLayerOutput,
 		graphFullOffload:    graphFullOffload,
 		graphPartialOffload: graphPartialOffload,
-		projectorWeights:    projectorWeights,
-		projectorGraph:      projectorGraph,
+		projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
+		projectorGraph:      llamaEngineProjectorGraph + ollamaEngineProjectorGraph,
 	}
 
 	if gpus[0].Library == "cpu" {