Compare commits
2 Commits
main
...
jessegross
Author | SHA1 | Date | |
---|---|---|---|
![]() |
1089c2a074 | ||
![]() |
00fa70bb26 |
@ -85,8 +85,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
var graphOffload uint64
|
var graphOffload uint64
|
||||||
|
|
||||||
// Projectors loaded into GPU0 only
|
// Projectors loaded into GPU0 only
|
||||||
var projectorWeights uint64
|
var llamaEngineProjectorWeights uint64
|
||||||
var projectorGraph uint64
|
var llamaEngineProjectorGraph uint64
|
||||||
|
|
||||||
|
// Projectors loaded with output layer
|
||||||
|
var ollamaEngineProjectorWeights uint64
|
||||||
|
var ollamaEngineProjectorGraph uint64
|
||||||
|
|
||||||
// Conditional output size on GPU 0
|
// Conditional output size on GPU 0
|
||||||
var memoryLayerOutput uint64
|
var memoryLayerOutput uint64
|
||||||
@ -112,14 +116,15 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
|
|
||||||
for _, projector := range projectors {
|
for _, projector := range projectors {
|
||||||
weight, graph := projectorMemoryRequirements(projector)
|
weight, graph := projectorMemoryRequirements(projector)
|
||||||
projectorWeights += weight
|
llamaEngineProjectorWeights += weight
|
||||||
projectorGraph += graph
|
llamaEngineProjectorGraph += graph
|
||||||
|
|
||||||
// multimodal models require at least 2048 context
|
// multimodal models require at least 2048 context
|
||||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||||
}
|
}
|
||||||
if projectorWeights == 0 && projectorGraph == 0 {
|
if llamaEngineProjectorWeights == 0 && llamaEngineProjectorGraph == 0 {
|
||||||
projectorWeights, projectorGraph = f.VisionGraphSize()
|
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
|
||||||
|
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||||
}
|
}
|
||||||
|
|
||||||
layers := f.Tensors().GroupLayers()
|
layers := f.Tensors().GroupLayers()
|
||||||
@ -164,6 +169,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
graphFullOffload = graphPartialOffload
|
graphFullOffload = graphPartialOffload
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Output layer handled at the end if we have space
|
||||||
if layer, ok := layers["output_norm"]; ok {
|
if layer, ok := layers["output_norm"]; ok {
|
||||||
memoryLayerOutput += layer.Size()
|
memoryLayerOutput += layer.Size()
|
||||||
}
|
}
|
||||||
@ -173,8 +179,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
memoryLayerOutput += layer.Size()
|
memoryLayerOutput += layer.Size()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output layer handled at the end if we have space
|
gpuZeroOverhead := llamaEngineProjectorWeights + llamaEngineProjectorGraph
|
||||||
gpuZeroOverhead := projectorWeights + projectorGraph
|
|
||||||
|
|
||||||
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
||||||
var layerCount int
|
var layerCount int
|
||||||
@ -217,6 +222,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
if len(gpusWithSpace) > 0 {
|
if len(gpusWithSpace) > 0 {
|
||||||
gpuZeroID = gpusWithSpace[0].i
|
gpuZeroID = gpusWithSpace[0].i
|
||||||
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
||||||
|
} else {
|
||||||
|
overflow += gpuZeroOverhead
|
||||||
}
|
}
|
||||||
|
|
||||||
// For all the layers, find where they can fit on the GPU(s)
|
// For all the layers, find where they can fit on the GPU(s)
|
||||||
@ -257,21 +264,24 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Determine if we need to consider output then find where it fits
|
// Determine if we need to consider output then find where it fits
|
||||||
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
|
memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
|
||||||
for j := len(gpusWithSpace); j > 0; j-- {
|
if memoryLastLayer > 0 {
|
||||||
g := gpusWithSpace[layerCount%j]
|
if opts.NumGPU < 0 || layerCount < opts.NumGPU {
|
||||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
for j := len(gpusWithSpace); j > 0; j-- {
|
||||||
if g.g.FreeMemory > overhead+used+memoryLayerOutput {
|
g := gpusWithSpace[layerCount%j]
|
||||||
gpuAllocations[g.i] += memoryLayerOutput
|
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||||
layerCounts[g.i]++
|
if g.g.FreeMemory > overhead+used+memoryLastLayer {
|
||||||
layerCount++
|
gpuAllocations[g.i] += memoryLastLayer
|
||||||
break
|
layerCounts[g.i]++
|
||||||
|
layerCount++
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if layerCount < int(f.KV().BlockCount())+1 {
|
if layerCount < int(f.KV().BlockCount())+1 {
|
||||||
fullyLoaded = false
|
fullyLoaded = false
|
||||||
overflow += memoryLayerOutput
|
overflow += memoryLastLayer
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -329,8 +339,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
memoryLayerOutput: memoryLayerOutput,
|
memoryLayerOutput: memoryLayerOutput,
|
||||||
graphFullOffload: graphFullOffload,
|
graphFullOffload: graphFullOffload,
|
||||||
graphPartialOffload: graphPartialOffload,
|
graphPartialOffload: graphPartialOffload,
|
||||||
projectorWeights: projectorWeights,
|
projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
|
||||||
projectorGraph: projectorGraph,
|
projectorGraph: llamaEngineProjectorGraph + ollamaEngineProjectorGraph,
|
||||||
}
|
}
|
||||||
|
|
||||||
if gpus[0].Library == "cpu" {
|
if gpus[0].Library == "cpu" {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user