Fixed over-allocation of VRAM due to small initial layer sizes.

Co-authored-by: Tej Kiran <kiran.tej@amd.com>
Co-authored-by: Michael Yang <mxyng@pm.me>
Co-authored-by: Tej Kiran <itej89@gmail.com>
This commit is contained in:
tej 2025-05-13 18:42:39 -05:00 committed by GitHub
parent 8cc33f4c2b
commit 0478d440f0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,9 +1,12 @@
package llm package llm
import ( import (
"cmp"
"fmt" "fmt"
"log/slog" "log/slog"
"maps"
"os" "os"
"slices"
"strconv" "strconv"
"strings" "strings"
@ -120,12 +123,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
} }
layers := f.Tensors().GroupLayers() layers := f.Tensors().GroupLayers()
// add one layer worth of memory as a buffer // add one layer (chosing the max layer) worth of memory as a buffer
if blk0, ok := layers["blk.0"]; ok { layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
layerSize = blk0.Size() return cmp.Compare(a.Size(), b.Size())
} else { }).Size()
slog.Warn("model missing blk.0 layer size")
}
var kvct string var kvct string
if envconfig.FlashAttention() && if envconfig.FlashAttention() &&
@ -219,7 +220,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
} }
// For all the layers, find where they can fit on the GPU(s) // For all the layers, find where they can fit on the GPU(s)
for i := range int(f.KV().BlockCount()) { for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
// Some models have inconsistent layer sizes // Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size() layerSize = blk.Size()
@ -229,6 +230,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU // Stop allocating on GPU(s) once we hit the users target NumGPU
overflow += layerSize
continue continue
} }
@ -245,13 +247,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
} }
} }
if len(gpusWithSpace) == 0 {
overflow += layerSize
}
} }
if layerCount >= int(f.KV().BlockCount()) { if layerCount >= int(f.KV().BlockCount()) {
fullyLoaded = true fullyLoaded = true
} else {
for i := layerCount; i < int(f.KV().BlockCount()); i++ {
overflow += layerSize
}
} }
// Determine if we need to consider output then find where it fits // Determine if we need to consider output then find where it fits