From 0478d440f0ba62202bc4b98043ae4a7d0b85e4ba Mon Sep 17 00:00:00 2001 From: tej <37236721+itej89@users.noreply.github.com> Date: Tue, 13 May 2025 18:42:39 -0500 Subject: [PATCH] Fixed VRAM over-allocation due to small initial layer sizes. Co-authored-by: Tej Kiran Co-authored-by: Michael Yang Co-authored-by: Tej Kiran --- llm/memory.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index e05327f79..76082bf7c 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -1,9 +1,12 @@ package llm import ( + "cmp" "fmt" "log/slog" + "maps" "os" + "slices" "strconv" "strings" @@ -120,12 +123,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } layers := f.Tensors().GroupLayers() - // add one layer worth of memory as a buffer - if blk0, ok := layers["blk.0"]; ok { - layerSize = blk0.Size() - } else { - slog.Warn("model missing blk.0 layer size") - } + // add one layer (choosing the max layer) worth of memory as a buffer + layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int { + return cmp.Compare(a.Size(), b.Size()) + }).Size() var kvct string if envconfig.FlashAttention() && @@ -219,7 +220,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // For all the layers, find where they can fit on the GPU(s) - for i := range int(f.KV().BlockCount()) { + for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- { // Some models have inconsistent layer sizes if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { layerSize = blk.Size() @@ -229,6 +230,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { // Stop allocating on GPU(s) once we hit the users target NumGPU + overflow += layerSize continue } @@ -245,13 +247,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin gpusWithSpace = 
append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) } } + + if len(gpusWithSpace) == 0 { + overflow += layerSize + } } if layerCount >= int(f.KV().BlockCount()) { fullyLoaded = true - } else { - for i := layerCount; i < int(f.KV().BlockCount()); i++ { - overflow += layerSize - } } // Determine if we need to consider output then find where it fits