From 0478d440f0ba62202bc4b98043ae4a7d0b85e4ba Mon Sep 17 00:00:00 2001 From: tej <37236721+itej89@users.noreply.github.com> Date: Tue, 13 May 2025 18:42:39 -0500 Subject: [PATCH] Fixed VRAM over-allocation due to small initial layer sizes. Co-authored-by: Tej Kiran Co-authored-by: Michael Yang Co-authored-by: Tej Kiran --- llm/memory.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index e05327f79..76082bf7c 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -1,9 +1,12 @@ package llm import ( + "cmp" "fmt" "log/slog" + "maps" "os" + "slices" "strconv" "strings" @@ -120,12 +123,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } layers := f.Tensors().GroupLayers() - // add one layer worth of memory as a buffer - if blk0, ok := layers["blk.0"]; ok { - layerSize = blk0.Size() - } else { - slog.Warn("model missing blk.0 layer size") - } + // add one layer (choosing the max layer) worth of memory as a buffer + layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int { + return cmp.Compare(a.Size(), b.Size()) + }).Size() var kvct string if envconfig.FlashAttention() && @@ -219,7 +220,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // For all the layers, find where they can fit on the GPU(s) - for i := range int(f.KV().BlockCount()) { + for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- { // Some models have inconsistent layer sizes if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { layerSize = blk.Size() @@ -229,6 +230,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { // Stop allocating on GPU(s) once we hit the users target NumGPU + overflow += layerSize continue } @@ -245,13 +247,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin gpusWithSpace = 
append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) } } + + if len(gpusWithSpace) == 0 { + overflow += layerSize + } } if layerCount >= int(f.KV().BlockCount()) { fullyLoaded = true - } else { - for i := layerCount; i < int(f.KV().BlockCount()); i++ { - overflow += layerSize - } } // Determine if we need to consider output then find where it fits