From 2789ed31a7fb3930dd47d0e1aa5aa50fc0f044f2 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Thu, 18 Jan 2024 00:53:17 -0500
Subject: [PATCH] improve scratch buffer estimates

---
 gpu/gpu.go |  8 +-------
 llm/llm.go | 14 +++++++-------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index 451b7557a..bd39e2654 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -190,13 +190,7 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead
-		overhead := gpuInfo.FreeMemory / 10
-		gpus := uint64(gpuInfo.DeviceCount)
-		if overhead < gpus*512*1024*1024 {
-			overhead = gpus * 512 * 1024 * 1024
-		}
-		return int64(gpuInfo.FreeMemory - overhead), nil
+		return int64(gpuInfo.FreeMemory), nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
diff --git a/llm/llm.go b/llm/llm.go
index 8b043f380..a51ca7d8a 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -50,10 +50,10 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
 	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
 
-	// this amount is the overhead + tensors in memory
-	// TODO: get this from the llama.cpp's graph calculations instead of
-	// estimating it's 1/6 * kv_cache_size * num_gqa
-	graph := int64(ggml.NumGQA()) * kv / 6
+	// rough estimation for scratch space based on context size, batch size and number of layers in the model
+	// TODO: instead call llama.cpp's alloc functions to measure required memory
+	// TODO: account for quantization levels
+	scratch := 8*int64(opts.NumCtx)*int64(opts.NumBatch)*int64(ggml.NumLayers()) + 1536*1024*1024 // 1536MiB overhead
 
 	info := gpu.GetGPUInfo()
 	switch runtime.GOOS {
@@ -62,7 +62,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		if size+kv+graph > vram {
+		if size+kv+scratch > vram {
 			slog.Info("not enough vram available, falling back to CPU only")
 			info.Library = "cpu"
 			info.Variant = gpu.GetCPUVariant()
@@ -99,13 +99,13 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		maxlayers := int64(ggml.NumLayers()) + 1
 		devices := int64(info.DeviceCount)
 		avg := vram / devices
-		layers := maxlayers * (avg - graph) / (kv + size/devices)
+		layers := maxlayers * (avg - scratch) / (kv + size/devices)
 		if layers > maxlayers {
 			layers = maxlayers
 		}
 
 		// 1 + 2 must fit on the main gpu
-		min := graph + kv*layers/maxlayers
+		min := scratch + kv*layers/maxlayers
 		if layers <= 0 || min > avg {
 			slog.Info("not enough vram available, falling back to CPU only")
 			info.Library = "cpu"
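
As a sanity check on the new estimate, here is a minimal standalone Go sketch of the arithmetic above. The inputs are assumptions, not measurements: Llama-2-7B-like hyperparameters (n_ctx=2048, n_batch=512, 32 layers, 4096-wide embeddings, MHA so n_head_kv == n_head), an assumed ~3.7GiB of 4-bit weights, and a hypothetical single 6GiB GPU. The variable names mirror the patch, but nothing below calls into ollama or llama.cpp:

package main

import "fmt"

func main() {
	// Assumed model and runtime parameters; names mirror the opts.* and
	// ggml.* accessors from the patch, but the values here are hypothetical.
	var (
		numCtx    int64 = 2048 // opts.NumCtx
		numBatch  int64 = 512  // opts.NumBatch
		numLayers int64 = 32   // ggml.NumLayers()
		numEmbed  int64 = 4096 // ggml.NumEmbed()
		numHead   int64 = 32   // ggml.NumHead()
		numHeadKv int64 = 32   // ggml.NumHeadKv(); 7B uses MHA, so equal to numHead
	)
	var size int64 = 3826 * 1024 * 1024     // assumed ~3.7GiB of 4-bit weights
	var vram int64 = 6 * 1024 * 1024 * 1024 // hypothetical single 6GiB GPU

	// fp16 K and V caches: 2 bytes per element, 2 tensors (key and value)
	kv := 2 * 2 * numCtx * numLayers * numEmbed * numHeadKv / numHead

	// old estimate removed by this patch, assuming NumGQA() == numHead/numHeadKv
	graph := (numHead / numHeadKv) * kv / 6

	// new estimate: grows with context, batch size and depth, plus a flat
	// 1536MiB cushion for unaccounted overhead
	scratch := 8*numCtx*numBatch*numLayers + 1536*1024*1024

	fmt.Printf("kv cache:    %4d MiB\n", kv/1024/1024)      // 1024 MiB
	fmt.Printf("old graph:   %4d MiB\n", graph/1024/1024)   //  170 MiB
	fmt.Printf("new scratch: %4d MiB\n", scratch/1024/1024) // 1792 MiB

	// layer split as in the patch, specialized to one device (avg == vram);
	// maxlayers counts the output layer on top of the repeating layers
	maxlayers := numLayers + 1
	layers := maxlayers * (vram - scratch) / (kv + size)
	if layers > maxlayers {
		layers = maxlayers
	}
	fmt.Printf("offloaded layers: %d of %d\n", layers, maxlayers) // 29 of 33
}

With these inputs the removed graph term reserved only ~170MiB while the new scratch term reserves 1792MiB, and the 10%/512MiB headroom that CheckVRAM used to subtract per GPU is now folded into that single flat 1536MiB cushion, so borderline configurations offload fewer layers up front instead of over-committing VRAM.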