diff --git a/llm/memory.go b/llm/memory.go index c5d861b6a..384e2dc60 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -129,7 +129,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, var kvct string if fa { - requested := envconfig.KvCacheType() + requested := strings.ToLower(envconfig.KvCacheType()) if requested != "" && ggml.SupportsKVCacheType(requested) { kvct = requested } diff --git a/llm/server.go b/llm/server.go index 23caa9a0a..debdd35e8 100644 --- a/llm/server.go +++ b/llm/server.go @@ -225,7 +225,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter fa = false } - kvct := envconfig.KvCacheType() + kvct := strings.ToLower(envconfig.KvCacheType()) if fa { slog.Info("enabling flash attention")