From 34c3b68fc8a14eb5a93f6bdd175fa94e2e8fa12b Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Wed, 9 Apr 2025 11:21:57 -0700
Subject: [PATCH] ggml: Don't allocate CPU buffers as CUDA Host buffers

Allocating (and in particular, freeing) memory from CUDA host buffers is
expensive and can cause a significant performance hit if we do it for
every token. Using normal system memory avoids this issue and also gives
the OS more flexibility to manage it.

There is no performance impact from this patch directly (either positive
or negative), but it makes a difference once we start freeing memory
correctly.
---
 ml/backend/ggml/ggml.go | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 727a7b8e5..b4644d97e 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -384,12 +384,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 	for _, d := range append(gpus, append(accels, cpus...)...) {
 		b := C.ggml_backend_dev_init(d, nil)
 		bt := C.ggml_backend_get_default_buffer_type(b)
-		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
-			// use the first gpu host buffer type for gpu if possible
-			if hbt := C.ggml_backend_dev_host_buffer_type(gpus[0]); hbt != nil {
-				bt = hbt
-			}
-		}
 		deviceBufferTypes[d] = bt
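
Note (not part of the patch): after this change every device, including the CPU, keeps the default buffer type returned by its own backend; the CPU no longer borrows the first GPU's pinned host buffer type. A minimal sketch of the resulting loop, reconstructed from the diff context above (names such as gpus, accels, cpus, and deviceBufferTypes come from that context; the closing brace is assumed from the surrounding function):

	// Each device maps to its own backend's default buffer type,
	// so CPU tensors land in ordinary system memory rather than
	// CUDA host (pinned) memory.
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		bt := C.ggml_backend_get_default_buffer_type(b)
		deviceBufferTypes[d] = bt
	}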