From f50d691254e671e69975c4e54fc4d0469b538f10 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 8 Apr 2025 12:11:55 -0700
Subject: [PATCH] ggml: Fix memory leak on input tensors

For every forward pass through the model, we need to allocate input
tensors: tokens, images, positions, outputs, and masks. These get
allocated in system memory.

However, when we close the context that the tensors were allocated
through, the metadata gets freed but the actual backend memory does
not. This results in a significant memory leak.

This makes it so that all of the memory allocated through a context
gets freed when the context is closed.

Fixes #10040
---
 ml/backend/ggml/ggml.go | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index b4644d97e..94fc87a3d 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -447,6 +447,8 @@ func (b *Backend) NewContextSize(n int) ml.Context {
 		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
 	}
 
+	var allocatedBuffers []*C.struct_ggml_backend_buffer
+
 	return &Context{
 		b:             b,
 		maxGraphNodes: n,
@@ -454,6 +456,7 @@ func (b *Backend) NewContextSize(n int) ml.Context {
 			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
 			no_alloc: true,
 		}),
+		allocatedBuffers: &allocatedBuffers,
 	}
 }
 
@@ -474,6 +477,10 @@ type Context struct {
 	// buft is the buffer type used for new tensors
 	buft *C.struct_ggml_backend_buffer_type
 
+	// allocatedBuffers are buffers for tensors that we have allocated in this context
+	// so that we can free them when we close the context
+	allocatedBuffers *[]*C.struct_ggml_backend_buffer
+
 	// maxGraphNodes is the maximum allowed number of graph nodes in this context
 	maxGraphNodes int
 }
@@ -481,10 +488,11 @@ func (c *Context) Input() ml.Context {
 	if c.b.input != nil {
 		return &Context{
-			b:             c.b,
-			ctx:           c.ctx,
-			buft:          c.b.input,
-			maxGraphNodes: c.maxGraphNodes,
+			b:                c.b,
+			ctx:              c.ctx,
+			buft:             c.b.input,
+			allocatedBuffers: c.allocatedBuffers,
+			maxGraphNodes:    c.maxGraphNodes,
 		}
 	}
 
@@ -494,10 +502,11 @@ func (c *Context) Layer(i int) ml.Context {
 	if buft, ok := c.b.layers[i]; ok {
 		return &Context{
-			b:             c.b,
-			ctx:           c.ctx,
-			buft:          buft,
-			maxGraphNodes: c.maxGraphNodes,
+			b:                c.b,
+			ctx:              c.ctx,
+			buft:             buft,
+			allocatedBuffers: c.allocatedBuffers,
+			maxGraphNodes:    c.maxGraphNodes,
 		}
 	}
 
@@ -610,6 +619,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
 	if b == nil {
 		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
 	}
+	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
 
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
 	return &Tensor{b: c.b, t: t}, nil
@@ -688,6 +698,11 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
 
 func (c *Context) Close() {
 	if c != nil {
+		for _, b := range *c.allocatedBuffers {
+			C.ggml_backend_buffer_free(b)
+		}
+		*c.allocatedBuffers = nil
+
 		C.ggml_free(c.ctx)
 	}
 }
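
The core of the fix is the shared allocatedBuffers slice pointer: the
derived contexts returned by Input() and Layer() append to the same
list as their parent, so Close() can release every backend buffer
exactly once. Below is a minimal, standalone pure-Go sketch of that
ownership pattern; the buffer type, its name field, and freeBuffer are
hypothetical placeholders standing in for *C.struct_ggml_backend_buffer
and C.ggml_backend_buffer_free, which are not usable outside cgo.

package main

import "fmt"

// buffer and freeBuffer are hypothetical stand-ins for the cgo types
// and calls used in the real patch.
type buffer struct{ name string }

func freeBuffer(b *buffer) { fmt.Println("freed", b.name) }

// Context mirrors the shape of the patched type: derived contexts
// share one buffer list with their parent via a slice pointer.
type Context struct {
	allocatedBuffers *[]*buffer
}

func NewContext() *Context {
	var allocatedBuffers []*buffer
	return &Context{allocatedBuffers: &allocatedBuffers}
}

// Input returns a derived view that appends to the same buffer list,
// as Input() and Layer() do in the patch.
func (c *Context) Input() *Context {
	return &Context{allocatedBuffers: c.allocatedBuffers}
}

// newTensor records the backing buffer so Close can release it later.
func (c *Context) newTensor(name string) *buffer {
	b := &buffer{name: name}
	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
	return b
}

// Close frees every buffer allocated through this context or any of
// its derived views, mirroring the loop the patch adds before
// C.ggml_free.
func (c *Context) Close() {
	if c != nil {
		for _, b := range *c.allocatedBuffers {
			freeBuffer(b)
		}
		*c.allocatedBuffers = nil
	}
}

func main() {
	ctx := NewContext()
	ctx.newTensor("positions")
	ctx.Input().newTensor("tokens") // allocated via a derived view
	ctx.Close()                     // frees both buffers: no leak
}

Using a pointer to a slice, rather than copying the slice header into
each derived Context, is what lets the parent observe appends made
through the views; with a plain slice field, buffers allocated via
Input() or Layer() would be invisible to the parent's Close() and
would still leak.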