diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index b4644d97e..94fc87a3d 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -447,6 +447,8 @@ func (b *Backend) NewContextSize(n int) ml.Context { panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes)) } + var allocatedBuffers []*C.struct_ggml_backend_buffer + return &Context{ b: b, maxGraphNodes: n, @@ -454,6 +456,7 @@ func (b *Backend) NewContextSize(n int) ml.Context { mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false), no_alloc: true, }), + allocatedBuffers: &allocatedBuffers, } } @@ -474,6 +477,10 @@ type Context struct { // buft is the buffer type used for new tensors buft *C.struct_ggml_backend_buffer_type + // allocatedBuffers are buffers for tensors that we have allocated in this context + // so that we can free them when we close the context + allocatedBuffers *[]*C.struct_ggml_backend_buffer + // maxGraphNodes is the maximum allowed number of graph nodes in this context maxGraphNodes int } @@ -481,10 +488,11 @@ type Context struct { func (c *Context) Input() ml.Context { if c.b.input != nil { return &Context{ - b: c.b, - ctx: c.ctx, - buft: c.b.input, - maxGraphNodes: c.maxGraphNodes, + b: c.b, + ctx: c.ctx, + buft: c.b.input, + allocatedBuffers: c.allocatedBuffers, + maxGraphNodes: c.maxGraphNodes, } } @@ -494,10 +502,11 @@ func (c *Context) Input() ml.Context { func (c *Context) Layer(i int) ml.Context { if buft, ok := c.b.layers[i]; ok { return &Context{ - b: c.b, - ctx: c.ctx, - buft: buft, - maxGraphNodes: c.maxGraphNodes, + b: c.b, + ctx: c.ctx, + buft: buft, + allocatedBuffers: c.allocatedBuffers, + maxGraphNodes: c.maxGraphNodes, } } @@ -610,6 +619,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) { if b == nil { return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft))) } + *c.allocatedBuffers = append(*c.allocatedBuffers, b) C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b)) return &Tensor{b: c.b, t: t}, nil @@ -688,6 +698,11 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { func (c *Context) Close() { if c != nil { + for _, b := range *c.allocatedBuffers { + C.ggml_backend_buffer_free(b) + } + *c.allocatedBuffers = nil + C.ggml_free(c.ctx) } }