diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index c4adcd98f..af5dbf99e 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -44,13 +44,13 @@ type Backend struct { tensors map[string]*C.struct_ggml_tensor // input is the backend used for inputs - input *C.struct_ggml_backend + input *C.struct_ggml_backend_buffer_type // output is the backend used for outputs - output *C.struct_ggml_backend + output *C.struct_ggml_backend_buffer_type // layers is the backend used for repeating layers - layers map[int]*C.struct_ggml_backend + layers map[int]*C.struct_ggml_backend_buffer_type flashAttention bool @@ -83,7 +83,10 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { for _, d := range devices() { switch C.ggml_backend_dev_type(d) { case C.GGML_BACKEND_DEVICE_TYPE_CPU: - cpus = append(cpus, d) + if len(cpus) == 0 { + // only the first cpu device should be used + cpus = append(cpus, d) + } case C.GGML_BACKEND_DEVICE_TYPE_ACCEL: accels = append(accels, d) case C.GGML_BACKEND_DEVICE_TYPE_GPU: @@ -324,25 +327,25 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { return nil, err } - // map devices to backends so tensors created post initialization can be assigned to the correct device - deviceBackends := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend) + // map devices to backend buffer types so new tensors can be assigned to the correct device + deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type) // create backends and buffer types used for the compute graph scheduler var schedBackends []*C.struct_ggml_backend var schedBufts []*C.struct_ggml_backend_buffer_type for _, d := range append(gpus, append(accels, cpus...)...) { b := C.ggml_backend_dev_init(d, nil) - schedBackends = append(schedBackends, b) - deviceBackends[d] = b - bt := C.ggml_backend_get_default_buffer_type(b) - // use the first gpu host buffer type for gpu if possible if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 { - if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil { + // use the first gpu host buffer type for gpu if possible + if hbt := C.ggml_backend_dev_host_buffer_type(gpus[0]); hbt != nil { bt = hbt } } + deviceBufferTypes[d] = bt + + schedBackends = append(schedBackends, b) schedBufts = append(schedBufts, bt) slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt))) @@ -365,12 +368,12 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { C.size_t(maxGraphNodes), true, ), - input: deviceBackends[input.d], - output: deviceBackends[output.d], - layers: func() map[int]*C.struct_ggml_backend { - m := make(map[int]*C.struct_ggml_backend) + input: deviceBufferTypes[input.d], + output: deviceBufferTypes[output.d], + layers: func() map[int]*C.struct_ggml_backend_buffer_type { + m := make(map[int]*C.struct_ggml_backend_buffer_type) for i, layer := range layers { - m[i] = deviceBackends[layer.d] + m[i] = deviceBufferTypes[layer.d] } return m }(), @@ -401,13 +404,12 @@ func (b *Backend) NewContext() ml.Context { func (b *Backend) NewContextSize(n int) ml.Context { n = min(n, b.maxGraphNodes) return &Context{ - b: b, + b: b, + maxGraphNodes: n, ctx: C.ggml_init(C.struct_ggml_init_params{ mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false), no_alloc: true, }), - backend: C.ggml_backend_sched_get_backend(b.sched, 0), - maxGraphNodes: n, } } @@ -425,8 +427,8 @@ type Context struct { ctx *C.struct_ggml_context graph *C.struct_ggml_cgraph - // backend is the backend used for new tensors - backend *C.struct_ggml_backend + // buft is the buffer type used for new tensors + buft *C.struct_ggml_backend_buffer_type // maxGraphNodes is the maximum allowed number of graph nodes in this context maxGraphNodes int @@ -437,7 +439,7 @@ func (c Context) Input() ml.Context { return &Context{ b: c.b, ctx: c.ctx, - backend: c.b.input, + buft: c.b.input, maxGraphNodes: c.maxGraphNodes, } } @@ -450,7 +452,7 @@ func (c Context) Output() ml.Context { return &Context{ b: c.b, ctx: c.ctx, - backend: c.b.output, + buft: c.b.output, maxGraphNodes: c.maxGraphNodes, } } @@ -459,11 +461,11 @@ func (c Context) Output() ml.Context { } func (c Context) Layer(i int) ml.Context { - if backend, ok := c.b.layers[i]; ok { + if buft, ok := c.b.layers[i]; ok { return &Context{ b: c.b, ctx: c.ctx, - backend: backend, + buft: buft, maxGraphNodes: c.maxGraphNodes, } } @@ -516,6 +518,10 @@ func shapeToGGML(shape []int) *C.int64_t { } func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor { + if c.buft == nil { + panic("set Input, Output, or Layer before creating tensors") + } + var cdtype uint32 switch dtype { case ml.DTypeF32: @@ -542,7 +548,7 @@ func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor { } t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape)) - b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t)) + b := C.ggml_backend_buft_alloc_buffer(c.buft, C.ggml_nbytes(t)) C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b)) return &Tensor{b: c.b, t: t} }