diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index c4adcd98f..af5dbf99e 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -44,13 +44,13 @@ type Backend struct {
 	tensors map[string]*C.struct_ggml_tensor
 
 	// input is the backend used for inputs
-	input *C.struct_ggml_backend
+	input *C.struct_ggml_backend_buffer_type
 
 	// output is the backend used for outputs
-	output *C.struct_ggml_backend
+	output *C.struct_ggml_backend_buffer_type
 
 	// layers is the backend used for repeating layers
-	layers map[int]*C.struct_ggml_backend
+	layers map[int]*C.struct_ggml_backend_buffer_type
 
 	flashAttention bool
 
@@ -83,7 +83,10 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 	for _, d := range devices() {
 		switch C.ggml_backend_dev_type(d) {
 		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-			cpus = append(cpus, d)
+			if len(cpus) == 0 {
+				// only the first cpu device should be used
+				cpus = append(cpus, d)
+			}
 		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
 			accels = append(accels, d)
 		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
@@ -324,25 +327,25 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		return nil, err
 	}
 
-	// map devices to backends so tensors created post initialization can be assigned to the correct device
-	deviceBackends := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend)
+	// map devices to backend buffer types so new tensors can be assigned to the correct device
+	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
 
 	// create backends and buffer types used for the compute graph scheduler
 	var schedBackends []*C.struct_ggml_backend
 	var schedBufts []*C.struct_ggml_backend_buffer_type
 	for _, d := range append(gpus, append(accels, cpus...)...) {
 		b := C.ggml_backend_dev_init(d, nil)
-		schedBackends = append(schedBackends, b)
-		deviceBackends[d] = b
-
 		bt := C.ggml_backend_get_default_buffer_type(b)
-		// use the first gpu host buffer type for gpu if possible
 		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
-			if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil {
+			// use the first gpu host buffer type for gpu if possible
+			if hbt := C.ggml_backend_dev_host_buffer_type(gpus[0]); hbt != nil {
 				bt = hbt
 			}
 		}
 
+		deviceBufferTypes[d] = bt
+
+		schedBackends = append(schedBackends, b)
 		schedBufts = append(schedBufts, bt)
 
 		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
@@ -365,12 +368,12 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 			C.size_t(maxGraphNodes),
 			true,
 		),
-		input:  deviceBackends[input.d],
-		output: deviceBackends[output.d],
-		layers: func() map[int]*C.struct_ggml_backend {
-			m := make(map[int]*C.struct_ggml_backend)
+		input:  deviceBufferTypes[input.d],
+		output: deviceBufferTypes[output.d],
+		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
+			m := make(map[int]*C.struct_ggml_backend_buffer_type)
 			for i, layer := range layers {
-				m[i] = deviceBackends[layer.d]
+				m[i] = deviceBufferTypes[layer.d]
 			}
 			return m
 		}(),
@@ -401,13 +404,12 @@ func (b *Backend) NewContext() ml.Context {
 func (b *Backend) NewContextSize(n int) ml.Context {
 	n = min(n, b.maxGraphNodes)
 	return &Context{
-		b: b,
+		b:             b,
+		maxGraphNodes: n,
 		ctx: C.ggml_init(C.struct_ggml_init_params{
 			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
 			no_alloc: true,
 		}),
-		backend:       C.ggml_backend_sched_get_backend(b.sched, 0),
-		maxGraphNodes: n,
 	}
 }
 
@@ -425,8 +427,8 @@ type Context struct {
 	ctx   *C.struct_ggml_context
 	graph *C.struct_ggml_cgraph
 
-	// backend is the backend used for new tensors
-	backend *C.struct_ggml_backend
+	// buft is the buffer type used for new tensors
+	buft *C.struct_ggml_backend_buffer_type
 
 	// maxGraphNodes is the maximum allowed number of graph nodes in this context
 	maxGraphNodes int
@@ -437,7 +439,7 @@ func (c Context) Input() ml.Context {
 		return &Context{
 			b:             c.b,
 			ctx:           c.ctx,
-			backend:       c.b.input,
+			buft:          c.b.input,
 			maxGraphNodes: c.maxGraphNodes,
 		}
 	}
@@ -450,7 +452,7 @@ func (c Context) Output() ml.Context {
 		return &Context{
 			b:             c.b,
 			ctx:           c.ctx,
-			backend:       c.b.output,
+			buft:          c.b.output,
 			maxGraphNodes: c.maxGraphNodes,
 		}
 	}
@@ -459,11 +461,11 @@ func (c Context) Output() ml.Context {
 }
 
 func (c Context) Layer(i int) ml.Context {
-	if backend, ok := c.b.layers[i]; ok {
+	if buft, ok := c.b.layers[i]; ok {
 		return &Context{
 			b:             c.b,
 			ctx:           c.ctx,
-			backend:       backend,
+			buft:          buft,
 			maxGraphNodes: c.maxGraphNodes,
 		}
 	}
@@ -516,6 +518,10 @@ func shapeToGGML(shape []int) *C.int64_t {
 }
 
 func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
+	if c.buft == nil {
+		panic("set Input, Output, or Layer before creating tensors")
+	}
+
 	var cdtype uint32
 	switch dtype {
 	case ml.DTypeF32:
@@ -542,7 +548,7 @@ func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 	}
 
 	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
-	b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t))
+	b := C.ggml_backend_buft_alloc_buffer(c.buft, C.ggml_nbytes(t))
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
 	return &Tensor{b: c.b, t: t}
 }