diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index ae32e3c64..e909f53cf 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -25,11 +25,13 @@ import (
 	"github.com/ollama/ollama/format"
 	fs "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
+	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 	"golang.org/x/sync/errgroup"
 )
 
 func devices() iter.Seq[*C.struct_ggml_backend_device] {
 	return func(yield func(*C.struct_ggml_backend_device) bool) {
+		ggml.OnceLoad()
 		for i := range C.ggml_backend_dev_count() {
 			if !yield(C.ggml_backend_dev_get(i)) {
 				return
@@ -146,8 +148,15 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 
 	slog.Info("max tensors", "max_tensors", maxTensors)
 
+	type tensor struct {
+		source *fs.Tensor
+		target string
+	}
+
+	targets := make(map[string][]string)
+
 	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
-	createTensor := func(t *fs.Tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
+	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
 		for _, bt := range bts {
 			if _, ok := ctxs[bt]; !ok {
 				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
@@ -156,16 +165,23 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 				})
 			}
 
-			cname := C.CString(t.Name)
+			targets[t.source.Name] = append(targets[t.source.Name], t.target)
+
+			name := t.source.Name
+			if t.target != "" {
+				name = t.target
+			}
+
+			cname := C.CString(name)
 			defer C.free(unsafe.Pointer(cname))
 			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
 				return tt
 			}
 
-			tt := C.ggml_new_tensor(ctxs[bt], t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0])))
+			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
 			C.ggml_set_name(tt, cname)
 
-			slog.Debug("created tensor", "name", t.Name, "shape", t.Shape, "dtype", t.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
 			//nolint:staticcheck // TODO: check if buffer type supports this tensor
 			return tt
 		}
@@ -187,9 +203,9 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 	for _, t := range meta.Tensors().Items() {
 		switch {
 		case hasPart(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
-			createTensor(t, input.bts)
+			createTensor(tensor{source: t}, input.bts)
 		case hasPart(t.Name, "cls", "output", "output_norm"):
-			createTensor(t, output.bts)
+			createTensor(tensor{source: t}, output.bts)
 		default:
 			if i := func() int {
 				if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
@@ -200,10 +216,13 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 
 				return -1
 			}(); i >= 0 {
-				createTensor(t, layers[i].bts)
+				createTensor(tensor{source: t}, layers[i].bts)
 			} else {
-				for _, layer := range layers {
-					createTensor(t, layer.bts)
+				for i, layer := range layers {
+					createTensor(tensor{
+						source: t,
+						target: "blk." + strconv.Itoa(i) + "." + t.Name,
+					}, layer.bts)
 				}
 			}
 		}
@@ -237,28 +256,34 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 	sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
 	var g errgroup.Group
 	for _, t := range meta.Tensors().Items() {
-		g.Go(func() error {
-			tt, ok := tensors[t.Name]
-			if !ok {
-				return fmt.Errorf("unassigned tensor: %s", t.Name)
-			}
+		for _, target := range targets[t.Name] {
+			g.Go(func() error {
+				if target == "" {
+					target = t.Name
+				}
 
-			bts := make([]byte, t.Size())
-			n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
-			if err != nil {
-				return err
-			}
+				tt, ok := tensors[target]
+				if !ok {
+					return fmt.Errorf("unassigned tensor: %s", t.Name)
+				}
 
-			if n != len(bts) {
-				return errors.New("short read")
-			}
+				bts := make([]byte, t.Size())
+				n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
+				if err != nil {
+					return err
+				}
 
-			cname := C.CString(t.Name)
-			C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), 0, C.size_t(t.Size()))
-			C.free(unsafe.Pointer(cname))
+				if n != len(bts) {
+					return errors.New("short read")
+				}
 
-			return nil
-		})
+				cname := C.CString(t.Name)
+				C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), 0, C.size_t(t.Size()))
+				C.free(unsafe.Pointer(cname))
+
+				return nil
+			})
+		}
 	}
 
 	if g.Wait() != nil {
diff --git a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
index 00cfc968c..799af5f3a 100644
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -207,7 +207,13 @@ struct ggml_backend_registry {
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i), score);
         }
+    }
 
+    void register_device(ggml_backend_dev_t device, int score = -1) {
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+#endif
+        devices.push_back({device, score});
         std::stable_sort(devices.begin(), devices.end(),
             [](const auto & a, const auto & b) {
                 return a.second > b.second;
@@ -215,21 +221,6 @@ struct ggml_backend_registry {
         );
     }
 
-    void register_device(ggml_backend_dev_t device, int score = -1) {
-        switch (ggml_backend_dev_type(device)) {
-            case GGML_BACKEND_DEVICE_TYPE_CPU:
-            case GGML_BACKEND_DEVICE_TYPE_GPU:
-                score += 1 << 16;
-            case GGML_BACKEND_DEVICE_TYPE_ACCEL:
-                score += 1 << 20;
-        }
-
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
-#endif
-        devices.push_back({device, score});
-    }
-
     ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
        if (!handle) {
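Illustrative sketch (not code from this patch): the ggml.go hunks above give createTensor a source/target pair so that a tensor whose name carries no block index is replicated across every layer under "blk.<i>." + name, while the targets map records which backend tensor names each source tensor must be copied into when the weights are read. The self-contained Go program below mimics only that naming logic; the tensor names, the numLayers constant, and the layerIndex helper are invented for the example.

// Standalone sketch of the fan-out naming used in the patch above.
// Hypothetical names; this is not the ollama API.
package main

import (
	"fmt"
	"strconv"
	"strings"
	"unicode"
)

// layerIndex mirrors the FieldsFunc trick in the diff: the first run of
// digits in the tensor name is taken as the block index, or -1 if none.
func layerIndex(name string) int {
	if fields := strings.FieldsFunc(name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
		if i, err := strconv.Atoi(fields[0]); err == nil {
			return i
		}
	}
	return -1
}

func main() {
	const numLayers = 4 // assumed layer count for the example

	// source name -> target tensor names ("" means "load under its own name"),
	// playing the role of the targets map in the diff.
	targets := make(map[string][]string)

	for _, name := range []string{"blk.2.attn_q.weight", "shared.weight"} {
		if i := layerIndex(name); i >= 0 {
			// Indexed tensor: created once, on its own layer, under its own name.
			targets[name] = append(targets[name], "")
			fmt.Printf("%s stays on layer %d\n", name, i)
		} else {
			// Unindexed tensor: fanned out to every layer as "blk.<i>.<name>".
			for i := 0; i < numLayers; i++ {
				target := "blk." + strconv.Itoa(i) + "." + name
				targets[name] = append(targets[name], target)
			}
		}
	}

	// The loading loop in the diff reads each source tensor once per entry here
	// and writes the same bytes into every identically named backend tensor.
	fmt.Println(targets["shared.weight"])
}

In the patch itself the empty-string entry is resolved back to the source name at load time (if target == "" { target = t.Name }), so indexed and fanned-out tensors go through the same loading loop.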