diff --git a/llama/example/main.go b/llama/example/main.go
index d8de89c6a..da5be0d31 100644
--- a/llama/example/main.go
+++ b/llama/example/main.go
@@ -29,9 +29,14 @@ func main() {
 
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams(999, 0, func(p float32) {
-		fmt.Printf("loading... %f\n", p)
-	})
+	params := llama.ModelParams{
+		NumGpuLayers: 999,
+		MainGpu:      0,
+		UseMmap:      true,
+		Progress: func(p float32) {
+			fmt.Printf("loading... %f\n", p)
+		},
+	}
 	model := llama.LoadModelFromFile(*mpath, params)
 
 	ctxParams := llama.NewContextParams(2048, runtime.NumCPU(), false)
diff --git a/llama/llama.go b/llama/llama.go
index 704d9e8f1..0ffc2e351 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -78,33 +78,6 @@ func NewContextParams(numCtx int, threads int, flashAttention bool) ContextParam
 	return ContextParams{c: params}
 }
 
-type ModelParams struct {
-	c C.struct_llama_model_params
-}
-
-//export llamaProgressCallback
-func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
-	handle := cgo.Handle(userData)
-	callback := handle.Value().(func(float32))
-	callback(float32(progress))
-	return true
-}
-
-func NewModelParams(numGpuLayers int, mainGpu int, callback func(float32)) ModelParams {
-	params := C.llama_model_default_params()
-	params.n_gpu_layers = C.int(numGpuLayers)
-	params.main_gpu = C.int32_t(mainGpu)
-
-	handle := cgo.NewHandle(callback)
-	params.progress_callback = C.llama_progress_callback(C.llamaProgressCallback)
-	params.progress_callback_user_data = unsafe.Pointer(handle)
-	runtime.SetFinalizer(&params, func(p *C.struct_llama_model_params) {
-		handle.Delete()
-	})
-
-	return ModelParams{c: params}
-}
-
 type Context struct {
 	c *C.struct_llama_context
 }
@@ -179,8 +152,49 @@ func (c *Context) GetEmbeddingsIth(i int) []float32 {
 	return unsafe.Slice((*float32)(unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))), c.Model().NEmbd())
 }
 
+type ModelParams struct {
+	NumGpuLayers int
+	MainGpu      int
+	UseMmap      bool
+	UseMlock     bool
+	TensorSplit  []float32
+	Progress     func(float32)
+}
+
+//export llamaProgressCallback
+func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
+	handle := cgo.Handle(userData)
+	callback := handle.Value().(func(float32))
+	callback(float32(progress))
+	return true
+}
+
 func LoadModelFromFile(modelPath string, params ModelParams) *Model {
-	return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), params.c)}
+	cparams := C.llama_model_default_params()
+	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
+	cparams.main_gpu = C.int32_t(params.MainGpu)
+	cparams.use_mmap = C.bool(params.UseMmap)
+	cparams.use_mlock = C.bool(params.UseMlock)
+
+	if len(params.TensorSplit) > 0 {
+		tensorSplitData := &params.TensorSplit[0]
+
+		var tensorSplitPin runtime.Pinner
+		tensorSplitPin.Pin(tensorSplitData)
+		defer tensorSplitPin.Unpin()
+
+		cparams.tensor_split = (*C.float)(unsafe.Pointer(tensorSplitData))
+	}
+
+	if params.Progress != nil {
+		handle := cgo.NewHandle(params.Progress)
+		defer handle.Delete()
+
+		cparams.progress_callback = C.llama_progress_callback(C.llamaProgressCallback)
+		cparams.progress_callback_user_data = unsafe.Pointer(handle)
+	}
+
+	return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
 }
 
 func NewContextWithModel(model *Model, params ContextParams) *Context {
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 77f28ce83..e16fa1646 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -12,6 +12,7 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
@@ -599,16 +600,16 @@ func main() {
 	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-
-	// TODO not yet implemented but wired to keep the parsing aligned
-	embedding := flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	logDisable := flag.Bool("log-disable", false, "disables logging to a file")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
-	f32 := flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
 	noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	// These are either ignored by llama.cpp or have no significance to us
+	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
+	_ = flag.Bool("log-disable", false, "disables logging to a file")
+	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
+
 	flag.Parse()
 
 	level := slog.LevelInfo
 	if *verbose {
@@ -627,26 +628,6 @@ func main() {
 	})
 	slog.SetDefault(slog.New(handler))
 
-	// TODO actually implement...
-	if *embedding {
-		slog.Warn("embeddings not yet supported")
-	}
-	if *logDisable {
-		slog.Info("ignoring --log-disable")
-	}
-	if *f32 {
-		slog.Warn("memory-f32 not yet supported")
-	}
-	if *noMmap {
-		slog.Warn("no-mmap not yet supported")
-	}
-	if *mlock {
-		slog.Warn("mlock not yet supported")
-	}
-	if *tensorSplit != "" {
-		slog.Warn("tensor-split not yet implemented")
-	}
-
 	server := &Server{
 		numCtx:    *kvSize / *parallel,
 		batchSize: *batchSize,
@@ -659,10 +640,29 @@ func main() {
 	// otherwise Ollama can timeout for large model loads
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams(*nGpuLayers, *mainGpu, func(progress float32) {
-		slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
-		server.progress = progress
-	})
+
+	var tensorSplitFloats []float32
+	if *tensorSplit != "" {
+		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
+
+		tensorSplitFloats = make([]float32, 0, len(stringFloats))
+		for _, s := range stringFloats {
+			f, _ := strconv.ParseFloat(s, 32)
+			tensorSplitFloats = append(tensorSplitFloats, float32(f))
+		}
+	}
+
+	params := llama.ModelParams{
+		NumGpuLayers: *nGpuLayers,
+		MainGpu:      *mainGpu,
+		UseMmap:      !*noMmap && *lpath == "",
+		UseMlock:     *mlock,
+		TensorSplit:  tensorSplitFloats,
+		Progress: func(progress float32) {
+			slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
+			server.progress = progress
+		},
+	}
 	server.model = llama.LoadModelFromFile(*mpath, params)
 
 	if *lpath != "" {