From bd6a7d5e6416c4c2aeba07233303385254395b6c Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Thu, 20 Feb 2025 11:18:01 -0800
Subject: [PATCH] ollamarunner: Pass runner performance parameters to backends

Currently the following parameters are in the runner but not used:
 - numGPULayers
 - mainGPU
 - threads
 - tensorSplit

This passes them through to the backend, which is where they would
actually get used. However, the GGML backend does not yet do anything
with them.
---
 ml/backend.go                 | 23 +++++++++++++++++++----
 ml/backend/ggml/ggml.go       |  2 +-
 model/model.go                |  4 ++--
 runner/ollamarunner/runner.go | 31 ++++++++++++++++++-------------
 4 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/ml/backend.go b/ml/backend.go
index aebf86f76..3cc18f2b6 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -26,9 +26,24 @@ type Backend interface {
 	SystemInfo() string
 }
 
-var backends = make(map[string]func(*os.File) (Backend, error))
+// BackendParams controls how the backend loads and executes models
+type BackendParams struct {
+	// NumThreads sets the number of threads to use if running on the CPU
+	NumThreads int
 
-func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
+	// MainGPU is the index of the primary GPU to use
+	MainGPU int
+
+	// NumGPULayers is the number of layers to offload to GPUs
+	NumGPULayers int
+
+	// TensorSplit is the fraction of the model to offload to each GPU
+	TensorSplit []float32
+}
+
+var backends = make(map[string]func(*os.File, BackendParams) (Backend, error))
+
+func RegisterBackend(name string, f func(*os.File, BackendParams) (Backend, error)) {
 	if _, ok := backends[name]; ok {
 		panic("backend: backend already registered")
 	}
@@ -36,9 +51,9 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
 	backends[name] = f
 }
 
-func NewBackend(f *os.File) (Backend, error) {
+func NewBackend(f *os.File, params BackendParams) (Backend, error) {
 	if backend, ok := backends["ggml"]; ok {
-		return backend(f)
+		return backend(f, params)
 	}
 
 	return nil, fmt.Errorf("unsupported backend")
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 5ba36361e..492f2d0af 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -84,7 +84,7 @@ type Backend struct {
 	tensors map[string]*Context
 }
 
-func New(r *os.File) (ml.Backend, error) {
+func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 	meta, n, err := fs.Decode(r, -1)
 	if err != nil {
 		return nil, err
diff --git a/model/model.go b/model/model.go
index 5eedc9bd0..0b5996d9f 100644
--- a/model/model.go
+++ b/model/model.go
@@ -70,14 +70,14 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 }
 
 // New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(modelPath string) (Model, error) {
+func New(modelPath string, params ml.BackendParams) (Model, error) {
 	r, err := os.Open(modelPath)
 	if err != nil {
 		return nil, err
 	}
 	defer r.Close()
 
-	b, err := ml.NewBackend(r)
+	b, err := ml.NewBackend(r, params)
 	if err != nil {
 		return nil, err
 	}
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index 6d45050c8..d11eba820 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -25,6 +25,7 @@ import (
 	"golang.org/x/sync/semaphore"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/runner/common"
 	"github.com/ollama/ollama/sample"
@@ -801,6 +802,7 @@ func (m *multiLPath) String() string {
 
 func (s *Server) loadModel(
 	mpath string,
+	params ml.BackendParams,
 	lpath multiLPath,
 	parallel int,
 	kvCacheType string,
@@ -808,12 +810,12 @@ func (s *Server) loadModel(
 	multiUserCache bool,
 ) {
 	var err error
-	s.model, err = model.New(mpath)
+	s.model, err = model.New(mpath, params)
 	if err != nil {
 		panic(err)
 	}
 
-	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
+	slog.Info("system", "info", s.model.Backend().SystemInfo(), "threads", params.NumThreads)
 
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
@@ -843,17 +845,17 @@ func Execute(args []string) error {
 	mpath := fs.String("model", "", "Path to model binary file")
 	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
 	batchSize := fs.Int("batch-size", 512, "Batch size")
-	_ = fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
-	_ = fs.Int("main-gpu", 0, "Main GPU")
+	numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGPU := fs.Int("main-gpu", 0, "Main GPU")
 	_ = fs.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
 	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
-	_ = fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
+	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
-	_ = fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 
 	var lpaths multiLPath
@@ -890,15 +892,11 @@ func Execute(args []string) error {
 	}
 
 	// TODO(jessegross): Parameters that need to be implemented:
-	// n-gpu-layers
-	// main-gpu
 	// flash-attn
-	// threads
 	// no-mmap
 	// mlock
-	// tensor-split
 
-	/*var tensorSplitFloats []float32
+	var tensorSplitFloats []float32
 	if *tensorSplit != "" {
 		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
 
@@ -907,10 +905,17 @@ func Execute(args []string) error {
 			f, _ := strconv.ParseFloat(s, 32)
 			tensorSplitFloats = append(tensorSplitFloats, float32(f))
 		}
-	}*/
+	}
+
+	params := ml.BackendParams{
+		NumThreads:   *threads,
+		NumGPULayers: *numGPULayers,
+		MainGPU:      *mainGPU,
+		TensorSplit:  tensorSplitFloats,
+	}
 
 	server.ready.Add(1)
-	go server.loadModel(*mpath, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
+	go server.loadModel(*mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)
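
Reviewer note (not part of the patch): below is a minimal sketch of how a caller outside the runner could drive the new signatures introduced above. The model path and parameter values are placeholders, error handling is trimmed, and it assumes the ggml backend package registers itself with ml.RegisterBackend when imported; only model.New(path, params) and ml.BackendParams come from this patch.

    package main

    import (
    	"log"

    	"github.com/ollama/ollama/ml"
    	"github.com/ollama/ollama/model"
    	// Assumed: importing the ggml package registers the "ggml" backend
    	// via ml.RegisterBackend in an init function.
    	_ "github.com/ollama/ollama/ml/backend/ggml"
    )

    func main() {
    	// Collect the runner's performance knobs into the new BackendParams
    	// struct; these values are illustrative, not defaults from the patch.
    	params := ml.BackendParams{
    		NumThreads:   8,
    		MainGPU:      0,
    		NumGPULayers: 32,
    		TensorSplit:  []float32{0.5, 0.5},
    	}

    	// model.New forwards params to ml.NewBackend, which passes them to
    	// the registered "ggml" backend (which, per the commit message,
    	// ignores them for now). The path is a placeholder.
    	m, err := model.New("/path/to/model.gguf", params)
    	if err != nil {
    		log.Fatal(err)
    	}
    	_ = m
    }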