ollamarunner: Pass runner performance parameters to backends

Currently the following parameters are in the runner but not used:
 - numGPULayers
 - mainGPU
 - threads
 - tensorSplit

This passes them through to the backend, which is where they would
actually get used. However, the GGML backend does not yet do anything
with them.
Jesse Gross, 2025-02-20 11:18:01 -08:00 (committed by Jesse Gross)
parent 14b5a9a150, commit bd6a7d5e64
4 changed files with 40 additions and 20 deletions

ml/backend.go

@@ -26,9 +26,24 @@ type Backend interface {
 	SystemInfo() string
 }
 
-var backends = make(map[string]func(*os.File) (Backend, error))
+// BackendParams controls how the backend loads and executes models
+type BackendParams struct {
+	// NumThreads sets the number of threads to use if running on the CPU
+	NumThreads int
+
+	// MainGPU is the index of the primary GPU to use
+	MainGPU int
+
+	// NumGPULayers is the number of layers to offload to GPUs
+	NumGPULayers int
+
+	// TensorSplit is the fraction of the model to offload to each GPU
+	TensorSplit []float32
+}
+
+var backends = make(map[string]func(*os.File, BackendParams) (Backend, error))
 
-func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
+func RegisterBackend(name string, f func(*os.File, BackendParams) (Backend, error)) {
 	if _, ok := backends[name]; ok {
 		panic("backend: backend already registered")
 	}
@@ -36,9 +51,9 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
 	backends[name] = f
 }
 
-func NewBackend(f *os.File) (Backend, error) {
+func NewBackend(f *os.File, params BackendParams) (Backend, error) {
 	if backend, ok := backends["ggml"]; ok {
-		return backend(f)
+		return backend(f, params)
 	}
 
 	return nil, fmt.Errorf("unsupported backend")
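
Note that TensorSplit holds raw per-GPU proportions straight from the --tensor-split flag, not normalized fractions. Below is a minimal sketch of how a backend might turn them into per-GPU shares; this is an assumed usage, not part of the commit (the GGML backend ignores these fields for now):

// splitFraction returns GPU i's normalized share of the model, given the
// raw proportions from --tensor-split. Illustrative only: as of this commit
// no backend consumes BackendParams.TensorSplit.
func splitFraction(tensorSplit []float32, i int) float32 {
	var total float32
	for _, p := range tensorSplit {
		total += p
	}
	if total == 0 || i < 0 || i >= len(tensorSplit) {
		return 0
	}
	return tensorSplit[i] / total
}

With --tensor-split 3,1, GPU 0 would get 0.75 of the model and GPU 1 the remaining 0.25.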

ml/backend/ggml/ggml.go

@@ -84,7 +84,7 @@ type Backend struct {
 	tensors map[string]*Context
 }
 
-func New(r *os.File) (ml.Backend, error) {
+func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 	meta, n, err := fs.Decode(r, -1)
 	if err != nil {
 		return nil, err

model/model.go

@@ -70,14 +70,14 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 }
 
 // New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(modelPath string) (Model, error) {
+func New(modelPath string, params ml.BackendParams) (Model, error) {
 	r, err := os.Open(modelPath)
 	if err != nil {
 		return nil, err
 	}
 	defer r.Close()
 
-	b, err := ml.NewBackend(r)
+	b, err := ml.NewBackend(r, params)
 	if err != nil {
 		return nil, err
 	}
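
With this signature change, callers supply backend parameters up front. A hypothetical standalone caller of the new API, with placeholder path and values (in this commit only the runner constructs these):

package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
)

func main() {
	// Placeholder values; a real caller derives these from its own flags.
	params := ml.BackendParams{
		NumThreads:   8,
		MainGPU:      0,
		NumGPULayers: 28,
		TensorSplit:  []float32{0.6, 0.4},
	}

	m, err := model.New("/path/to/model.gguf", params)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(m.Backend().SystemInfo())
}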

runner/ollamarunner/runner.go

@@ -25,6 +25,7 @@ import (
 	"golang.org/x/sync/semaphore"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/runner/common"
 	"github.com/ollama/ollama/sample"
@@ -801,6 +802,7 @@ func (m *multiLPath) String() string {
 
 func (s *Server) loadModel(
 	mpath string,
+	params ml.BackendParams,
 	lpath multiLPath,
 	parallel int,
 	kvCacheType string,
@@ -808,12 +810,12 @@ func (s *Server) loadModel(
 	multiUserCache bool,
 ) {
 	var err error
-	s.model, err = model.New(mpath)
+	s.model, err = model.New(mpath, params)
 	if err != nil {
 		panic(err)
 	}
 
-	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
+	slog.Info("system", "info", s.model.Backend().SystemInfo(), "threads", params.NumThreads)
 
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
@@ -843,17 +845,17 @@ func Execute(args []string) error {
 	mpath := fs.String("model", "", "Path to model binary file")
 	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
 	batchSize := fs.Int("batch-size", 512, "Batch size")
-	_ = fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
-	_ = fs.Int("main-gpu", 0, "Main GPU")
+	numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGPU := fs.Int("main-gpu", 0, "Main GPU")
 	_ = fs.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
 	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
-	_ = fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
+	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
-	_ = fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 
 	var lpaths multiLPath
@@ -890,15 +892,11 @@ func Execute(args []string) error {
 	}
 
 	// TODO(jessegross): Parameters that need to be implemented:
-	// n-gpu-layers
-	// main-gpu
 	// flash-attn
-	// threads
 	// no-mmap
 	// mlock
-	// tensor-split
 
-	/*var tensorSplitFloats []float32
+	var tensorSplitFloats []float32
 	if *tensorSplit != "" {
 		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
@@ -907,10 +905,17 @@ func Execute(args []string) error {
 			f, _ := strconv.ParseFloat(s, 32)
 			tensorSplitFloats = append(tensorSplitFloats, float32(f))
 		}
-	}*/
+	}
+
+	params := ml.BackendParams{
+		NumThreads:   *threads,
+		NumGPULayers: *numGPULayers,
+		MainGPU:      *mainGPU,
+		TensorSplit:  tensorSplitFloats,
+	}
 
 	server.ready.Add(1)
-	go server.loadModel(*mpath, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
+	go server.loadModel(*mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)
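
The tensor-split parsing that this commit un-comments is easy to exercise in isolation. A self-contained rendering of the same logic; note that, as in the runner, strconv.ParseFloat errors are silently ignored and leave a 0 in the slice:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// parseTensorSplit mirrors the runner's handling of --tensor-split:
// a comma-separated list of proportions becomes a []float32.
func parseTensorSplit(s string) []float32 {
	var out []float32
	if s == "" {
		return out
	}
	for _, field := range regexp.MustCompile(",").Split(s, -1) {
		f, _ := strconv.ParseFloat(field, 32) // a bad field parses as 0
		out = append(out, float32(f))
	}
	return out
}

func main() {
	fmt.Println(parseTensorSplit("3,1"))     // [3 1]
	fmt.Println(parseTensorSplit("0.6,0.4")) // [0.6 0.4]
	fmt.Println(parseTensorSplit(""))        // []
}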