no runners

Michael Yang 2024-12-20 13:46:26 -08:00
parent cef3cf353a
commit 67bcb55941
4 changed files with 154 additions and 381 deletions

View File

@@ -719,23 +719,18 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 func LibraryDirs() []string {
 	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
 	// This can be simplified once we no longer carry runners as payloads
-	paths := []string{}
-	appExe, err := os.Executable()
+	exe, err := os.Executable()
 	if err != nil {
 		slog.Warn("failed to lookup executable path", "error", err)
-	} else {
-		appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
-		if _, err := os.Stat(appRelative); err == nil {
-			paths = append(paths, appRelative)
-		}
+		return nil
 	}
-	rDir := runners.Locate()
-	if err != nil {
-		slog.Warn("unable to locate gpu dependency libraries", "error", err)
-	} else {
-		paths = append(paths, filepath.Dir(rDir))
+	lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
+	if _, err := os.Stat(lib); err != nil {
+		return nil
 	}
-	return paths
+	return []string{lib}
 }
 
 func GetSystemInfo() SystemInfo {
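The new LibraryDirs collapses two lookup paths into one: resolve the executable, check a single sibling lib/ollama directory, and return nothing otherwise. A minimal standalone sketch of that pattern (the relative argument stands in for envconfig.LibRelativeToExe(), which is assumed here):

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// libraryDir mirrors the simplified lookup: one candidate directory
// next to the executable, or nothing at all.
func libraryDir(relative string) (string, bool) {
	exe, err := os.Executable()
	if err != nil {
		return "", false
	}
	dir := filepath.Join(filepath.Dir(exe), relative, "lib", "ollama")
	if _, err := os.Stat(dir); err != nil {
		return "", false
	}
	return dir, true
}

func main() {
	if dir, ok := libraryDir(".."); ok {
		fmt.Println("bundled libraries at", dir)
	} else {
		fmt.Println("no bundled library directory")
	}
}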

View File

@@ -30,7 +30,6 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/runners"
 )
 
 type LlamaServer interface {
@@ -91,25 +90,19 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
 func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
-	var err error
-	var cpuRunner string
-	var estimate MemoryEstimate
-	var systemTotalMemory uint64
-	var systemFreeMemory uint64
-	var systemSwapFreeMemory uint64
-
 	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory = systemInfo.System.TotalMemory
-	systemFreeMemory = systemInfo.System.FreeMemory
-	systemSwapFreeMemory = systemInfo.System.FreeSwap
+	systemTotalMemory := systemInfo.System.TotalMemory
+	systemFreeMemory := systemInfo.System.FreeMemory
+	systemSwapFreeMemory := systemInfo.System.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
 		gpus = discover.GetCPUInfo()
 	}
+
+	var estimate MemoryEstimate
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		cpuRunner = runners.ServerForCpu()
 		estimate = EstimateGPULayers(gpus, f, projectors, opts)
 	} else {
 		estimate = EstimateGPULayers(gpus, f, projectors, opts)
@@ -121,7 +114,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		opts.NumGPU = 0
 	case gpus[0].Library != "metal" && estimate.Layers == 0:
 		// Don't bother loading into the GPU if no layers can fit
-		cpuRunner = runners.ServerForCpu()
 		gpus = discover.GetCPUInfo()
 	case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 		opts.NumGPU = estimate.Layers
@@ -141,36 +133,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	slog.Info("offload", "", estimate)
 
-	// Loop through potential servers
-	finalErr := errors.New("no suitable llama servers found")
-
-	availableServers := runners.GetAvailableServers()
-
-	var servers []string
-	if cpuRunner != "" {
-		servers = []string{cpuRunner}
-	} else {
-		servers = runners.ServersForGpu(gpus[0].RunnerName()) // All GPUs in the list are matching Library and Variant
-	}
-	demandLib := envconfig.LLMLibrary()
-	if demandLib != "" {
-		serverPath := availableServers[demandLib]
-		if serverPath == "" {
-			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
-		} else {
-			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
-			servers = []string{demandLib}
-			if strings.HasPrefix(demandLib, "cpu") || (!(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") && demandLib == runners.BuiltinName()) {
-				// Omit the GPU flag to silence the warning
-				opts.NumGPU = -1
-			}
-		}
-	}
-
-	if len(servers) == 0 {
-		return nil, fmt.Errorf("no servers found for %v", gpus)
-	}
-
 	params := []string{
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
@@ -271,18 +233,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		params = append(params, "--multiuser-cache")
 	}
 
-	for i := range servers {
-		builtin := servers[i] == runners.BuiltinName()
-		server := availableServers[servers[i]]
-		if server == "" {
-			// Shouldn't happen
-			finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-			slog.Error("server list inconsistent", "error", finalErr)
-			continue
-		}
-
-		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
-			gpus = discover.GetCPUInfo()
-		}
+	exe, err := os.Executable()
+	if err != nil {
+		return nil, err
 	}
 
 	// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -307,7 +260,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		pathEnv = "PATH"
 	}
 	// Start with the server directory for the LD_LIBRARY_PATH/PATH
-	libraryPaths := []string{filepath.Dir(server)}
+	libraryPaths := []string{filepath.Dir(exe)}
 
 	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
 		// favor our bundled library dependencies over system libraries
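For context, the surrounding code builds the child process's library search path roughly as follows: the executable's own directory first, then whatever was already in the environment. A sketch with assumed details (per the hunk above, pathEnv is LD_LIBRARY_PATH on Linux and PATH on Windows; LD_LIBRARY_PATH is hard-coded here for brevity):

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

func main() {
	exe, err := os.Executable()
	if err != nil {
		panic(err)
	}

	// Bundled libraries first, existing search path after, so directories
	// shipped next to the executable win over system ones.
	libraryPaths := []string{filepath.Dir(exe)}
	if existing, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
		libraryPaths = append(libraryPaths, filepath.SplitList(existing)...)
	}

	fmt.Println("LD_LIBRARY_PATH=" + strings.Join(libraryPaths, string(os.PathListSeparator)))
}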
@@ -324,7 +277,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
 	s := &llmServer{
 		port:      port,
-		cmd:       exec.Command(server, finalParams...),
+		cmd:       exec.Command(exe, finalParams...),
 		status:    NewStatusWriter(os.Stderr),
 		options:   opts,
 		modelPath: model,
@@ -397,16 +350,14 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	if err = s.cmd.Start(); err != nil {
 		// Detect permission denied and augment the message about noexec
 		if errors.Is(err, os.ErrPermission) {
-			finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, server)
-			continue
+			return nil, fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, exe)
 		}
 		msg := ""
 		if s.status != nil && s.status.LastErrMsg != "" {
 			msg = s.status.LastErrMsg
 		}
-		err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
-		finalErr = err
-		continue
+		return nil, fmt.Errorf("error starting the external llama server: %v %s", err, msg)
 	}
 
 	// reap subprocess when it exits
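With only one candidate binary, the accumulate-and-continue error handling above turns into a plain fail-fast return. A self-contained sketch of that shape (a hypothetical condensation, not the literal new code):

package main

import (
	"errors"
	"fmt"
	"os"
	"os/exec"
)

// startOnce mirrors the new fail-fast structure: the first start-up
// error is wrapped and returned immediately; there is no retry loop.
func startOnce(cmd *exec.Cmd) error {
	if err := cmd.Start(); err != nil {
		if errors.Is(err, os.ErrPermission) {
			return fmt.Errorf("unable to start server: %w (is the binary on a noexec mount?)", err)
		}
		return fmt.Errorf("error starting the external llama server: %w", err)
	}
	return nil
}

func main() {
	fmt.Println(startOnce(exec.Command("/does/not/exist")))
}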
@@ -425,10 +376,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	}()
 
 	return s, nil
-	}
-
-	slog.Error("unable to load any llama server", "error", finalErr)
-	return nil, finalErr
 }
 
 type ServerStatus int

View File

@@ -1,17 +1,9 @@
 package runners
 
 import (
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"slices"
-	"strings"
 	"sync"
 
 	"golang.org/x/sys/cpu"
-
-	"github.com/ollama/ollama/envconfig"
 )
 
 var (
@@ -52,155 +44,3 @@ func GetCPUCapability() CPUCapability {
 	// else LCD
 	return CPUCapabilityNone
 }
-
-// Return the location where runners were located
-// empty string indicates only builtin is present
-func Locate() string {
-	once.Do(locateRunnersOnce)
-	return runnersDir
-}
-
-// searches for runners in a prioritized set of locations
-// 1. local build, with executable at the top of the tree
-// 2. lib directory relative to executable
-func locateRunnersOnce() {
-	exe, err := os.Executable()
-	if err != nil {
-		slog.Debug("runner locate", "error", err)
-	}
-	paths := []string{
-		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
-		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
-	}
-	for _, path := range paths {
-		if _, err := os.Stat(path); err == nil {
-			runnersDir = path
-			slog.Debug("runners located", "dir", runnersDir)
-			return
-		}
-	}
-	// Fall back to built-in
-	slog.Debug("no dynamic runners detected, using only built-in")
-	runnersDir = ""
-}
-
-// Return the well-known name of the builtin runner for the given platform
-func BuiltinName() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
-	}
-	return "cpu"
-}
-
-// directory names are the name of the runner and may contain an optional
-// variant prefixed with '_' as the separator. For example, "cuda_v11" and
-// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
-// lowest common denominator
-func GetAvailableServers() map[string]string {
-	once.Do(locateRunnersOnce)
-	servers := make(map[string]string)
-	exe, err := os.Executable()
-	if err == nil {
-		servers[BuiltinName()] = exe
-	}
-	if runnersDir == "" {
-		return servers
-	}
-	// glob runnersDir for files that start with ollama_
-	pattern := filepath.Join(runnersDir, "*", "ollama_*")
-	files, err := filepath.Glob(pattern)
-	if err != nil {
-		slog.Debug("could not glob", "pattern", pattern, "error", err)
-		return nil
-	}
-	for _, file := range files {
-		slog.Debug("availableServers : found", "file", file)
-		runnerName := filepath.Base(filepath.Dir(file))
-		// Special case for our GPU runners - if compiled with standard AVX flag
-		// detect incompatible system
-		// Custom builds will omit this and it's up to the user to ensure compatibility
-		parsed := strings.Split(runnerName, "_")
-		if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
-			slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
-			continue
-		}
-		servers[runnerName] = file
-	}
-	return servers
-}
-
-// ServersForGpu returns a list of compatible servers given the provided GPU library/variant
-func ServersForGpu(requested string) []string {
-	// glob workDir for files that start with ollama_
-	availableServers := GetAvailableServers()
-
-	// Short circuit if the only option is built-in
-	if _, ok := availableServers[BuiltinName()]; ok && len(availableServers) == 1 {
-		return []string{BuiltinName()}
-	}
-
-	bestCPUVariant := GetCPUCapability()
-	requestedLib := strings.Split(requested, "_")[0]
-	servers := []string{}
-
-	// exact match first
-	for a := range availableServers {
-		short := a
-		parsed := strings.Split(a, "_")
-		if len(parsed) == 3 {
-			// Strip off optional _avx for comparison
-			short = parsed[0] + "_" + parsed[1]
-		}
-		if a == requested || short == requested {
-			servers = []string{a}
-		}
-	}
-
-	// If no exact match, then try without variant
-	if len(servers) == 0 {
-		alt := []string{}
-		for a := range availableServers {
-			if requestedLib == strings.Split(a, "_")[0] && a != requested {
-				alt = append(alt, a)
-			}
-		}
-		slices.Sort(alt)
-		servers = append(servers, alt...)
-	}
-
-	// Finally append the best CPU option if found, then builtin
-	if bestCPUVariant != CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+bestCPUVariant.String() {
-				servers = append(servers, cmp)
-				break
-			}
-		}
-	}
-	servers = append(servers, BuiltinName())
-	return servers
-}
-
-// Return the optimal server for this CPU architecture
-func ServerForCpu() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return BuiltinName()
-	}
-	variant := GetCPUCapability()
-	availableServers := GetAvailableServers()
-	if variant != CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+variant.String() {
-				return cmp
-			}
-		}
-	}
-	return BuiltinName()
-}
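The removed helpers all revolve around the naming scheme described in the comment on GetAvailableServers: a library name plus an optional variant joined by an underscore, e.g. cuda_v12 or cpu_avx2 (with a possible trailing _avx for GPU builds, not handled below). A small standalone sketch of the basic parsing convention, for illustration only:

package main

import (
	"fmt"
	"strings"
)

// splitRunnerName splits the old "<library>[_<variant>]" convention;
// a missing variant marks the lowest-common-denominator build.
func splitRunnerName(name string) (library, variant string) {
	library, variant, _ = strings.Cut(name, "_")
	return library, variant
}

func main() {
	for _, n := range []string{"cpu", "cpu_avx2", "cuda_v11", "rocm"} {
		lib, variant := splitRunnerName(n)
		fmt.Printf("%-8s -> library=%s variant=%q\n", n, lib, variant)
	}
}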

View File

@@ -33,7 +33,6 @@ import (
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
-	"github.com/ollama/ollama/runners"
 	"github.com/ollama/ollama/server/imageproc"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
@@ -1269,14 +1268,6 @@ func Serve(ln net.Listener) error {
 		done()
 	}()
 
-	// Locate and log what runners are present at startup
-	var runnerNames []string
-	for v := range runners.GetAvailableServers() {
-		runnerNames = append(runnerNames, v)
-	}
-	slog.Info("Dynamic LLM libraries", "runners", runnerNames)
-	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-
 	s.sched.Run(schedCtx)
 
 	// At startup we retrieve GPU information so we can get log messages before loading a model