diff --git a/discover/gpu.go b/discover/gpu.go
index e76c844fe..0dcc9bd10 100644
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -719,23 +719,18 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 func LibraryDirs() []string {
 	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
 	// This can be simplified once we no longer carry runners as payloads
-	paths := []string{}
-	appExe, err := os.Executable()
+	exe, err := os.Executable()
 	if err != nil {
 		slog.Warn("failed to lookup executable path", "error", err)
-	} else {
-		appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
-		if _, err := os.Stat(appRelative); err == nil {
-			paths = append(paths, appRelative)
-		}
+		return nil
 	}
-	rDir := runners.Locate()
-	if err != nil {
-		slog.Warn("unable to locate gpu dependency libraries", "error", err)
-	} else {
-		paths = append(paths, filepath.Dir(rDir))
+
+	lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
+	if _, err := os.Stat(lib); err != nil {
+		return nil
 	}
-	return paths
+
+	return []string{lib}
 }
 
 func GetSystemInfo() SystemInfo {
diff --git a/llm/server.go b/llm/server.go
index dc38979ca..9add35497 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -30,7 +30,6 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/runners"
 )
 
 type LlamaServer interface {
@@ -91,25 +90,19 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
 func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
-	var err error
-	var cpuRunner string
-	var estimate MemoryEstimate
-	var systemTotalMemory uint64
-	var systemFreeMemory uint64
-	var systemSwapFreeMemory uint64
-
 	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory = systemInfo.System.TotalMemory
-	systemFreeMemory = systemInfo.System.FreeMemory
-	systemSwapFreeMemory = systemInfo.System.FreeSwap
+	systemTotalMemory := systemInfo.System.TotalMemory
+	systemFreeMemory := systemInfo.System.FreeMemory
+	systemSwapFreeMemory := systemInfo.System.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
 		gpus = discover.GetCPUInfo()
 	}
+
+	var estimate MemoryEstimate
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		cpuRunner = runners.ServerForCpu()
 		estimate = EstimateGPULayers(gpus, f, projectors, opts)
 	} else {
 		estimate = EstimateGPULayers(gpus, f, projectors, opts)
@@ -121,7 +114,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		opts.NumGPU = 0
 	case gpus[0].Library != "metal" && estimate.Layers == 0:
 		// Don't bother loading into the GPU if no layers can fit
-		cpuRunner = runners.ServerForCpu()
 		gpus = discover.GetCPUInfo()
 	case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 		opts.NumGPU = estimate.Layers
@@ -141,36 +133,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 
 	slog.Info("offload", "", estimate)
 
-	// Loop through potential servers
-	finalErr := errors.New("no suitable llama servers found")
-
-	availableServers := runners.GetAvailableServers()
-
-	var servers []string
-	if cpuRunner != "" {
-		servers = []string{cpuRunner}
-	} else {
-		servers = runners.ServersForGpu(gpus[0].RunnerName()) // All GPUs in the list are matching Library and Variant
-	}
-	demandLib := envconfig.LLMLibrary()
-	if demandLib != "" {
-		serverPath := availableServers[demandLib]
-		if serverPath == "" {
-			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
-		} else {
-			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
-			servers = []string{demandLib}
-			if strings.HasPrefix(demandLib, "cpu") || (!(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") && demandLib == runners.BuiltinName()) {
-				// Omit the GPU flag to silence the warning
-				opts.NumGPU = -1
-			}
-		}
-	}
-
-	if len(servers) == 0 {
-		return nil, fmt.Errorf("no servers found for %v", gpus)
-	}
-
 	params := []string{
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
@@ -271,164 +233,149 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		params = append(params, "--multiuser-cache")
 	}
 
-	for i := range servers {
-		builtin := servers[i] == runners.BuiltinName()
-		server := availableServers[servers[i]]
-		if server == "" {
-			// Shouldn't happen
-			finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-			slog.Error("server list inconsistent", "error", finalErr)
-			continue
-		}
-
-		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
-			gpus = discover.GetCPUInfo()
-		}
-
-		// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
-		port := 0
-		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-			var l *net.TCPListener
-			if l, err = net.ListenTCP("tcp", a); err == nil {
-				port = l.Addr().(*net.TCPAddr).Port
-				l.Close()
-			}
-		}
-		if port == 0 {
-			slog.Debug("ResolveTCPAddr failed ", "error", err)
-			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-		}
-		finalParams := []string{"runner"}
-		finalParams = append(finalParams, params...)
-		finalParams = append(finalParams, "--port", strconv.Itoa(port))
-
-		pathEnv := "LD_LIBRARY_PATH"
-		if runtime.GOOS == "windows" {
-			pathEnv = "PATH"
-		}
-		// Start with the server directory for the LD_LIBRARY_PATH/PATH
-		libraryPaths := []string{filepath.Dir(server)}
-
-		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-			// favor our bundled library dependencies over system libraries
-			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
-		}
-
-		// Note: we always put the dependency path first
-		// since this was the exact version we compiled/linked against
-		if gpus[0].DependencyPath != nil {
-			// assume gpus from the same library have the same dependency path
-			libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
-		}
-
-		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
-		s := &llmServer{
-			port:        port,
-			cmd:         exec.Command(server, finalParams...),
-			status:      NewStatusWriter(os.Stderr),
-			options:     opts,
-			modelPath:   model,
-			estimate:    estimate,
-			numParallel: numParallel,
-			sem:         semaphore.NewWeighted(int64(numParallel)),
-			totalLayers: f.KV().BlockCount() + 1,
-			gpus:        gpus,
-			done:        make(chan error, 1),
-		}
-
-		s.cmd.Env = os.Environ()
-		s.cmd.Stdout = os.Stdout
-		s.cmd.Stderr = s.status
-		s.cmd.SysProcAttr = LlamaServerSysProcAttr
-
-		envWorkarounds := [][2]string{}
-		for _, gpu := range gpus {
-			envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
-		}
-		visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
-		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-
-		// Update or add the path and visible devices variable with our adjusted version
-		pathNeeded := true
-		devicesNeeded := visibleDevicesEnv != ""
-		for i := range s.cmd.Env {
-			cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
-			if strings.EqualFold(cmp[0], pathEnv) {
-				s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
-				pathNeeded = false
-			} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
-				s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
-				devicesNeeded = false
-			} else if len(envWorkarounds) != 0 {
-				for _, kv := range envWorkarounds {
-					if strings.EqualFold(cmp[0], kv[0]) {
-						s.cmd.Env[i] = kv[0] + "=" + kv[1]
-					}
-				}
-			}
-		}
-		if pathNeeded {
-			s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
-		}
-		if devicesNeeded {
-			s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
-		}
-
-		slog.Info("starting llama server", "cmd", s.cmd.String())
-		if envconfig.Debug() {
-			filteredEnv := []string{}
-			for _, ev := range s.cmd.Env {
-				if strings.HasPrefix(ev, "CUDA_") ||
-					strings.HasPrefix(ev, "ROCR_") ||
-					strings.HasPrefix(ev, "ROCM_") ||
-					strings.HasPrefix(ev, "HIP_") ||
-					strings.HasPrefix(ev, "GPU_") ||
-					strings.HasPrefix(ev, "HSA_") ||
-					strings.HasPrefix(ev, "GGML_") ||
-					strings.HasPrefix(ev, "PATH=") ||
-					strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
-					filteredEnv = append(filteredEnv, ev)
-				}
-			}
-			// Log at debug as the environment is inherited and might contain sensitive information
-			slog.Debug("subprocess", "environment", filteredEnv)
-		}
-
-		if err = s.cmd.Start(); err != nil {
-			// Detect permission denied and augment the message about noexec
-			if errors.Is(err, os.ErrPermission) {
-				finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, server)
-				continue
-			}
-			msg := ""
-			if s.status != nil && s.status.LastErrMsg != "" {
-				msg = s.status.LastErrMsg
-			}
-			err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
-			finalErr = err
-			continue
-		}
-
-		// reap subprocess when it exits
-		go func() {
-			err := s.cmd.Wait()
-			// Favor a more detailed message over the process exit status
-			if err != nil && s.status != nil && s.status.LastErrMsg != "" {
-				slog.Debug("llama runner terminated", "error", err)
-				if strings.Contains(s.status.LastErrMsg, "unknown model") {
-					s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
-				}
-				s.done <- errors.New(s.status.LastErrMsg)
-			} else {
-				s.done <- err
-			}
-		}()
-
-		return s, nil
+	exe, err := os.Executable()
+	if err != nil {
+		return nil, err
 	}
 
-	slog.Error("unable to load any llama server", "error", finalErr)
-	return nil, finalErr
+	// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
+	port := 0
+	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
+		var l *net.TCPListener
+		if l, err = net.ListenTCP("tcp", a); err == nil {
+			port = l.Addr().(*net.TCPAddr).Port
+			l.Close()
+		}
+	}
+	if port == 0 {
+		slog.Debug("ResolveTCPAddr failed ", "error", err)
+		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+	}
+	finalParams := []string{"runner"}
+	finalParams = append(finalParams, params...)
+	finalParams = append(finalParams, "--port", strconv.Itoa(port))
+
+	pathEnv := "LD_LIBRARY_PATH"
+	if runtime.GOOS == "windows" {
+		pathEnv = "PATH"
+	}
+	// Start with the server directory for the LD_LIBRARY_PATH/PATH
+	libraryPaths := []string{filepath.Dir(exe)}
+
+	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+		// favor our bundled library dependencies over system libraries
+		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+	}
+
+	// Note: we always put the dependency path first
+	// since this was the exact version we compiled/linked against
+	if gpus[0].DependencyPath != nil {
+		// assume gpus from the same library have the same dependency path
+		libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
+	}
+
+	// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
+	s := &llmServer{
+		port:        port,
+		cmd:         exec.Command(exe, finalParams...),
+		status:      NewStatusWriter(os.Stderr),
+		options:     opts,
+		modelPath:   model,
+		estimate:    estimate,
+		numParallel: numParallel,
+		sem:         semaphore.NewWeighted(int64(numParallel)),
+		totalLayers: f.KV().BlockCount() + 1,
+		gpus:        gpus,
+		done:        make(chan error, 1),
+	}
+
+	s.cmd.Env = os.Environ()
+	s.cmd.Stdout = os.Stdout
+	s.cmd.Stderr = s.status
+	s.cmd.SysProcAttr = LlamaServerSysProcAttr
+
+	envWorkarounds := [][2]string{}
+	for _, gpu := range gpus {
+		envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
+	}
+	visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
+	pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+
+	// Update or add the path and visible devices variable with our adjusted version
+	pathNeeded := true
+	devicesNeeded := visibleDevicesEnv != ""
+	for i := range s.cmd.Env {
+		cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
+		if strings.EqualFold(cmp[0], pathEnv) {
+			s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
+			pathNeeded = false
+		} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
+			s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
+			devicesNeeded = false
+		} else if len(envWorkarounds) != 0 {
+			for _, kv := range envWorkarounds {
+				if strings.EqualFold(cmp[0], kv[0]) {
+					s.cmd.Env[i] = kv[0] + "=" + kv[1]
+				}
+			}
+		}
+	}
+	if pathNeeded {
+		s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
+	}
+	if devicesNeeded {
+		s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
+	}
+
+	slog.Info("starting llama server", "cmd", s.cmd.String())
+	if envconfig.Debug() {
+		filteredEnv := []string{}
+		for _, ev := range s.cmd.Env {
+			if strings.HasPrefix(ev, "CUDA_") ||
+				strings.HasPrefix(ev, "ROCR_") ||
+				strings.HasPrefix(ev, "ROCM_") ||
+				strings.HasPrefix(ev, "HIP_") ||
+				strings.HasPrefix(ev, "GPU_") ||
+				strings.HasPrefix(ev, "HSA_") ||
+				strings.HasPrefix(ev, "GGML_") ||
+				strings.HasPrefix(ev, "PATH=") ||
+				strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
+				filteredEnv = append(filteredEnv, ev)
+			}
+		}
+		// Log at debug as the environment is inherited and might contain sensitive information
+		slog.Debug("subprocess", "environment", filteredEnv)
+	}
+
+	if err = s.cmd.Start(); err != nil {
+		// Detect permission denied and augment the message about noexec
+		if errors.Is(err, os.ErrPermission) {
+			return nil, fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, exe)
+		}
+
+		msg := ""
+		if s.status != nil && s.status.LastErrMsg != "" {
+			msg = s.status.LastErrMsg
+		}
+		return nil, fmt.Errorf("error starting the external llama server: %v %s", err, msg)
+	}
+
+	// reap subprocess when it exits
+	go func() {
+		err := s.cmd.Wait()
+		// Favor a more detailed message over the process exit status
+		if err != nil && s.status != nil && s.status.LastErrMsg != "" {
+			slog.Debug("llama runner terminated", "error", err)
+			if strings.Contains(s.status.LastErrMsg, "unknown model") {
+				s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
+			}
+			s.done <- errors.New(s.status.LastErrMsg)
+		} else {
+			s.done <- err
+		}
+	}()
+
+	return s, nil
 }
 
 type ServerStatus int
diff --git a/runners/common.go b/runners/common.go
index 287a6716a..c14207b03 100644
--- a/runners/common.go
+++ b/runners/common.go
@@ -1,17 +1,9 @@
 package runners
 
 import (
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"slices"
-	"strings"
 	"sync"
 
 	"golang.org/x/sys/cpu"
-
-	"github.com/ollama/ollama/envconfig"
 )
 
 var (
@@ -52,155 +44,3 @@ func GetCPUCapability() CPUCapability {
 	// else LCD
 	return CPUCapabilityNone
 }
-
-// Return the location where runners were located
-// empty string indicates only builtin is present
-func Locate() string {
-	once.Do(locateRunnersOnce)
-	return runnersDir
-}
-
-// searches for runners in a prioritized set of locations
-// 1. local build, with executable at the top of the tree
-// 2. lib directory relative to executable
-func locateRunnersOnce() {
-	exe, err := os.Executable()
-	if err != nil {
-		slog.Debug("runner locate", "error", err)
-	}
-
-	paths := []string{
-		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
-		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
-	}
-	for _, path := range paths {
-		if _, err := os.Stat(path); err == nil {
-			runnersDir = path
-			slog.Debug("runners located", "dir", runnersDir)
-			return
-		}
-	}
-	// Fall back to built-in
-	slog.Debug("no dynamic runners detected, using only built-in")
-	runnersDir = ""
-}
-
-// Return the well-known name of the builtin runner for the given platform
-func BuiltinName() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
-	}
-	return "cpu"
-}
-
-// directory names are the name of the runner and may contain an optional
-// variant prefixed with '_' as the separator. For example, "cuda_v11" and
-// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
-// lowest common denominator
-func GetAvailableServers() map[string]string {
-	once.Do(locateRunnersOnce)
-
-	servers := make(map[string]string)
-	exe, err := os.Executable()
-	if err == nil {
-		servers[BuiltinName()] = exe
-	}
-
-	if runnersDir == "" {
-		return servers
-	}
-
-	// glob runnersDir for files that start with ollama_
-	pattern := filepath.Join(runnersDir, "*", "ollama_*")
-
-	files, err := filepath.Glob(pattern)
-	if err != nil {
-		slog.Debug("could not glob", "pattern", pattern, "error", err)
-		return nil
-	}
-
-	for _, file := range files {
-		slog.Debug("availableServers : found", "file", file)
-		runnerName := filepath.Base(filepath.Dir(file))
-		// Special case for our GPU runners - if compiled with standard AVX flag
-		// detect incompatible system
-		// Custom builds will omit this and its up to the user to ensure compatibility
-		parsed := strings.Split(runnerName, "_")
-		if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
-			slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
-			continue
-		}
-		servers[runnerName] = file
-	}
-
-	return servers
-}
-
-// serversForGpu returns a list of compatible servers give the provided GPU library/variant
-func ServersForGpu(requested string) []string {
-	// glob workDir for files that start with ollama_
-	availableServers := GetAvailableServers()
-
-	// Short circuit if the only option is built-in
-	if _, ok := availableServers[BuiltinName()]; ok && len(availableServers) == 1 {
-		return []string{BuiltinName()}
-	}
-
-	bestCPUVariant := GetCPUCapability()
-	requestedLib := strings.Split(requested, "_")[0]
-	servers := []string{}
-
-	// exact match first
-	for a := range availableServers {
-		short := a
-		parsed := strings.Split(a, "_")
-		if len(parsed) == 3 {
-			// Strip off optional _avx for comparison
-			short = parsed[0] + "_" + parsed[1]
-		}
-		if a == requested || short == requested {
-			servers = []string{a}
-		}
-	}
-
-	// If no exact match, then try without variant
-	if len(servers) == 0 {
-		alt := []string{}
-		for a := range availableServers {
-			if requestedLib == strings.Split(a, "_")[0] && a != requested {
-				alt = append(alt, a)
-			}
-		}
-		slices.Sort(alt)
-		servers = append(servers, alt...)
-	}
-
-	// Finally append the best CPU option if found, then builtin
-	if bestCPUVariant != CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+bestCPUVariant.String() {
-				servers = append(servers, cmp)
-				break
-			}
-		}
-	}
-	servers = append(servers, BuiltinName())
-	return servers
-}
-
-// Return the optimal server for this CPU architecture
-func ServerForCpu() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return BuiltinName()
-	}
-	variant := GetCPUCapability()
-	availableServers := GetAvailableServers()
-	if variant != CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+variant.String() {
-				return cmp
-			}
-		}
-	}
-	return BuiltinName()
-}
diff --git a/server/routes.go b/server/routes.go
index 2693b767b..cadd312d4 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -33,7 +33,6 @@ import (
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
-	"github.com/ollama/ollama/runners"
 	"github.com/ollama/ollama/server/imageproc"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
@@ -1269,14 +1268,6 @@ func Serve(ln net.Listener) error {
 		done()
 	}()
 
-	// Locate and log what runners are present at startup
-	var runnerNames []string
-	for v := range runners.GetAvailableServers() {
-		runnerNames = append(runnerNames, v)
-	}
-	slog.Info("Dynamic LLM libraries", "runners", runnerNames)
-	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-
 	s.sched.Run(schedCtx)
 
 	// At startup we retrieve GPU information so we can get log messages before loading a model
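
Note (reviewer addition, not part of the patch): with the runner-selection loop removed, NewLlamaServer now picks the subprocess port inline, using the same listen-on-:0 probe with a random-ephemeral-port fallback that the old loop carried. A minimal standalone sketch of that selection logic follows; the helper name pickPort is illustrative only and does not exist in the patch.

package main

import (
	"fmt"
	"math/rand"
	"net"
)

// pickPort mirrors the selection above: ask the kernel for a free TCP port by
// listening on localhost:0, and fall back to a random port in the ephemeral
// range (49152-65535) if that probe fails.
func pickPort() int {
	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
		if l, err := net.ListenTCP("tcp", a); err == nil {
			port := l.Addr().(*net.TCPAddr).Port
			l.Close()
			return port
		}
	}
	return rand.Intn(65535-49152) + 49152
}

func main() {
	fmt.Println("runner port:", pickPort())
}

The random fallback only matters when the probe fails; if the chosen port happens to be taken, the runner will fail to bind and that failure is reported through the server's existing status handling rather than retried, since the retry loop no longer exists.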