no runners

Michael Yang 2024-12-20 13:46:26 -08:00
parent cef3cf353a
commit 67bcb55941
4 changed files with 154 additions and 381 deletions

View File

@@ -719,23 +719,18 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 func LibraryDirs() []string {
 	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
 	// This can be simplified once we no longer carry runners as payloads
-	paths := []string{}
-	appExe, err := os.Executable()
+	exe, err := os.Executable()
 	if err != nil {
 		slog.Warn("failed to lookup executable path", "error", err)
-	} else {
-		appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
-		if _, err := os.Stat(appRelative); err == nil {
-			paths = append(paths, appRelative)
-		}
+		return nil
 	}
-	rDir := runners.Locate()
-	if err != nil {
-		slog.Warn("unable to locate gpu dependency libraries", "error", err)
-	} else {
-		paths = append(paths, filepath.Dir(rDir))
+	lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
+	if _, err := os.Stat(lib); err != nil {
+		return nil
 	}
-	return paths
+	return []string{lib}
 }
 
 func GetSystemInfo() SystemInfo {
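The new LibraryDirs collapses two lookup paths into one: resolve the executable, check a single sibling lib/ollama directory, and return nothing otherwise. A minimal standalone sketch of that pattern (the relative argument stands in for envconfig.LibRelativeToExe(), which is assumed here):

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// libraryDir mirrors the simplified lookup: one candidate directory
// next to the executable, or nothing at all.
func libraryDir(relative string) (string, bool) {
	exe, err := os.Executable()
	if err != nil {
		return "", false
	}
	dir := filepath.Join(filepath.Dir(exe), relative, "lib", "ollama")
	if _, err := os.Stat(dir); err != nil {
		return "", false
	}
	return dir, true
}

func main() {
	if dir, ok := libraryDir(".."); ok {
		fmt.Println("bundled libraries at", dir)
	} else {
		fmt.Println("no bundled library directory")
	}
}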

View File

@@ -30,7 +30,6 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/runners"
 )
 
 type LlamaServer interface {
@@ -91,25 +90,19 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
 func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
-	var err error
-	var cpuRunner string
-	var estimate MemoryEstimate
-	var systemTotalMemory uint64
-	var systemFreeMemory uint64
-	var systemSwapFreeMemory uint64
-
 	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory = systemInfo.System.TotalMemory
-	systemFreeMemory = systemInfo.System.FreeMemory
-	systemSwapFreeMemory = systemInfo.System.FreeSwap
+	systemTotalMemory := systemInfo.System.TotalMemory
+	systemFreeMemory := systemInfo.System.FreeMemory
+	systemSwapFreeMemory := systemInfo.System.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
 		gpus = discover.GetCPUInfo()
 	}
+
+	var estimate MemoryEstimate
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		cpuRunner = runners.ServerForCpu()
 		estimate = EstimateGPULayers(gpus, f, projectors, opts)
 	} else {
 		estimate = EstimateGPULayers(gpus, f, projectors, opts)
@@ -121,7 +114,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		opts.NumGPU = 0
 	case gpus[0].Library != "metal" && estimate.Layers == 0:
 		// Don't bother loading into the GPU if no layers can fit
-		cpuRunner = runners.ServerForCpu()
 		gpus = discover.GetCPUInfo()
 	case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 		opts.NumGPU = estimate.Layers
@@ -141,36 +133,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	slog.Info("offload", "", estimate)
 
-	// Loop through potential servers
-	finalErr := errors.New("no suitable llama servers found")
-
-	availableServers := runners.GetAvailableServers()
-
-	var servers []string
-	if cpuRunner != "" {
-		servers = []string{cpuRunner}
-	} else {
-		servers = runners.ServersForGpu(gpus[0].RunnerName()) // All GPUs in the list are matching Library and Variant
-	}
-	demandLib := envconfig.LLMLibrary()
-	if demandLib != "" {
-		serverPath := availableServers[demandLib]
-		if serverPath == "" {
-			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
-		} else {
-			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
-			servers = []string{demandLib}
-			if strings.HasPrefix(demandLib, "cpu") || (!(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") && demandLib == runners.BuiltinName()) {
-				// Omit the GPU flag to silence the warning
-				opts.NumGPU = -1
-			}
-		}
-	}
-
-	if len(servers) == 0 {
-		return nil, fmt.Errorf("no servers found for %v", gpus)
-	}
-
 	params := []string{
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
@@ -271,18 +233,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		params = append(params, "--multiuser-cache")
 	}
 
-	for i := range servers {
-		builtin := servers[i] == runners.BuiltinName()
-		server := availableServers[servers[i]]
-		if server == "" {
-			// Shouldn't happen
-			finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-			slog.Error("server list inconsistent", "error", finalErr)
-			continue
-		}
-
-		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
-			gpus = discover.GetCPUInfo()
-		}
+	exe, err := os.Executable()
+	if err != nil {
+		return nil, err
 	}
 
 	// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -307,7 +260,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		pathEnv = "PATH"
 	}
 	// Start with the server directory for the LD_LIBRARY_PATH/PATH
-	libraryPaths := []string{filepath.Dir(server)}
+	libraryPaths := []string{filepath.Dir(exe)}
 
 	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
 		// favor our bundled library dependencies over system libraries
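For context, the surrounding code builds the child process's library search path roughly as follows: the executable's own directory first, then whatever was already in the environment. A sketch with assumed details (per the hunk above, pathEnv is LD_LIBRARY_PATH on Linux and PATH on Windows; LD_LIBRARY_PATH is hard-coded here for brevity):

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

func main() {
	exe, err := os.Executable()
	if err != nil {
		panic(err)
	}

	// Bundled libraries first, existing search path after, so directories
	// shipped next to the executable win over system ones.
	libraryPaths := []string{filepath.Dir(exe)}
	if existing, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
		libraryPaths = append(libraryPaths, filepath.SplitList(existing)...)
	}

	fmt.Println("LD_LIBRARY_PATH=" + strings.Join(libraryPaths, string(os.PathListSeparator)))
}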
@@ -324,7 +277,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
 	s := &llmServer{
 		port:      port,
-		cmd:       exec.Command(server, finalParams...),
+		cmd:       exec.Command(exe, finalParams...),
 		status:    NewStatusWriter(os.Stderr),
 		options:   opts,
 		modelPath: model,
@@ -397,16 +350,14 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	if err = s.cmd.Start(); err != nil {
 		// Detect permission denied and augment the message about noexec
 		if errors.Is(err, os.ErrPermission) {
-			finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, server)
-			continue
+			return nil, fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, exe)
 		}
 		msg := ""
 		if s.status != nil && s.status.LastErrMsg != "" {
 			msg = s.status.LastErrMsg
 		}
-		err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
-		finalErr = err
-		continue
+		return nil, fmt.Errorf("error starting the external llama server: %v %s", err, msg)
 	}
 
 	// reap subprocess when it exits
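With only one candidate binary, the accumulate-and-continue error handling above turns into a plain fail-fast return. A self-contained sketch of that shape (a hypothetical condensation, not the literal new code):

package main

import (
	"errors"
	"fmt"
	"os"
	"os/exec"
)

// startOnce mirrors the new fail-fast structure: the first start-up
// error is wrapped and returned immediately; there is no retry loop.
func startOnce(cmd *exec.Cmd) error {
	if err := cmd.Start(); err != nil {
		if errors.Is(err, os.ErrPermission) {
			return fmt.Errorf("unable to start server: %w (is the binary on a noexec mount?)", err)
		}
		return fmt.Errorf("error starting the external llama server: %w", err)
	}
	return nil
}

func main() {
	fmt.Println(startOnce(exec.Command("/does/not/exist")))
}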
@@ -425,10 +376,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	}()
 
 	return s, nil
-	}
-
-	slog.Error("unable to load any llama server", "error", finalErr)
-	return nil, finalErr
 }
 
 type ServerStatus int

View File

@@ -1,17 +1,9 @@
 package runners
 
 import (
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"slices"
-	"strings"
 	"sync"
 
 	"golang.org/x/sys/cpu"
-
-	"github.com/ollama/ollama/envconfig"
 )
 
 var (
@@ -52,155 +44,3 @@ func GetCPUCapability() CPUCapability {
 	// else LCD
 	return CPUCapabilityNone
 }
-
-// Return the location where runners were located
-// empty string indicates only builtin is present
-func Locate() string {
-	once.Do(locateRunnersOnce)
-	return runnersDir
-}
-
-// searches for runners in a prioritized set of locations
-// 1. local build, with executable at the top of the tree
-// 2. lib directory relative to executable
-func locateRunnersOnce() {
-	exe, err := os.Executable()
-	if err != nil {
-		slog.Debug("runner locate", "error", err)
-	}
-	paths := []string{
-		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
-		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
-	}
-	for _, path := range paths {
-		if _, err := os.Stat(path); err == nil {
-			runnersDir = path
-			slog.Debug("runners located", "dir", runnersDir)
-			return
-		}
-	}
-	// Fall back to built-in
-	slog.Debug("no dynamic runners detected, using only built-in")
-	runnersDir = ""
-}
-
-// Return the well-known name of the builtin runner for the given platform
-func BuiltinName() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
-	}
-	return "cpu"
-}
-
-// directory names are the name of the runner and may contain an optional
-// variant prefixed with '_' as the separator. For example, "cuda_v11" and
-// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
-// lowest common denominator
-func GetAvailableServers() map[string]string {
-	once.Do(locateRunnersOnce)
-	servers := make(map[string]string)
-	exe, err := os.Executable()
-	if err == nil {
-		servers[BuiltinName()] = exe
-	}
-	if runnersDir == "" {
-		return servers
-	}
-	// glob runnersDir for files that start with ollama_
-	pattern := filepath.Join(runnersDir, "*", "ollama_*")
-	files, err := filepath.Glob(pattern)
-	if err != nil {
-		slog.Debug("could not glob", "pattern", pattern, "error", err)
-		return nil
-	}
-	for _, file := range files {
-		slog.Debug("availableServers : found", "file", file)
-		runnerName := filepath.Base(filepath.Dir(file))
-		// Special case for our GPU runners - if compiled with standard AVX flag
-		// detect incompatible system
-		// Custom builds will omit this and it's up to the user to ensure compatibility
-		parsed := strings.Split(runnerName, "_")
-		if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
-			slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
-			continue
-		}
-		servers[runnerName] = file
-	}
-	return servers
-}
-
-// ServersForGpu returns a list of compatible servers given the provided GPU library/variant
-func ServersForGpu(requested string) []string {
-	// glob workDir for files that start with ollama_
-	availableServers := GetAvailableServers()
-
-	// Short circuit if the only option is built-in
-	if _, ok := availableServers[BuiltinName()]; ok && len(availableServers) == 1 {
-		return []string{BuiltinName()}
-	}
-
-	bestCPUVariant := GetCPUCapability()
-	requestedLib := strings.Split(requested, "_")[0]
-	servers := []string{}
-
-	// exact match first
-	for a := range availableServers {
-		short := a
-		parsed := strings.Split(a, "_")
-		if len(parsed) == 3 {
-			// Strip off optional _avx for comparison
-			short = parsed[0] + "_" + parsed[1]
-		}
-		if a == requested || short == requested {
-			servers = []string{a}
-		}
-	}
-
-	// If no exact match, then try without variant
-	if len(servers) == 0 {
-		alt := []string{}
-		for a := range availableServers {
-			if requestedLib == strings.Split(a, "_")[0] && a != requested {
-				alt = append(alt, a)
-			}
-		}
-		slices.Sort(alt)
-		servers = append(servers, alt...)
-	}
-
-	// Finally append the best CPU option if found, then builtin
-	if bestCPUVariant != CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+bestCPUVariant.String() {
-				servers = append(servers, cmp)
-				break
-			}
-		}
-	}
-	servers = append(servers, BuiltinName())
-	return servers
-}
-
-// Return the optimal server for this CPU architecture
-func ServerForCpu() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return BuiltinName()
-	}
-	variant := GetCPUCapability()
-	availableServers := GetAvailableServers()
-	if variant != CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+variant.String() {
-				return cmp
-			}
-		}
-	}
-	return BuiltinName()
-}
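The removed helpers all revolve around the naming scheme described in the comment on GetAvailableServers: a library name plus an optional variant joined by an underscore, e.g. cuda_v12 or cpu_avx2 (with a possible trailing _avx for GPU builds, not handled below). A small standalone sketch of the basic parsing convention, for illustration only:

package main

import (
	"fmt"
	"strings"
)

// splitRunnerName splits the old "<library>[_<variant>]" convention;
// a missing variant marks the lowest-common-denominator build.
func splitRunnerName(name string) (library, variant string) {
	library, variant, _ = strings.Cut(name, "_")
	return library, variant
}

func main() {
	for _, n := range []string{"cpu", "cpu_avx2", "cuda_v11", "rocm"} {
		lib, variant := splitRunnerName(n)
		fmt.Printf("%-8s -> library=%s variant=%q\n", n, lib, variant)
	}
}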

View File

@@ -33,7 +33,6 @@ import (
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
-	"github.com/ollama/ollama/runners"
 	"github.com/ollama/ollama/server/imageproc"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
@@ -1269,14 +1268,6 @@ func Serve(ln net.Listener) error {
 		done()
 	}()
 
-	// Locate and log what runners are present at startup
-	var runnerNames []string
-	for v := range runners.GetAvailableServers() {
-		runnerNames = append(runnerNames, v)
-	}
-	slog.Info("Dynamic LLM libraries", "runners", runnerNames)
-	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-
 	s.sched.Run(schedCtx)
 
 	// At startup we retrieve GPU information so we can get log messages before loading a model