diff --git a/build.log b/build.log
deleted file mode 100644
index e69de29bb..000000000
diff --git a/llm/server.go b/llm/server.go
index ad67138b5..8bb845788 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -82,7 +82,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
+func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
@@ -218,8 +218,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	// Windows CUDA should not use mmap for best performance
 	// Linux with a model larger than free space, mmap leads to thrashing
+	// For CPU loads we want the memory to be allocated, not FS cache
 	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
 		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+		(gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) ||
 		opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
@@ -232,15 +234,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	numParallel := envconfig.NumParallel
-
-	// TODO (jmorganca): multimodal models don't support parallel yet
-	// see https://github.com/ollama/ollama/issues/4165
-	if len(projectors) > 0 {
-		numParallel = 1
-		slog.Warn("multimodal models don't support parallel requests yet")
-	}
-
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	if estimate.TensorSplit != "" {
@@ -567,6 +560,9 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
 			}
+			if strings.Contains(msg, "unknown model") {
+				return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade.")
+			}
 			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
 		default:
 		}
diff --git a/llm/status.go b/llm/status.go
index 7fc76621c..0f56b7f99 100644
--- a/llm/status.go
+++ b/llm/status.go
@@ -25,6 +25,7 @@ var errorPrefixes = []string{
 	"CUDA error",
 	"cudaMalloc failed",
 	"\"ERR\"",
+	"architecture",
 }
 
 func (w *StatusWriter) Write(b []byte) (int, error) {
@@ -34,19 +35,6 @@ func (w *StatusWriter) Write(b []byte) (int, error) {
 			errMsg = prefix + string(bytes.TrimSpace(after))
 		}
 	}
-
-	if bytes.Contains(b, []byte("unknown model architecture")) {
-		if _, after, ok := bytes.Cut(b, []byte("architecture")); ok {
-			errMsg = "error" + string(bytes.TrimSpace(after))
-
-			if before, _, ok := bytes.Cut(after, []byte("llama_load")); ok {
-				errMsg = "error" + string(bytes.TrimSpace(before))
-			}
-
-			errMsg = errMsg + "\nYour current version of Ollama doesn't support this model architecture. Consider upgrading."
-		}
-	}
-
 	if errMsg != "" {
 		w.LastErrMsg = errMsg
 	}
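
For context, the two halves of the error-handling change work together: llm/status.go now records any runner output line mentioning "architecture" as the last error message, and llm/server.go maps a captured "unknown model" message to an upgrade hint instead of surfacing the raw runner error. Below is a minimal standalone sketch of that flow; it is simplified (it matches whole lines with bytes.Contains rather than reproducing StatusWriter's prefix-cutting), and lastError, friendlyError, and the sample log line are illustrative only, not part of the Ollama codebase.

```go
package main

import (
	"bytes"
	"errors"
	"fmt"
	"strings"
)

// Subset of the prefixes scanned for in llm/status.go, including the
// newly added "architecture" entry.
var errorPrefixes = []string{
	"CUDA error",
	"cudaMalloc failed",
	"\"ERR\"",
	"architecture",
}

// lastError returns the stderr line if it mentions a known error prefix,
// roughly what StatusWriter.Write does when it records LastErrMsg.
func lastError(line []byte) (string, bool) {
	for _, prefix := range errorPrefixes {
		if bytes.Contains(line, []byte(prefix)) {
			return string(bytes.TrimSpace(line)), true
		}
	}
	return "", false
}

// friendlyError maps a captured runner message to the user-facing error,
// mirroring the new "unknown model" check in WaitUntilRunning.
func friendlyError(msg string) error {
	if strings.Contains(msg, "unknown model") {
		return errors.New("this model is not supported by your version of Ollama. You may need to upgrade.")
	}
	return fmt.Errorf("llama runner process has terminated: %s", msg)
}

func main() {
	// Hypothetical runner output; the exact wording comes from the runner, not this sketch.
	line := []byte("llama_model_load: error loading model: unknown model architecture: 'foo'")
	if msg, ok := lastError(line); ok {
		fmt.Println(friendlyError(msg))
	}
}
```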