refactor error

parent 5d76e78c2f
commit a562b9069f
@@ -82,7 +82,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
+func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
@@ -218,8 +218,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	// Windows CUDA should not use mmap for best performance
 	// Linux with a model larger than free space, mmap leads to thrashing
+	// For CPU loads we want the memory to be allocated, not FS cache
 	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
 		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+		(gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) ||
 		opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
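Taken together, the condition now disables mmap in four cases: Windows with CUDA, Linux when the model exceeds free memory, pure CPU loads, and an explicit opt-out. A self-contained sketch of that predicate, with a stand-in TriState type since api.TriState's declaration isn't part of this diff:

```go
package main

import (
	"fmt"
	"runtime"
)

// TriState stands in for api.TriState from the diff; the constant
// names below are assumptions, not the repo's actual declarations.
type TriState int

const (
	TriStateUndefined TriState = iota
	TriStateTrue
	TriStateFalse
)

// disableMMap restates the hunk's condition as a predicate: mmap is
// turned off for Windows+CUDA, for Linux when the model is larger
// than free memory, for pure-CPU loads, or on explicit opt-out.
func disableMMap(library string, freeMem, totalSize uint64, useMMap TriState) bool {
	return (runtime.GOOS == "windows" && library == "cuda" && useMMap == TriStateUndefined) ||
		(runtime.GOOS == "linux" && freeMem < totalSize && useMMap == TriStateUndefined) ||
		(library == "cpu" && useMMap == TriStateUndefined) ||
		useMMap == TriStateFalse
}

func main() {
	// A 16 GiB model with only 8 GiB free, CPU-only: no mmap.
	fmt.Println(disableMMap("cpu", 8<<30, 16<<30, TriStateUndefined)) // true
}
```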
@@ -232,15 +234,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
-
-	numParallel := envconfig.NumParallel
-
-	// TODO (jmorganca): multimodal models don't support parallel yet
-	// see https://github.com/ollama/ollama/issues/4165
-	if len(projectors) > 0 {
-		numParallel = 1
-		slog.Warn("multimodal models don't support parallel requests yet")
-	}
 
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	if estimate.TensorSplit != "" {
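The slot-count logic removed here doesn't disappear: with numParallel now a parameter (first hunk), the caller must resolve it before constructing the server. A hypothetical caller-side helper illustrating that division of labor; resolveNumParallel is my name for it, not the commit's:

```go
package main

import (
	"fmt"
	"log/slog"
)

// resolveNumParallel sketches where the deleted logic plausibly moves:
// take the configured value (formerly envconfig.NumParallel) and clamp
// multimodal models to a single slot.
func resolveNumParallel(configured, numProjectors int) int {
	// Multimodal models don't support parallel requests yet,
	// see https://github.com/ollama/ollama/issues/4165
	if numProjectors > 0 {
		slog.Warn("multimodal models don't support parallel requests yet")
		return 1
	}
	return configured
}

func main() {
	fmt.Println(resolveNumParallel(4, 0)) // 4
	fmt.Println(resolveNumParallel(4, 1)) // 1: clamped for multimodal
}
```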
@@ -567,6 +560,9 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
 			}
+			if strings.Contains(msg, "unknown model") {
+				return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade.")
+			}
 			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
 		default:
 		}
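The added branch turns the runner's raw "unknown model" log line into an actionable upgrade hint at the one place the process exit is reported. A minimal sketch of the same mapping in isolation (terminationError is illustrative, not the repo's API):

```go
package main

import (
	"fmt"
	"strings"
)

// terminationError mirrors the logic above: prefer an upgrade hint
// when the runner's last error mentions an unknown model, otherwise
// report the raw failure.
func terminationError(err error, lastErrMsg string) error {
	if strings.Contains(lastErrMsg, "unknown model") {
		return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade.")
	}
	return fmt.Errorf("llama runner process has terminated: %v %s", err, lastErrMsg)
}

func main() {
	err := fmt.Errorf("exit status 1")
	fmt.Println(terminationError(err, `error loading model: unknown model architecture: "qwen3"`))
}
```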
@@ -25,6 +25,7 @@ var errorPrefixes = []string{
 	"CUDA error",
 	"cudaMalloc failed",
 	"\"ERR\"",
+	"architecture",
 }
 
 func (w *StatusWriter) Write(b []byte) (int, error) {
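Adding "architecture" to errorPrefixes routes architecture failures through the same generic scan as CUDA errors, which is what makes the bespoke parsing removed below redundant. A pared-down sketch of that scan; the real StatusWriter has more to it, and only the tail of the prefix list is visible in this hunk:

```go
package main

import (
	"bytes"
	"fmt"
)

// Only the visible tail of the list from the hunk above; earlier
// entries are omitted here.
var errorPrefixes = []string{
	"CUDA error",
	"cudaMalloc failed",
	"\"ERR\"",
	"architecture",
}

// StatusWriter sketches the pattern the diff shows: each write is
// scanned for a known prefix, and everything from the prefix onward
// is kept as the last error message.
type StatusWriter struct {
	LastErrMsg string
}

func (w *StatusWriter) Write(b []byte) (int, error) {
	var errMsg string
	for _, prefix := range errorPrefixes {
		if _, after, ok := bytes.Cut(b, []byte(prefix)); ok {
			errMsg = prefix + string(bytes.TrimSpace(after))
		}
	}
	if errMsg != "" {
		w.LastErrMsg = errMsg
	}
	return len(b), nil
}

func main() {
	w := &StatusWriter{}
	w.Write([]byte(`error loading model: unknown model architecture: "qwen3"`))
	fmt.Println(w.LastErrMsg) // architecture: "qwen3"
}
```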
@@ -34,19 +35,6 @@ func (w *StatusWriter) Write(b []byte) (int, error) {
 			errMsg = prefix + string(bytes.TrimSpace(after))
 		}
 	}
-
-	if bytes.Contains(b, []byte("unknown model architecture")) {
-		if _, after, ok := bytes.Cut(b, []byte("architecture")); ok {
-			errMsg = "error" + string(bytes.TrimSpace(after))
-
-			if before, _, ok := bytes.Cut(after, []byte("llama_load")); ok {
-				errMsg = "error" + string(bytes.TrimSpace(before))
-			}
-
-			errMsg = errMsg + "\nYour current version of Ollama doesn't support this model architecture. Consider upgrading."
-		}
-	}
-
 	if errMsg != "" {
 		w.LastErrMsg = errMsg
 	}
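Net effect of the StatusWriter hunks: instead of reconstructing an "unknown model architecture" message inside Write with nested bytes.Cut calls, the writer simply captures any line containing "architecture" via the prefix list, and the WaitUntilRunning hunk earlier becomes the single place that translates it into the "not supported by your version of Ollama" error.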