From 415c8fcc3deee73e8a11822a962a87c7cb58d938 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 30 Apr 2025 11:26:52 -0700
Subject: [PATCH] Fix "Stopping..." scheduler hang (#10487)

* Adjust initial scheduler refCount

Ensure we only set the refCount on a successful load

* sched: fix lock order inversion deadlock

Under certain race conditions, the scheduler could deadlock while
updating free space information at the same time a model was being
unloaded.
---
 server/sched.go | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 883540cea..8513fbf4b 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -441,10 +441,9 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 			estimatedVRAM:   llama.EstimatedVRAM(),
 			estimatedTotal:  llama.EstimatedTotal(),
 			loading:         true,
-			refCount:        1,
 		}
 		runner.numParallel = numParallel
-		runner.refMu.Lock()
+		runner.refMu.Lock() // hold lock until running or aborted
 
 		s.loadedMu.Lock()
 		s.loaded[req.model.ModelPath] = runner
@@ -455,13 +454,13 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		defer runner.refMu.Unlock()
 		if err = llama.WaitUntilRunning(req.ctx); err != nil {
 			slog.Error("error loading llama server", "error", err)
-			runner.refCount--
 			req.errCh <- err
 			slog.Debug("triggering expiration for failed load", "model", runner.modelPath)
 			s.expiredCh <- runner
 			return
 		}
 		slog.Debug("finished setting up runner", "model", req.model.ModelPath)
+		runner.refCount++
 		runner.loading = false
 		go func() {
 			<-req.ctx.Done()
@@ -479,7 +478,12 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	}
 	predMap := map[predKey]uint64{} // Sum up the total predicted usage per GPU for all runners
 	s.loadedMu.Lock()
+	runners := make([]*runnerRef, 0, len(s.loaded))
 	for _, r := range s.loaded {
+		runners = append(runners, r)
+	}
+	s.loadedMu.Unlock()
+	for _, r := range runners {
 		r.refMu.Lock()
 		if r.llama != nil {
 			for _, gpu := range allGpus {
@@ -490,7 +494,6 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 		}
 		r.refMu.Unlock()
 	}
-	s.loadedMu.Unlock()
 
 	// Now that we've summed up all the GPU usage predictions across all the loaded runners, update the gpu list
 	for i := range allGpus {
@@ -537,10 +540,8 @@ func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList)
 
 // TODO consolidate sched_types.go
 type runnerRef struct {
-	refMu sync.Mutex
-	// refCond   sync.Cond // Signaled on transition from 1 -> 0 refCount
+	refMu    sync.Mutex
 	refCount uint // prevent unloading if > 0
-	// unloading bool // set to true when we are trying to unload the runner
 
 	llama   llm.LlamaServer
 	loading bool // True only during initial load, then false forever
@@ -811,8 +812,8 @@ func (s *Scheduler) unloadAllRunners() {
 
 func (s *Scheduler) expireRunner(model *Model) {
 	s.loadedMu.Lock()
-	defer s.loadedMu.Unlock()
 	runner, ok := s.loaded[model.ModelPath]
+	s.loadedMu.Unlock()
 	if ok {
 		runner.refMu.Lock()
 		runner.expiresAt = time.Now()
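
A minimal, self-contained Go sketch of the first change, for anyone who
wants to see the refCount ordering in isolation: the runner now starts at
refCount 0 and is pinned only after the subprocess reports running, so the
failure path no longer has to roll a pre-set count back down. The runnerRef
below is a simplified stand-in, and waitUntilRunning stands in for
llama.WaitUntilRunning; this is not the real scheduler code.

package main

import (
	"errors"
	"fmt"
	"sync"
)

type runnerRef struct {
	refMu    sync.Mutex
	refCount uint
	loading  bool
}

// finishLoad pins the runner only once it is actually running. Before
// this patch, refCount started at 1 and the error path decremented it;
// starting at 0 and incrementing on success removes that bookkeeping.
func finishLoad(r *runnerRef, waitUntilRunning func() error) error {
	r.refMu.Lock()
	defer r.refMu.Unlock()
	if err := waitUntilRunning(); err != nil {
		return err // refCount is still 0: the expire path can unload immediately
	}
	r.refCount++ // hold a reference only after a successful start
	r.loading = false
	return nil
}

func main() {
	ok := &runnerRef{loading: true}
	_ = finishLoad(ok, func() error { return nil })
	fmt.Println("success path refCount:", ok.refCount) // 1

	bad := &runnerRef{loading: true}
	_ = finishLoad(bad, func() error { return errors.New("spawn failed") })
	fmt.Println("failure path refCount:", bad.refCount) // 0
}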
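
Likewise, a sketch of the lock order inversion behind the second change,
using the same kind of simplified stand-in types: updateFreeSpace used to
hold loadedMu while taking each runner's refMu, while the unload/expire
path takes refMu first and then wants loadedMu, a classic ABBA deadlock.
The fix snapshots the runner pointers under loadedMu and releases it
before touching any refMu.

package main

import (
	"fmt"
	"sync"
)

type runnerRef struct {
	refMu    sync.Mutex
	refCount uint
	vramUsed uint64
}

type Scheduler struct {
	loadedMu sync.Mutex
	loaded   map[string]*runnerRef
}

// sumPredictedUsage shows the fixed ordering: snapshot the map under
// loadedMu, release it, then take each runner's refMu one at a time.
func (s *Scheduler) sumPredictedUsage() uint64 {
	s.loadedMu.Lock()
	runners := make([]*runnerRef, 0, len(s.loaded))
	for _, r := range s.loaded {
		runners = append(runners, r)
	}
	s.loadedMu.Unlock() // drop loadedMu before touching any refMu

	var total uint64
	for _, r := range runners {
		r.refMu.Lock()
		total += r.vramUsed
		r.refMu.Unlock()
	}
	return total
}

// remove mimics the opposite lock order used when unloading: refMu
// first, then loadedMu. With the old code (loadedMu held across the
// refMu loop above), these two paths could each hold the lock the
// other needed and neither would ever make progress.
func (s *Scheduler) remove(name string, r *runnerRef) {
	r.refMu.Lock()
	defer r.refMu.Unlock()
	if r.refCount == 0 {
		s.loadedMu.Lock()
		delete(s.loaded, name)
		s.loadedMu.Unlock()
	}
}

func main() {
	r := &runnerRef{vramUsed: 4 << 30}
	s := &Scheduler{loaded: map[string]*runnerRef{"m": r}}

	var wg sync.WaitGroup
	wg.Add(2)
	go func() { defer wg.Done(); fmt.Println("predicted bytes:", s.sumPredictedUsage()) }()
	go func() { defer wg.Done(); s.remove("m", r) }()
	wg.Wait()
}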