server: allow running embed models in parallel
The ability to run embedding models in parallel with other model types was removed in an earlier version of the server because of limitations in its slot loading system. That slot loading system is no longer used, so embedding models can once again run in parallel with chat models.
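As a quick illustration (not part of this commit) of what the change enables from the client side, the sketch below fires an embedding request and a completion request at the same server concurrently. It assumes a server listening on localhost:11434 and placeholder model names all-minilm and llama3; /api/embeddings and /api/generate are the server's standard routes.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"sync"
)

// post sends a JSON body to the given API path and reports the status.
func post(path, body string) {
	resp, err := http.Post("http://localhost:11434"+path,
		"application/json", bytes.NewBufferString(body))
	if err != nil {
		fmt.Println(path, "error:", err)
		return
	}
	defer resp.Body.Close()
	b, _ := io.ReadAll(resp.Body)
	fmt.Println(path, "->", resp.Status, len(b), "bytes")
}

func main() {
	var wg sync.WaitGroup
	wg.Add(2)
	// With this change the embedding model is no longer pinned to a
	// single slot, so both requests can be scheduled in parallel.
	go func() {
		defer wg.Done()
		// all-minilm is a placeholder embedding model name
		post("/api/embeddings", `{"model":"all-minilm","prompt":"hello world"}`)
	}()
	go func() {
		defer wg.Done()
		// llama3 is a placeholder chat model name
		post("/api/generate", `{"model":"llama3","prompt":"hello","stream":false}`)
	}()
	wg.Wait()
}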
parent d8a5d96b98
commit 12a8b00b34
@@ -194,11 +194,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				break
 			}
 
-			// Embedding models should always be loaded with parallel=1
-			if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
-				numParallel = 1
-			}
-
 			// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 			if len(gpus) == 1 && gpus[0].Library == "cpu" {
 				// simplifying assumption of defaultParallel when in CPU mode
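The removed branch keyed on CheckCapabilities returning a non-nil error for models that lack the completion capability, i.e. embedding-only models. Below is a self-contained sketch of that pattern; the types and names are simplified stand-ins for illustration, not the server's actual definitions.

package main

import (
	"errors"
	"fmt"
)

// Capability is a simplified stand-in for the server's capability type.
type Capability string

const CapabilityCompletion Capability = "completion"

// Model is a stand-in holding only the capability set.
type Model struct {
	capabilities map[Capability]bool
}

// CheckCapabilities returns an error if any requested capability is
// missing, which is what the removed `!= nil` test keyed on.
func (m *Model) CheckCapabilities(caps ...Capability) error {
	for _, c := range caps {
		if !m.capabilities[c] {
			return errors.New("model missing capability: " + string(c))
		}
	}
	return nil
}

func main() {
	embed := &Model{capabilities: map[Capability]bool{}}
	chat := &Model{capabilities: map[Capability]bool{CapabilityCompletion: true}}

	// Before this commit, a true result here forced numParallel = 1.
	fmt.Println("embed model embedding-only:", embed.CheckCapabilities(CapabilityCompletion) != nil)
	fmt.Println("chat model embedding-only:", chat.CheckCapabilities(CapabilityCompletion) != nil)
}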