server: allow running embed models in parallel
The ability to run embedding models in parallel with other model types was removed in an earlier version of the server because of limitations in its slot loading system. That slot loading system is no longer used, so embedding models can once again run in parallel with chat models.
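As a quick illustration (not part of this commit) of what the change enables from the client side, the sketch below fires an embedding request and a completion request at the same server concurrently. It assumes a server listening on localhost:11434 and placeholder model names all-minilm and llama3; /api/embeddings and /api/generate are the server's standard routes.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"sync"
)

// post sends a JSON body to the given API path and reports the status.
func post(path, body string) {
	resp, err := http.Post("http://localhost:11434"+path,
		"application/json", bytes.NewBufferString(body))
	if err != nil {
		fmt.Println(path, "error:", err)
		return
	}
	defer resp.Body.Close()
	b, _ := io.ReadAll(resp.Body)
	fmt.Println(path, "->", resp.Status, len(b), "bytes")
}

func main() {
	var wg sync.WaitGroup
	wg.Add(2)
	// With this change the embedding model is no longer pinned to a
	// single slot, so both requests can be scheduled in parallel.
	go func() {
		defer wg.Done()
		// all-minilm is a placeholder embedding model name
		post("/api/embeddings", `{"model":"all-minilm","prompt":"hello world"}`)
	}()
	go func() {
		defer wg.Done()
		// llama3 is a placeholder chat model name
		post("/api/generate", `{"model":"llama3","prompt":"hello","stream":false}`)
	}()
	wg.Wait()
}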
parent d8a5d96b98
commit 12a8b00b34
@@ -194,11 +194,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				break
 			}
 
-			// Embedding models should always be loaded with parallel=1
-			if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
-				numParallel = 1
-			}
-
 			// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 			if len(gpus) == 1 && gpus[0].Library == "cpu" {
 				// simplifying assumption of defaultParallel when in CPU mode
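The removed branch keyed on CheckCapabilities returning a non-nil error for models that lack the completion capability, i.e. embedding-only models. Below is a self-contained sketch of that pattern; the types and names are simplified stand-ins for illustration, not the server's actual definitions.

package main

import (
	"errors"
	"fmt"
)

// Capability is a simplified stand-in for the server's capability type.
type Capability string

const CapabilityCompletion Capability = "completion"

// Model is a stand-in holding only the capability set.
type Model struct {
	capabilities map[Capability]bool
}

// CheckCapabilities returns an error if any requested capability is
// missing, which is what the removed `!= nil` test keyed on.
func (m *Model) CheckCapabilities(caps ...Capability) error {
	for _, c := range caps {
		if !m.capabilities[c] {
			return errors.New("model missing capability: " + string(c))
		}
	}
	return nil
}

func main() {
	embed := &Model{capabilities: map[Capability]bool{}}
	chat := &Model{capabilities: map[Capability]bool{CapabilityCompletion: true}}

	// Before this commit, a true result here forced numParallel = 1.
	fmt.Println("embed model embedding-only:", embed.CheckCapabilities(CapabilityCompletion) != nil)
	fmt.Println("chat model embedding-only:", chat.CheckCapabilities(CapabilityCompletion) != nil)
}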