Request and model concurrency
This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The defaults are 1 concurrent request per model and 1 loaded model at a time, but these can be adjusted by setting the OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS environment variables.
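As an illustration of those two knobs, here is a minimal, hypothetical Go sketch of reading them with a fallback to the stated default of 1. It is not the scheduler code from this commit (the scheduler lives outside routes.go and is not shown in this diff); the helper name and parsing are assumptions for illustration only.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// envLimit reads a positive integer limit from the environment,
// falling back to 1 (the default described in the commit message).
func envLimit(name string) int {
	if v := os.Getenv(name); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			return n
		}
	}
	return 1
}

func main() {
	numParallel := envLimit("OLLAMA_NUM_PARALLEL")     // concurrent requests per model
	maxLoaded := envLimit("OLLAMA_MAX_LOADED_MODELS")  // models resident at once
	fmt.Println("parallel:", numParallel, "max loaded:", maxLoaded)
}

For example, starting the server with OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_LOADED_MODELS=2 would allow 4 concurrent requests per model and up to 2 loaded models. The diff below shows the routes.go side of the change: handlers become methods on Server and obtain a per-request runner from the scheduler instead of sharing a package-level loaded runner.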
server/routes.go
@@ -15,11 +15,8 @@ import (
 	"os"
 	"os/signal"
 	"path/filepath"
-	"reflect"
-	"runtime"
 	"strconv"
 	"strings"
-	"sync"
 	"syscall"
 	"time"
 
@@ -38,7 +35,8 @@ import (
 var mode string = gin.DebugMode
 
 type Server struct {
-	addr net.Addr
+	addr  net.Addr
+	sched *Scheduler
 }
 
 func init() {
@@ -53,88 +51,8 @@ func init() {
 	gin.SetMode(mode)
 }
 
-var loaded struct {
-	mu sync.Mutex
-
-	llama *llm.LlamaServer
-
-	expireTimer *time.Timer
-
-	model string
-	adapters []string
-	projectors []string
-	*api.Options
-}
-
-var defaultSessionDuration = 5 * time.Minute
-
-func unload() {
-	if loaded.llama != nil {
-		loaded.llama.Close()
-	}
-
-	loaded.llama = nil
-	loaded.model = ""
-	loaded.adapters = nil
-	loaded.projectors = nil
-	loaded.Options = nil
-}
-
-// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
-func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
-	ctx, cancel := context.WithTimeout(c, 10*time.Second)
-	defer cancel()
-
-	needLoad := loaded.llama == nil || // is there a model loaded?
-		loaded.model != model.ModelPath || // has the base model changed?
-		!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
-		loaded.llama.Ping(ctx) != nil
-
-	if needLoad {
-		if loaded.llama != nil {
-			slog.Info("changing loaded model")
-			unload()
-		}
-
-		llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
-		if err != nil {
-			// some older models are not compatible with newer versions of llama.cpp
-			// show a generalized compatibility error until there is a better way to
-			// check for model compatibility
-			if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") {
-				err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName)
-			}
-
-			return err
-		}
-
-		loaded.model = model.ModelPath
-		loaded.adapters = model.AdapterPaths
-		loaded.projectors = model.ProjectorPaths
-		loaded.llama = llama
-		loaded.Options = &opts
-
-		if err = llama.WaitUntilRunning(); err != nil {
-			slog.Error("error loading llama server", "error", err)
-			unload()
-			return err
-		}
-	}
-
-	if loaded.expireTimer == nil {
-		loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
-			loaded.mu.Lock()
-			defer loaded.mu.Unlock()
-			unload()
-		})
-	}
-
-	loaded.expireTimer.Reset(sessionDuration)
-	return nil
-}
-
 func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
 	opts := api.DefaultOptions()
 	if err := opts.FromMap(model.Options); err != nil {
@@ -154,9 +72,7 @@ func isSupportedImageType(image []byte) bool {
 	return slices.Contains(allowedTypes, contentType)
 }
 
-func GenerateHandler(c *gin.Context) {
-	loaded.mu.Lock()
-	defer loaded.mu.Unlock()
+func (s *Server) GenerateHandler(c *gin.Context) {
 
 	checkpointStart := time.Now()
 	var req api.GenerateRequest
@@ -224,7 +140,11 @@ func GenerateHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	var runner *runnerRef
+	select {
+	case runner = <-rCh:
+	case err = <-eCh:
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -275,7 +195,7 @@ func GenerateHandler(c *gin.Context) {
 
 		sb.Reset()
 		if req.Context != nil {
-			prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
+			prev, err := runner.llama.Detokenize(c.Request.Context(), req.Context)
 			if err != nil {
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
@@ -297,9 +217,6 @@ func GenerateHandler(c *gin.Context) {
 		defer close(ch)
 
 		fn := func(r llm.CompletionResponse) {
-			// Update model expiration
-			loaded.expireTimer.Reset(sessionDuration)
-
 			// Build up the full response
 			if _, err := generated.WriteString(r.Content); err != nil {
 				ch <- gin.H{"error": err.Error()}
@@ -331,7 +248,7 @@ func GenerateHandler(c *gin.Context) {
 			}
 
 			// TODO (jmorganca): encode() should not strip special tokens
-			tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
+			tokens, err := runner.llama.Tokenize(c.Request.Context(), p)
 			if err != nil {
 				ch <- gin.H{"error": err.Error()}
 				return
@@ -359,7 +276,7 @@ func GenerateHandler(c *gin.Context) {
 			Images: images,
 			Options: opts,
 		}
-		if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
+		if err := runner.llama.Completion(c.Request.Context(), req, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -421,10 +338,7 @@ func getDefaultSessionDuration() time.Duration {
 	return defaultSessionDuration
 }
 
-func EmbeddingsHandler(c *gin.Context) {
-	loaded.mu.Lock()
-	defer loaded.mu.Unlock()
-
+func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	var req api.EmbeddingRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -469,7 +383,11 @@ func EmbeddingsHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	var runner *runnerRef
+	select {
+	case runner = <-rCh:
+	case err = <-eCh:
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -480,7 +398,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		return
 	}
 
-	embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
+	embedding, err := runner.llama.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
@@ -493,7 +411,7 @@ func EmbeddingsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, resp)
 }
 
-func PullModelHandler(c *gin.Context) {
+func (s *Server) PullModelHandler(c *gin.Context) {
 	var req api.PullRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -542,7 +460,7 @@ func PullModelHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }
 
-func PushModelHandler(c *gin.Context) {
+func (s *Server) PushModelHandler(c *gin.Context) {
 	var req api.PushRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -591,7 +509,7 @@ func PushModelHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }
 
-func CreateModelHandler(c *gin.Context) {
+func (s *Server) CreateModelHandler(c *gin.Context) {
 	var req api.CreateRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -664,7 +582,7 @@ func CreateModelHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }
 
-func DeleteModelHandler(c *gin.Context) {
+func (s *Server) DeleteModelHandler(c *gin.Context) {
 	var req api.DeleteRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -709,7 +627,7 @@ func DeleteModelHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, nil)
 }
 
-func ShowModelHandler(c *gin.Context) {
+func (s *Server) ShowModelHandler(c *gin.Context) {
 	var req api.ShowRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -809,7 +727,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	return resp, nil
 }
 
-func ListModelsHandler(c *gin.Context) {
+func (s *Server) ListModelsHandler(c *gin.Context) {
 	models := make([]api.ModelResponse, 0)
 	manifestsPath, err := GetManifestPath()
 	if err != nil {
@@ -869,7 +787,7 @@ func ListModelsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, api.ListResponse{Models: models})
 }
 
-func CopyModelHandler(c *gin.Context) {
+func (s *Server) CopyModelHandler(c *gin.Context) {
 	var req api.CopyRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -901,7 +819,7 @@ func CopyModelHandler(c *gin.Context) {
 	}
 }
 
-func HeadBlobHandler(c *gin.Context) {
+func (s *Server) HeadBlobHandler(c *gin.Context) {
 	path, err := GetBlobsPath(c.Param("digest"))
 	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
@@ -916,7 +834,7 @@ func HeadBlobHandler(c *gin.Context) {
 	c.Status(http.StatusOK)
 }
 
-func CreateBlobHandler(c *gin.Context) {
+func (s *Server) CreateBlobHandler(c *gin.Context) {
 	path, err := GetBlobsPath(c.Param("digest"))
 	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
@@ -1063,27 +981,27 @@ func (s *Server) GenerateRoutes() http.Handler {
 		allowedHostsMiddleware(s.addr),
 	)
 
-	r.POST("/api/pull", PullModelHandler)
-	r.POST("/api/generate", GenerateHandler)
-	r.POST("/api/chat", ChatHandler)
-	r.POST("/api/embeddings", EmbeddingsHandler)
-	r.POST("/api/create", CreateModelHandler)
-	r.POST("/api/push", PushModelHandler)
-	r.POST("/api/copy", CopyModelHandler)
-	r.DELETE("/api/delete", DeleteModelHandler)
-	r.POST("/api/show", ShowModelHandler)
-	r.POST("/api/blobs/:digest", CreateBlobHandler)
-	r.HEAD("/api/blobs/:digest", HeadBlobHandler)
+	r.POST("/api/pull", s.PullModelHandler)
+	r.POST("/api/generate", s.GenerateHandler)
+	r.POST("/api/chat", s.ChatHandler)
+	r.POST("/api/embeddings", s.EmbeddingsHandler)
+	r.POST("/api/create", s.CreateModelHandler)
+	r.POST("/api/push", s.PushModelHandler)
+	r.POST("/api/copy", s.CopyModelHandler)
+	r.DELETE("/api/delete", s.DeleteModelHandler)
+	r.POST("/api/show", s.ShowModelHandler)
+	r.POST("/api/blobs/:digest", s.CreateBlobHandler)
+	r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
 
 	// Compatibility endpoints
-	r.POST("/v1/chat/completions", openai.Middleware(), ChatHandler)
+	r.POST("/v1/chat/completions", openai.Middleware(), s.ChatHandler)
 
 	for _, method := range []string{http.MethodGet, http.MethodHead} {
 		r.Handle(method, "/", func(c *gin.Context) {
 			c.String(http.StatusOK, "Ollama is running")
 		})
 
-		r.Handle(method, "/api/tags", ListModelsHandler)
+		r.Handle(method, "/api/tags", s.ListModelsHandler)
 		r.Handle(method, "/api/version", func(c *gin.Context) {
 			c.JSON(http.StatusOK, gin.H{"version": version.Version})
 		})
@@ -1137,7 +1055,9 @@ func Serve(ln net.Listener) error {
 		}
 	}
 
-	s := &Server{addr: ln.Addr()}
+	ctx, done := context.WithCancel(context.Background())
+	sched := InitScheduler(ctx)
+	s := &Server{addr: ln.Addr(), sched: sched}
 	r := s.GenerateRoutes()
 
 	slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
@@ -1150,7 +1070,8 @@ func Serve(ln net.Listener) error {
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signals
-		unload()
+		done()
+		sched.unloadAllRunners()
 		gpu.Cleanup()
 		os.Exit(0)
 	}()
@@ -1158,12 +1079,12 @@ func Serve(ln net.Listener) error {
 	if err := llm.Init(); err != nil {
 		return fmt.Errorf("unable to initialize llm library %w", err)
 	}
-	if runtime.GOOS == "linux" { // TODO - windows too
-		// check compatibility to log warnings
-		if _, err := gpu.CheckVRAM(); err != nil {
-			slog.Info(err.Error())
-		}
-	}
+
+	s.sched.Run(ctx)
+
+	// At startup we retrieve GPU information so we can get log messages before loading a model
+	// This will log warnings to the log in case we have problems with detected GPUs
+	_ = gpu.GetGPUInfo()
 
 	return srvr.Serve(ln)
 }
@@ -1219,9 +1140,9 @@ func streamResponse(c *gin.Context, ch chan any) {
 }
 
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
-func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
+func chatPrompt(ctx context.Context, runner *runnerRef, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
-		return loaded.llama.Tokenize(ctx, s)
+		return runner.llama.Tokenize(ctx, s)
 	}
 
 	prompt, err := ChatPrompt(template, messages, numCtx, encode)
@@ -1232,10 +1153,7 @@ func chatPrompt(ctx context.Context, template string, messages []api.Message, nu
 	return prompt, nil
 }
 
-func ChatHandler(c *gin.Context) {
-	loaded.mu.Lock()
-	defer loaded.mu.Unlock()
-
+func (s *Server) ChatHandler(c *gin.Context) {
 	checkpointStart := time.Now()
 
 	var req api.ChatRequest
@@ -1292,7 +1210,11 @@ func ChatHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	var runner *runnerRef
+	select {
+	case runner = <-rCh:
+	case err = <-eCh:
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -1309,7 +1231,7 @@ func ChatHandler(c *gin.Context) {
 		}, req.Messages...)
 	}
 
-	prompt, err := chatPrompt(c.Request.Context(), model.Template, req.Messages, opts.NumCtx)
+	prompt, err := chatPrompt(c.Request.Context(), runner, model.Template, req.Messages, opts.NumCtx)
 	if err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
@@ -1352,8 +1274,6 @@ func ChatHandler(c *gin.Context) {
 		defer close(ch)
 
 		fn := func(r llm.CompletionResponse) {
-			// Update model expiration
-			loaded.expireTimer.Reset(sessionDuration)
 
 			resp := api.ChatResponse{
 				Model: req.Model,
@@ -1376,7 +1296,7 @@ func ChatHandler(c *gin.Context) {
 			ch <- resp
 		}
 
-		if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
+		if err := runner.llama.Completion(c.Request.Context(), llm.CompletionRequest{
 			Prompt: prompt,
 			Format: req.Format,
 			Images: images,