From 1fdb351c37a445fb2e8fdad19fff88f6d85b2912 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 4 Mar 2025 09:03:46 -0800 Subject: [PATCH] New engine: vision models and auto-fallback (#9113) * Include unified vision layers in memory prediction For newer vision models with a single gguf, include the projection estimates. * Adjust CLI to handle both styles of vision model metadata * Wire up new tokenizers for new engine If we're loading the new engine, utilize the new model text processor instead of calling into cgo wrappers for llama.cpp. This also cleans up some tech debt from the older tokenization flow for the C++ server which was no longer used. This also adjusts the grammar handling logic to pass through to the new engine instead of utilizing the cgo schema to grammar call. * Lay foundation for auto selection of new engine --- cmd/cmd.go | 14 +- fs/ggml/ggml.go | 37 +++++ llm/memory.go | 3 + llm/server.go | 278 +++++++++++++++-------------------- model/model.go | 31 ++++ model/model_test.go | 39 +++++ model/models/llama/model.go | 6 + model/models/mllama/model.go | 6 + server/prompt.go | 3 +- server/routes.go | 2 +- 10 files changed, 249 insertions(+), 170 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index b4001a7f3..159de9a6a 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -339,10 +339,16 @@ func RunHandler(cmd *cobra.Command, args []string) error { return err } - // TODO(jessegross): We should either find another way to know if this is - // a vision model or remove the logic. Also consider that other modalities will - // need different behavior anyways. - opts.MultiModal = len(info.ProjectorInfo) != 0 || envconfig.NewEngine() + if len(info.ProjectorInfo) != 0 { + opts.MultiModal = true + } + for k := range info.ModelInfo { + if strings.Contains(k, ".vision.") { + opts.MultiModal = true + break + } + } + opts.ParentModel = info.Details.ParentModel if interactive { diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index b9f9cc178..8662c3b01 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -565,6 +565,43 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO return } +func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { + switch llm.KV().Architecture() { + case "mllama": + for _, layer := range llm.Tensors().GroupLayers()["v"] { + weights += layer.Size() + } + + kv := func(n string) uint64 { + if v, ok := llm.KV()["mllama.vision."+n].(uint32); ok { + return uint64(v) + } + + return 0 + } + + imageSize := kv("image_size") + + maxNumTiles := kv("max_num_tiles") + embeddingLength := kv("embedding_length") + headCount := kv("attention.head_count") + + numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size")) + if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok { + numPatches++ + } + + numPaddedPatches := numPatches + 8 - (numPatches%8)%8 + + graphSize = 4 * (8 + + imageSize*imageSize*kv("num_channels")*maxNumTiles + + embeddingLength*numPatches*maxNumTiles + + 9*embeddingLength*numPaddedPatches*maxNumTiles + + numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) + } + return weights, graphSize +} + // SupportsKVCacheType checks if the requested cache type is supported func (f GGML) SupportsKVCacheType(cacheType string) bool { return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType) diff --git a/llm/memory.go b/llm/memory.go index 1da4d2c08..40104eca9 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -115,6 +115,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, 
projectors []strin // multimodal models require at least 2048 context opts.NumCtx = max(opts.NumCtx, 2048) } + if projectorWeights == 0 && projectorGraph == 0 { + projectorWeights, projectorGraph = f.VisionGraphSize() + } layers := f.Tensors().GroupLayers() // add one layer worth of memory as a buffer diff --git a/llm/server.go b/llm/server.go index fd027a535..09690a5ff 100644 --- a/llm/server.go +++ b/llm/server.go @@ -30,6 +30,7 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llama" + "github.com/ollama/ollama/model" ) type LlamaServer interface { @@ -54,8 +55,15 @@ type llmServer struct { options api.Options numParallel int modelPath string - modelLock sync.Mutex // Temporary until we switch fully to Go server - model *llama.Model // If non-nil, the runner is a new Go server + + // llamaModel is an instance of the cgo llama.cpp model definition + // nil if this server is running the new engine + llamaModel *llama.Model + llamaModelLock sync.Mutex + + // textProcessor handles text encoding/decoding for the model in the Ollama engine + // nil if this server is running the llama.cpp based engine + textProcessor model.TextProcessor estimate MemoryEstimate totalLayers uint64 @@ -89,7 +97,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { // NewLlamaServer will run a server for the given GPUs // The gpu list must be a single family. -func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { +func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { systemInfo := discover.GetSystemInfo() systemTotalMemory := systemInfo.System.TotalMemory systemFreeMemory := systemInfo.System.FreeMemory @@ -130,7 +138,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt slog.Info("offload", "", estimate) params := []string{ - "--model", model, + "--model", modelPath, "--ctx-size", strconv.Itoa(opts.NumCtx), "--batch-size", strconv.Itoa(opts.NumBatch), } @@ -153,11 +161,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt } } - if len(projectors) > 0 { - // TODO: applying multiple projectors is not supported by the llama.cpp server yet - params = append(params, "--mmproj", projectors[0]) - } - defaultThreads := systemInfo.GetOptimalThreadCount() if opts.NumThread > 0 { params = append(params, "--threads", strconv.Itoa(opts.NumThread)) @@ -257,6 +260,34 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt } } slog.Debug("compatible gpu libraries", "compatible", compatible) + exe, err := os.Executable() + if err != nil { + return nil, fmt.Errorf("unable to lookup executable path: %w", err) + } + + if eval, err := filepath.EvalSymlinks(exe); err == nil { + exe = eval + } + + var llamaModel *llama.Model + var textProcessor model.TextProcessor + if envconfig.NewEngine() { + textProcessor, err = model.NewTextProcessor(modelPath) + if err != nil { + // To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner + slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err) + } + } + if textProcessor == nil { + llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true}) + if err != nil { + return nil, err + 
} + } + + if len(projectors) > 0 && llamaModel != nil { + params = append(params, "--mmproj", projectors[0]) + } // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc. // adding each library's respective path to the LD_LIBRARY_PATH, until finally running @@ -275,7 +306,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range } finalParams := []string{"runner"} - if envconfig.NewEngine() { + if textProcessor != nil { + // New engine + // TODO - if we have failure to load scenarios, add logic to retry with the old runner finalParams = append(finalParams, "--ollama-engine") } finalParams = append(finalParams, params...) @@ -315,28 +348,20 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt // finally, add the root library path libraryPaths = append(libraryPaths, discover.LibOllamaPath) - exe, err := os.Executable() - if err != nil { - return nil, fmt.Errorf("unable to lookup executable path: %w", err) - } - - if eval, err := filepath.EvalSymlinks(exe); err == nil { - exe = eval - } - - // TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access s := &llmServer{ - port: port, - cmd: exec.Command(exe, finalParams...), - status: NewStatusWriter(os.Stderr), - options: opts, - modelPath: model, - estimate: estimate, - numParallel: numParallel, - sem: semaphore.NewWeighted(int64(numParallel)), - totalLayers: f.KV().BlockCount() + 1, - gpus: gpus, - done: make(chan error, 1), + port: port, + cmd: exec.Command(exe, finalParams...), + status: NewStatusWriter(os.Stderr), + options: opts, + modelPath: modelPath, + llamaModel: llamaModel, + textProcessor: textProcessor, + estimate: estimate, + numParallel: numParallel, + sem: semaphore.NewWeighted(int64(numParallel)), + totalLayers: f.KV().BlockCount() + 1, + gpus: gpus, + done: make(chan error, 1), } s.cmd.Env = os.Environ() @@ -405,6 +430,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt } err := fmt.Errorf("error starting runner: %v %s", err, msg) if len(compatible) == 0 { + if llamaModel != nil { + llama.FreeModel(llamaModel) + } return nil, err } @@ -701,24 +729,29 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } if len(req.Format) > 0 { - switch string(req.Format) { - case `null`, `""`: - // Field was set, but "missing" a value. We accept - // these as "not set". 
- break - case `"json"`: - request["grammar"] = grammarJSON - default: - if req.Format[0] != '{' { - return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format) - } + format := string(req.Format) + if format != `null` && format != `""` { + if s.textProcessor != nil { + // New engine handles this on the backend + request["format"] = req.Format + } else { + // old engine + switch format { + case `"json"`: + request["grammar"] = grammarJSON + default: + if req.Format[0] != '{' { + return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format) + } - // User provided a JSON schema - g := llama.SchemaToGrammar(req.Format) - if g == nil { - return fmt.Errorf("invalid JSON schema in format") + // User provided a JSON schema + g := llama.SchemaToGrammar(req.Format) + if g == nil { + return fmt.Errorf("invalid JSON schema in format") + } + request["grammar"] = string(g) + } } - request["grammar"] = string(g) } } @@ -933,64 +966,25 @@ type TokenizeResponse struct { } func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) { - s.modelLock.Lock() - defer s.modelLock.Unlock() - if s.model != nil { - return s.model.Tokenize(content, false, true) - } + s.llamaModelLock.Lock() + defer s.llamaModelLock.Unlock() - // Make sure the server is ready - status, err := s.getServerStatus(ctx) - if err != nil { - return nil, err - } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable { - return nil, fmt.Errorf("unexpected server status: %s", status.ToString()) + if s.llamaModel != nil { + return s.llamaModel.Tokenize(content, false, true) } - - data, err := json.Marshal(TokenizeRequest{Content: content}) - if err != nil { - return nil, fmt.Errorf("marshaling encode data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data)) - if err != nil { - return nil, fmt.Errorf("encode request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return nil, fmt.Errorf("do encode request: %w", err) - } - defer resp.Body.Close() - if resp.StatusCode == http.StatusNotFound { - if s.model == nil { - slog.Debug("new runner detected, loading model for cgo tokenization") - m, err := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true}) - if err != nil { - return nil, err - } - s.model = m + if s.textProcessor != nil { + tokens, err := s.textProcessor.Encode(content) + if err != nil { + return nil, err } - return s.model.Tokenize(content, false, true) + toks := make([]int, len(tokens)) + for i, t := range tokens { + toks[i] = int(t) + } + return toks, nil } - - body, err := io.ReadAll(resp.Body) - if err != nil { - return nil, fmt.Errorf("read encode request: %w", err) - } - - if resp.StatusCode >= 400 { - log.Printf("llm encode error: %s", body) - return nil, fmt.Errorf("%s", body) - } - - var encoded TokenizeResponse - if err := json.Unmarshal(body, &encoded); err != nil { - return nil, fmt.Errorf("unmarshal encode response: %w", err) - } - - return encoded.Tokens, nil + // not reached + return nil, fmt.Errorf("no tokenizer configured") } type DetokenizeRequest struct { @@ -1002,80 +996,38 @@ type DetokenizeResponse struct { } func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) { - s.modelLock.Lock() - defer s.modelLock.Unlock() - if s.model != nil { + s.llamaModelLock.Lock() + 
defer s.llamaModelLock.Unlock() + + if s.llamaModel != nil { var resp string for _, token := range tokens { - resp += s.model.TokenToPiece(token) + resp += s.llamaModel.TokenToPiece(token) } return resp, nil } - // Make sure the server is ready - status, err := s.getServerStatus(ctx) - if err != nil { - return "", err - } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable { - return "", fmt.Errorf("unexpected server status: %s", status.ToString()) - } - - data, err := json.Marshal(DetokenizeRequest{Tokens: tokens}) - if err != nil { - return "", fmt.Errorf("marshaling decode data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data)) - if err != nil { - return "", fmt.Errorf("decode request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return "", fmt.Errorf("do decode request: %w", err) - } - defer resp.Body.Close() - if resp.StatusCode == http.StatusNotFound { - if s.model == nil { - slog.Debug("new runner detected, loading model for cgo tokenization") - m, err := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true}) - if err != nil { - return "", err - } - s.model = m + if s.textProcessor != nil { + toks := make([]int32, len(tokens)) + for i, t := range tokens { + toks[i] = int32(t) } - var resp string - for _, token := range tokens { - resp += s.model.TokenToPiece(token) + content, err := s.textProcessor.Decode(toks) + if err != nil { + return "", err } - return resp, nil + return content, nil } - - body, err := io.ReadAll(resp.Body) - if err != nil { - return "", fmt.Errorf("read decode request: %w", err) - } - - if resp.StatusCode >= 400 { - log.Printf("llm decode error: %s", body) - return "", fmt.Errorf("%s", body) - } - - var decoded DetokenizeResponse - if err := json.Unmarshal(body, &decoded); err != nil { - return "", fmt.Errorf("unmarshal encode response: %w", err) - } - - return decoded.Content, nil + // not reached + return "", fmt.Errorf("no tokenizer configured") } func (s *llmServer) Close() error { - s.modelLock.Lock() - if s.model != nil { - llama.FreeModel(s.model) - s.model = nil + s.llamaModelLock.Lock() + if s.llamaModel != nil { + llama.FreeModel(s.llamaModel) + s.llamaModel = nil } - s.modelLock.Unlock() + s.llamaModelLock.Unlock() if s.cmd != nil { slog.Debug("stopping llama server") diff --git a/model/model.go b/model/model.go index 16020b354..f8ed87410 100644 --- a/model/model.go +++ b/model/model.go @@ -16,6 +16,7 @@ import ( _ "golang.org/x/image/tiff" _ "golang.org/x/image/webp" + fs "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" _ "github.com/ollama/ollama/ml/backend" @@ -100,6 +101,36 @@ func New(modelPath string, params ml.BackendParams) (Model, error) { return m, nil } +func NewTextProcessor(s string) (TextProcessor, error) { + r, err := os.Open(s) + if err != nil { + return nil, err + } + defer r.Close() + meta, _, err := fs.Decode(r, -1) + if err != nil { + return nil, err + } + return getTextProcessor(meta.KV()) +} + +func getTextProcessor(kv fs.KV) (TextProcessor, error) { + arch := kv.Architecture() + f, ok := models[arch] + if !ok { + return nil, fmt.Errorf("unsupported model architecture %q", arch) + } + m, err := f(kv) + if err != nil { + return nil, err + } + tp, ok := m.(TextProcessor) + if !ok { + return nil, fmt.Errorf("%v is not a TextProcessor", m) + } + 
return tp, nil +} + func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value { t := v.Type() diff --git a/model/model_test.go b/model/model_test.go index 02b8aa3c2..8761817e0 100644 --- a/model/model_test.go +++ b/model/model_test.go @@ -3,9 +3,11 @@ package model import ( "reflect" "slices" + "strings" "testing" "github.com/google/go-cmp/cmp" + fs "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/backend/ggml" "github.com/ollama/ollama/ml/nn" @@ -134,3 +136,40 @@ func TestPopulateFieldsAlternateName(t *testing.T) { t.Errorf("populateFields() set incorrect values (-want +got):\n%s", diff) } } + +func TestGetTextProcessor(t *testing.T) { + tp, err := getTextProcessor(fs.KV{}) + if err == nil { + t.Error("expected error") + } else if !strings.Contains(err.Error(), "unsupported model architecture") { + t.Errorf("unexpected error: %v", err) + } else if tp != nil { + t.Error("expected nil tp") + } + + models["dummy"] = func(ml.Config) (Model, error) { + return notTextProcessorModel{}, nil + } + tp, err = getTextProcessor(fs.KV{"general.architecture": "dummy"}) + if err == nil { + t.Error("expected error") + } else if !strings.Contains(err.Error(), "not a TextProcessor") { + t.Errorf("unexpected error: %v", err) + } else if tp != nil { + t.Error("expected nil tp") + } +} + +type notTextProcessorModel struct{} + +func (notTextProcessorModel) Forward(ml.Context, Options) (ml.Tensor, error) { + panic("unimplemented") +} + +func (notTextProcessorModel) Backend() ml.Backend { + panic("unimplemented") +} + +func (notTextProcessorModel) Config() config { + panic("unimplemented") +} diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 9bf6f4979..2f254a286 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -1,7 +1,9 @@ package llama import ( + "fmt" "math" + "strings" "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" @@ -29,6 +31,10 @@ type Model struct { } func New(c ml.Config) (model.Model, error) { + if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") { + return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model")) + } + m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 743f4c32d..8fee0cdb0 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -1,6 +1,8 @@ package mllama import ( + "fmt" + "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" @@ -25,6 +27,10 @@ const ( ) func New(c ml.Config) (model.Model, error) { + // Verify unified config + if c.Uint("vision.block_count") == 0 { + return nil, fmt.Errorf("non-unified vision model not supported") + } m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), diff --git a/server/prompt.go b/server/prompt.go index 233dffd69..5b5b958f1 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -10,7 +10,6 @@ import ( "strings" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/template" @@ -93,7 +92,7 @@ 
func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. var imgData llm.ImageData if isMllama { - if envconfig.NewEngine() { + if len(m.ProjectorPaths) == 0 { imgData = llm.ImageData{ ID: len(images), Data: i, diff --git a/server/routes.go b/server/routes.go index f1afad6e0..73e94dc65 100644 --- a/server/routes.go +++ b/server/routes.go @@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { images := make([]llm.ImageData, len(req.Images)) for i := range req.Images { - if isMllama && !envconfig.NewEngine() { + if isMllama && len(model.ProjectorPaths) > 0 { data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i])) if err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
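
For readers tracing the auto-fallback logic this patch adds to NewLlamaServer, the following is a minimal, illustrative sketch (not part of the patch) of how the same selection plays out for a single GGUF file. It reuses only calls that appear in this diff -- model.NewTextProcessor, TextProcessor.Encode, llama.LoadModelFromFile, Model.Tokenize, llama.FreeModel -- and assumes the blank import of model/models is what registers architectures with the new engine; a standalone program like this is hypothetical and only meant to show the decision order.

    // fallback_sketch.go -- illustrative only; mirrors the selection order in llm/server.go
    package main

    import (
        "fmt"
        "log"
        "os"

        "github.com/ollama/ollama/llama"
        "github.com/ollama/ollama/model"
        _ "github.com/ollama/ollama/model/models" // assumed to register architectures via init()
    )

    func main() {
        modelPath := os.Args[1]

        // Try the Ollama engine's text processor first, mirroring the
        // envconfig.NewEngine() branch added in NewLlamaServer. An error here
        // is not fatal: it simply means the architecture is not yet supported.
        if tp, err := model.NewTextProcessor(modelPath); err == nil {
            toks, err := tp.Encode("hello world")
            if err != nil {
                log.Fatal(err)
            }
            fmt.Println("ollama engine tokens:", toks)
            return
        }

        // Compatibility mode: load the cgo llama.cpp model vocab-only,
        // exactly as the patch does when textProcessor stays nil.
        m, err := llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
        if err != nil {
            log.Fatal(err)
        }
        defer llama.FreeModel(m)

        toks, err := m.Tokenize("hello world", false, true)
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println("llama.cpp tokens:", toks)
    }

The vocab-only load in the fallback path matches the patch's intent: the runner subprocess owns the full weights, while the in-process model (or text processor) is kept only for tokenize/detokenize, which is why Close() now frees llamaModel under llamaModelLock.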