Add done_reason

chore: use errors.New to replace fmt.Errorf will much better (#3789 )
types/model: export IsValidNamePart (#3788 )
2024-04-22 09:30:19 -04:00 · 2024-04-20 22:11:06 -04:00 · 2024-04-20 18:26:34 -07:00 · 2024-04-20 15:17:03 -04:00 · 2024-04-18 16:04:12 -07:00 · 2024-04-18 16:02:08 -07:00
15 changed files with 136 additions and 110 deletions
--- a/14
+++ b/14
@@ -18,7 +18,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
@@ -28,7 +28,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
@@ -40,7 +40,7 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
-RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 RUN mkdir /tmp/scratch && \
    for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
        cp ${dep} /tmp/scratch/ || exit 1 ; \
@@ -64,11 +64,11 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
 RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh

 FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
 ARG CMAKE_VERSION
@@ -84,7 +84,7 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
 RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
-RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh


 # Intermediate stage used for ./scripts/build_linux.sh
--- a/README.md
+++ b/README.md
@@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla

 ## Quickstart

-To run and chat with [Llama 2](https://ollama.com/library/llama2):
+To run and chat with [Llama 3](https://ollama.com/library/llama3):

 ```
-ollama run llama2
+ollama run llama3
 ```

 ## Model library
@@ -49,7 +49,8 @@ Here are some example models that can be downloaded:

 | Model              | Parameters | Size  | Download                       |
 | ------------------ | ---------- | ----- | ------------------------------ |
-| Llama 2            | 7B         | 3.8GB | `ollama run llama2`            |
+| Llama 3            | 8B         | 4.7GB | `ollama run llama3`            |
+| Llama 3            | 70B        | 40GB  | `ollama run llama3:70b`        |
 | Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
 | Dolphin Phi        | 2.7B       | 1.6GB | `ollama run dolphin-phi`       |
 | Phi-2              | 2.7B       | 1.7GB | `ollama run phi`               |
@@ -97,16 +98,16 @@ See the [guide](docs/import.md) on importing models for more information.

 ### Customize a prompt

-Models from the Ollama library can be customized with a prompt. For example, to customize the `llama2` model:
+Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model:

 ```
-ollama pull llama2
+ollama pull llama3
 ```

 Create a `Modelfile`:

 ```
-FROM llama2
+FROM llama3

 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
@@ -141,7 +142,7 @@ ollama create mymodel -f ./Modelfile
 ### Pull a model

 ```
-ollama pull llama2
+ollama pull llama3
 ```

 > This command can also be used to update a local model. Only the diff will be pulled.
@@ -149,13 +150,13 @@ ollama pull llama2
 ### Remove a model

 ```
-ollama rm llama2
+ollama rm llama3
 ```

 ### Copy a model

 ```
-ollama cp llama2 my-llama2
+ollama cp llama3 my-model
 ```

 ### Multiline input
@@ -179,7 +180,7 @@ The image features a yellow smiley face, which is likely the central focus of th
 ### Pass in prompt as arguments

 ```
-$ ollama run llama2 "Summarize this file: $(cat README.md)"
+$ ollama run llama3 "Summarize this file: $(cat README.md)"
 Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
 ```

@@ -226,7 +227,7 @@ Next, start the server:
 Finally, in a separate shell, run a model:

 ```
-./ollama run llama2
+./ollama run llama3
 ```

 ## REST API
@@ -237,7 +238,7 @@ Ollama has a REST API for running and managing models.

 ```
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
  "prompt":"Why is the sky blue?"
 }'
 ```
@@ -246,7 +247,7 @@ curl http://localhost:11434/api/generate -d '{

 ```
 curl http://localhost:11434/api/chat -d '{
-  "model": "mistral",
+  "model": "llama3",
  "messages": [
    { "role": "user", "content": "why is the sky blue?" }
  ]
--- a/api/types.go
+++ b/api/types.go
@@ -2,6 +2,7 @@ package api

 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -97,7 +98,8 @@ type ChatResponse struct {
 	CreatedAt time.Time `json:"created_at"`
 	Message   Message   `json:"message"`

-	Done bool `json:"done"`
+	Done       bool   `json:"done"`
+	DoneReason string `json:"done_reason,omitempty"`

 	Metrics
 }
@@ -264,8 +266,9 @@ type GenerateResponse struct {
 	CreatedAt time.Time `json:"created_at"`
 	Response  string    `json:"response"`

-	Done    bool  `json:"done"`
-	Context []int `json:"context,omitempty"`
+	Done       bool   `json:"done"`
+	DoneReason string `json:"done_reason,omitempty"`
+	Context    []int  `json:"context,omitempty"`

 	Metrics
 }
@@ -307,7 +310,7 @@ func (m *Metrics) Summary() {
 	}
 }

-var ErrInvalidOpts = fmt.Errorf("invalid options")
+var ErrInvalidOpts = errors.New("invalid options")

 func (opts *Options) FromMap(m map[string]interface{}) error {
 	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
--- a/docs/api.md
+++ b/docs/api.md
@@ -90,7 +90,7 @@ The final response in the stream also includes additional data about the generat
 - `load_duration`: time spent in nanoseconds loading the model
 - `prompt_eval_count`: number of tokens in the prompt
 - `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
- `eval_count`: number of tokens the response
+- `eval_count`: number of tokens in the response
 - `eval_duration`: time in nanoseconds spent generating the response
 - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
 - `response`: empty if the response was streamed, if not streamed, this will contain the full response
--- a/docs/tutorials/nvidia-jetson.md
+++ b/docs/tutorials/nvidia-jetson.md
@@ -1,38 +1,15 @@
 # Running Ollama on NVIDIA Jetson Devices

-With some minor configuration, Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/). The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack).
+Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions. 

-NVIDIA Jetson devices are Linux-based embedded AI computers that are purpose-built for AI applications.
-
-Jetsons have an integrated GPU that is wired directly to the memory controller of the machine. For this reason, the `nvidia-smi` command is unrecognized, and Ollama proceeds to operate in "CPU only"
-mode. This can be verified by using a monitoring tool like jtop.
-
-In order to address this, we simply pass the path to the Jetson's pre-installed CUDA libraries into `ollama serve` (while in a tmux session). We then hardcode the num_gpu parameters into a cloned
-version of our target model.
-
-Prerequisites:
-
- curl
- tmux
-
-Here are the steps:
+The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0.

 - Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
- Stop the Ollama service: `sudo systemctl stop ollama`
- Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson 
-'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`
 - Pull the model you want to use (e.g. mistral): `ollama pull mistral`
- Create a new Modelfile specifically for enabling GPU support on the Jetson: `touch ModelfileMistralJetson`
- In the ModelfileMistralJetson file, specify the FROM model and the num_gpu PARAMETER as shown below:
-
-```
-FROM mistral
-PARAMETER num_gpu 999
-```
-
- Create a new model from your Modelfile: `ollama create mistral-jetson -f ./ModelfileMistralJetson`
- Run the new model: `ollama run mistral-jetson`
-
-If you run a monitoring tool like jtop you should now see that Ollama is using the Jetson's integrated GPU.
+- Start an interactive session: `ollama run mistral`

 And that's it!
+
+# Running Ollama in Docker
+
+When running GPU-accelerated applications in Docker, it is highly recommended to use the [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers).
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -57,21 +57,21 @@ init_vars
 git_module_setup
 apply_patches

+init_vars
+if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
+    # Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
+    # Enables optimized Dockerfile builds using a blanket skip and targeted overrides
+    # Static build for linking into the Go binary
+    init_vars
+    CMAKE_TARGETS="--target llama --target ggml"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    BUILD_DIR="../build/linux/${ARCH}_static"
+    echo "Building static library"
+    build
+fi

 init_vars
 if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
-
-    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
-        # Static build for linking into the Go binary
-        init_vars
-        CMAKE_TARGETS="--target llama --target ggml"
-        CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}_static"
-        echo "Building static library"
-        build
-    fi
-
-
    # Users building from source can tune the exact flags we pass to cmake for configuring
    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -17,7 +17,7 @@ import (
 	"github.com/ollama/ollama/gpu"
 )

-var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
+var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")

 func Init() error {
 	payloadsDir, err := gpu.PayloadsDir()
--- a/llm/server.go
+++ b/llm/server.go
@@ -509,10 +509,13 @@ type ImageData struct {
 }

 type completion struct {
-	Content string `json:"content"`
-	Model   string `json:"model"`
-	Prompt  string `json:"prompt"`
-	Stop    bool   `json:"stop"`
+	Content      string `json:"content"`
+	Model        string `json:"model"`
+	Prompt       string `json:"prompt"`
+	Stop         bool   `json:"stop"`
+	StoppedEos   bool   `json:"stopped_eos"`
+	StoppedWord  bool   `json:"stopped_word"`
+	StoppedLimit bool   `json:"stopped_limit"`

 	Timings struct {
 		PredictedN  int     `json:"predicted_n"`
@@ -532,6 +535,7 @@ type CompletionRequest struct {
 type CompletionResponse struct {
 	Content            string
 	Done               bool
+	DoneReason         string
 	PromptEvalCount    int
 	PromptEvalDuration time.Duration
 	EvalCount          int
@@ -648,6 +652,8 @@ func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn
 					return fmt.Errorf("error parsing llm response stream: %s", line)
 				}

+				fmt.Println("c", string(evt))
+
 				var c completion
 				if err := json.Unmarshal(evt, &c); err != nil {
 					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
@@ -674,8 +680,18 @@ func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn
 				}

 				if c.Stop {
+					var doneReason string
+					switch {
+					case c.StoppedEos:
+						doneReason = "stop"
+					case c.StoppedWord:
+						doneReason = "stop"
+					case c.StoppedLimit:
+						doneReason = "limit"
+					}
 					fn(CompletionResponse{
 						Done:               true,
+						DoneReason:         doneReason,
 						PromptEvalCount:    c.Timings.PromptN,
 						PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
 						EvalCount:          c.Timings.PredictedN,
--- a/server/images.go
+++ b/server/images.go
@@ -1137,7 +1137,7 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
 	return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
 }

-var errUnauthorized = fmt.Errorf("unauthorized")
+var errUnauthorized = errors.New("unauthorized")

 func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) {
 	for i := 0; i < 2; i++ {
@@ -1255,7 +1255,7 @@ func parseRegistryChallenge(authStr string) registryChallenge {
 	}
 }

-var errDigestMismatch = fmt.Errorf("digest mismatch, file must be downloaded again")
+var errDigestMismatch = errors.New("digest mismatch, file must be downloaded again")

 func verifyBlob(digest string) error {
 	fp, err := GetBlobsPath(digest)
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -91,7 +91,7 @@ func countTokens(tmpl string, system string, prompt string, response string, enc
 }

 // ChatPrompt builds up a prompt from a series of messages, truncating based on context window size
-func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(string) ([]int, error)) (string, error) {
+func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(string) ([]int, error)) (string, int, error) {
 	type prompt struct {
 		System   string
 		Prompt   string
@@ -138,7 +138,7 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str

 			p.Response = msg.Content
 		default:
-			return "", fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
+			return "", 0, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
 		}
 	}

@@ -151,7 +151,7 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
 	for i, p := range prompts {
 		tokens, err := countTokens(tmpl, p.System, p.Prompt, p.Response, encode)
 		if err != nil {
-			return "", err
+			return "", 0, err
 		}

 		prompts[i].tokens = tokens + len(prompts[i].images)*768
@@ -160,15 +160,17 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
 	// truncate images and prompts starting from the beginning of the list
 	// until either one prompt remains or the total tokens fits the context window
 	// TODO (jmorganca): this doesn't account for the context window room required for the response
+	var required int
 	for {
-		var required int
+		required = 0
 		for _, p := range prompts {
 			required += p.tokens
 		}

 		required += 1 // for bos token

-		if required <= window {
+		// reserve room for generation: the prompt may use at most max(1024, window/2) tokens
+		if required <= max(1024, window/2) {
 			slog.Debug("prompt now fits in context window", "required", required, "window", window)
 			break
 		}
@@ -194,7 +196,7 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str

 				tokens, err := countTokens(tmpl, prompts[0].System, prompts[0].Prompt, prompts[0].Response, encode)
 				if err != nil {
-					return "", err
+					return "", 0, err
 				}

 				prompts[0].tokens = tokens + len(prompts[0].images)*768
@@ -212,10 +214,10 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
 		// last prompt should leave the response unrendered (for completion)
 		rendered, err := Prompt(tmpl, p.System, p.Prompt, p.Response, i == len(prompts)-1)
 		if err != nil {
-			return "", err
+			return "", 0, err
 		}
 		sb.WriteString(rendered)
 	}

-	return sb.String(), nil
+	return sb.String(), required, nil
 }
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -192,7 +192,7 @@ func TestChatPrompt(t *testing.T) {

 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
-			got, err := ChatPrompt(tc.template, tc.messages, tc.window, encode)
+			got, _, err := ChatPrompt(tc.template, tc.messages, tc.window, encode)
 			if err != nil {
 				t.Errorf("error = %v", err)
 			}
--- a/server/routes.go
+++ b/server/routes.go
@@ -234,9 +234,10 @@ func GenerateHandler(c *gin.Context) {
 	// of `raw` mode so we need to check for it too
 	if req.Prompt == "" && req.Template == "" && req.System == "" {
 		c.JSON(http.StatusOK, api.GenerateResponse{
-			CreatedAt: time.Now().UTC(),
-			Model:     req.Model,
-			Done:      true,
+			CreatedAt:  time.Now().UTC(),
+			Model:      req.Model,
+			Done:       true,
+			DoneReason: "load",
 		})
 		return
 	}
@@ -289,6 +290,14 @@ func GenerateHandler(c *gin.Context) {
 		prompt = sb.String()
 	}

+	tokens, err := loaded.llama.Tokenize(c.Request.Context(), prompt)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	opts.NumPredict = max(opts.NumCtx-len(tokens), 0)
+
 	slog.Debug("generate handler", "prompt", prompt)

 	ch := make(chan any)
@@ -307,10 +316,11 @@ func GenerateHandler(c *gin.Context) {
 			}

 			resp := api.GenerateResponse{
-				Model:     req.Model,
-				CreatedAt: time.Now().UTC(),
-				Done:      r.Done,
-				Response:  r.Content,
+				Model:      req.Model,
+				CreatedAt:  time.Now().UTC(),
+				Done:       r.Done,
+				DoneReason: r.DoneReason,
+				Response:   r.Content,
 				Metrics: api.Metrics{
 					PromptEvalCount:    r.PromptEvalCount,
 					PromptEvalDuration: r.PromptEvalDuration,
@@ -1219,17 +1229,17 @@ func streamResponse(c *gin.Context, ch chan any) {
 }

 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
-func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
+func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, int, error) {
 	encode := func(s string) ([]int, error) {
 		return loaded.llama.Tokenize(ctx, s)
 	}

-	prompt, err := ChatPrompt(template, messages, numCtx, encode)
+	prompt, tokens, err := ChatPrompt(template, messages, numCtx, encode)
 	if err != nil {
-		return "", err
+		return "", 0, err
 	}

-	return prompt, nil
+	return prompt, tokens, nil
 }

 func ChatHandler(c *gin.Context) {
@@ -1309,19 +1319,22 @@ func ChatHandler(c *gin.Context) {
 		}, req.Messages...)
 	}

-	prompt, err := chatPrompt(c.Request.Context(), model.Template, req.Messages, opts.NumCtx)
+	prompt, tokens, err := chatPrompt(c.Request.Context(), model.Template, req.Messages, opts.NumCtx)
 	if err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}

+	opts.NumPredict = max(opts.NumCtx-tokens, 0)
+
 	// an empty request loads the model
 	if len(req.Messages) == 0 || prompt == "" {
 		resp := api.ChatResponse{
-			CreatedAt: time.Now().UTC(),
-			Model:     req.Model,
-			Done:      true,
-			Message:   api.Message{Role: "assistant"},
+			CreatedAt:  time.Now().UTC(),
+			Model:      req.Model,
+			Done:       true,
+			DoneReason: "load",
+			Message:    api.Message{Role: "assistant"},
 		}
 		c.JSON(http.StatusOK, resp)
 		return
@@ -1356,10 +1369,11 @@ func ChatHandler(c *gin.Context) {
 			loaded.expireTimer.Reset(sessionDuration)

 			resp := api.ChatResponse{
-				Model:     req.Model,
-				CreatedAt: time.Now().UTC(),
-				Message:   api.Message{Role: "assistant", Content: r.Content},
-				Done:      r.Done,
+				Model:      req.Model,
+				CreatedAt:  time.Now().UTC(),
+				Message:    api.Message{Role: "assistant", Content: r.Content},
+				Done:       r.Done,
+				DoneReason: r.DoneReason,
 				Metrics: api.Metrics{
 					PromptEvalCount:    r.PromptEvalCount,
 					PromptEvalDuration: r.PromptEvalDuration,
--- a/types/model/digest.go
+++ b/types/model/digest.go
@@ -1,6 +1,7 @@
 package model

 import (
+	"fmt"
 	"log/slog"
 	"strings"
 	"unicode"
@@ -47,8 +48,11 @@ var (
 // Digest.
 func ParseDigest(s string) Digest {
 	typ, digest, ok := strings.Cut(s, "-")
+	if !ok {
+		typ, digest, ok = strings.Cut(s, ":")
+	}
 	if ok && isValidDigestType(typ) && isValidHex(digest) {
-		return Digest{s: s}
+		return Digest{s: fmt.Sprintf("%s-%s", typ, digest)}
 	}
 	return Digest{}
 }
--- a/types/model/name.go
+++ b/types/model/name.go
@@ -156,7 +156,7 @@ func ParseName(s, fill string) Name {
 			r = Name{}
 			return false
 		}
-		if kind == PartExtraneous || !isValidPart(kind, part) {
+		if kind == PartExtraneous || !IsValidNamePart(kind, part) {
 			r = Name{}
 			return false
 		}
@@ -176,7 +176,7 @@ func parseMask(s string) Name {
 			// mask part; treat as empty but valid
 			return true
 		}
-		if !isValidPart(kind, part) {
+		if !IsValidNamePart(kind, part) {
 			panic(fmt.Errorf("invalid mask part %s: %q", kind, part))
 		}
 		r.parts[kind] = part
@@ -608,7 +608,7 @@ func ParseNameFromFilepath(s, fill string) Name {
 	var r Name
 	for i := range PartBuild + 1 {
 		part, rest, _ := strings.Cut(s, string(filepath.Separator))
-		if !isValidPart(i, part) {
+		if !IsValidNamePart(i, part) {
 			return Name{}
 		}
 		r.parts[i] = part
@@ -654,9 +654,12 @@ func (r Name) FilepathNoBuild() string {
 	return filepath.Join(r.parts[:PartBuild]...)
 }

-// isValidPart reports if s contains all valid characters for the given
-// part kind.
-func isValidPart(kind PartKind, s string) bool {
+// IsValidNamePart reports if s contains all valid characters for the given
+// part kind and is at most MaxNamePartLen bytes long.
+func IsValidNamePart(kind PartKind, s string) bool {
+	if len(s) > MaxNamePartLen {
+		return false
+	}
 	if s == "" {
 		return false
 	}
--- a/types/model/name_test.go
+++ b/types/model/name_test.go
@@ -105,6 +105,12 @@ var testNames = map[string]fields{
 	strings.Repeat("a", MaxNamePartLen+1): {},
 }

+func TestIsValidNameLen(t *testing.T) {
+	if IsValidNamePart(PartNamespace, strings.Repeat("a", MaxNamePartLen+1)) {
+		t.Errorf("unexpectedly valid long name")
+	}
+}
+
 // TestConsecutiveDots tests that consecutive dots are not allowed in any
 // part, to avoid path traversal. There also are some tests in testNames, but
 // this test is more exhaustive and exists to emphasize the importance of
Author	SHA1	Message	Date
jmorganca	e117483ef6	Add `done_reason`	2024-04-22 09:30:19 -04:00
Cheng	62be2050dd	chore: use errors.New to replace fmt.Errorf will much better (#3789 )	2024-04-20 22:11:06 -04:00
Blake Mizerany	56f8aa6912	types/model: export IsValidNamePart (#3788 )	2024-04-20 18:26:34 -07:00
Sri Siddhaarth	e6f9bfc0e8	Update api.md (#3705 )	2024-04-20 15:17:03 -04:00
Daniel Hiltgen	8d1995c625	Merge pull request #3708 from remy415/arm64static move Ollama static build to its own flag	2024-04-18 16:04:12 -07:00
Daniel Hiltgen	fd01fbf038	Merge pull request #3710 from remy415/update-jetson-docs update jetson tutorial	2024-04-18 16:02:08 -07:00
Blake Mizerany	0408205c1c	types/model: accept former `:` as a separator in digest (#3724 ) This also converges the old sep `:` to the new sep `-`.	2024-04-18 14:17:46 -07:00
Jeffrey Morgan	63a7edd771	Update README.md	2024-04-18 16:09:38 -04:00
Michael	554ffdcce3	add llama3 to readme add llama3 to readme	2024-04-18 15:18:48 -04:00
Jeremy	9850a4ce08	Merge branch 'ollama:main' into update-jetson-docs	2024-04-18 09:55:17 -04:00
Jeremy	fd048f1367	Merge branch 'ollama:main' into arm64static	2024-04-18 09:55:04 -04:00
Jeremy	8dca03173d	Merge remote-tracking branch 'upstream/main' into update-jetson-docs	2024-04-17 16:18:50 -04:00
Jeremy	85bdf14b56	update jetson tutorial	2024-04-17 16:17:42 -04:00
Jeremy	da8a0c7657	Merge branch 'ollama:main' into arm64static	2024-04-17 15:22:34 -04:00
Jeremy	ea4c284a48	Merge branch 'ollama:main' into arm64static	2024-04-17 15:11:38 -04:00
Jeremy	8aec92fa6d	rearranged conditional logic for static build, dockerfile updated	2024-04-17 14:43:28 -04:00
Jeremy	70261b9bb6	move static build to its own flag	2024-04-17 13:04:28 -04:00