num parallel embed

Merge pull request #5512 from ollama/mxyng/detect-stop
autodetect stop parameters from template
2024-07-26 15:18:35 -07:00 · 2024-07-26 13:48:23 -07:00 · 2024-07-25 23:10:18 -04:00 · 2024-07-25 20:27:33 -04:00 · 2024-07-25 16:26:19 -07:00 · 2024-07-25 16:23:40 -07:00
29 changed files with 532 additions and 14 deletions
--- a/README.md
+++ b/README.md
@@ -64,7 +64,8 @@ Here are some example models that can be downloaded:
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
 | Solar              | 10.7B      | 6.1GB | `ollama run solar`             |

-> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
+> [!NOTE]
+> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.

 ## Customize a model

--- a/docs/api.md
+++ b/docs/api.md
@@ -40,6 +40,7 @@ Generate a response for a given prompt with a provided model. This is a streamin

 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `suffix`: the text after the model response
 - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)

 Advanced parameters (optional):
@@ -57,7 +58,8 @@ Advanced parameters (optional):

 Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.

-> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.
+> [!IMPORTANT]
+> It's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.

 ### Examples

@@ -148,8 +150,44 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```

+#### Request (with suffix)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "codellama:code",
+  "prompt": "def compute_gcd(a, b):",
+  "suffix": "    return result",
+  "options": {
+    "temperature": 0
+  },
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "codellama:code",
+  "created_at": "2024-07-22T20:47:51.147561Z",
+  "response": "\n  if a == 0:\n    return b\n  else:\n    return compute_gcd(b % a, a)\n\ndef compute_lcm(a, b):\n  result = (a * b) / compute_gcd(a, b)\n",
+  "done": true,
+  "done_reason": "stop",
+  "context": [...],
+  "total_duration": 1162761250,
+  "load_duration": 6683708,
+  "prompt_eval_count": 17,
+  "prompt_eval_duration": 201222000,
+  "eval_count": 63,
+  "eval_duration": 953997000
+}
+```
+
 #### Request (JSON mode)

+> [!IMPORTANT]
 > When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.

 ##### Request
@@ -380,12 +418,14 @@ Generate the next message in a chat with a provided model. This is a streaming e

 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory
+- `tools`: tools for the model to use if supported. Requires `stream` to be set to `false`

 The `message` object has the following fields:

- `role`: the role of the message, either `system`, `user` or `assistant`
+- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
 - `content`: the content of the message
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
+- `tool_calls` (optional): a list of tools the model wants to use

 Advanced parameters (optional):

@@ -622,6 +662,79 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```

+#### Chat request (with tools)
+
+##### Request
+
+```
+curl http://localhost:11434/api/chat -d '{
+  "model": "mistral",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is the weather today in Paris?"
+    }
+  ],
+  "stream": false,
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather for a location",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The location to get the weather for, e.g. San Francisco, CA"
+            },
+            "format": {
+              "type": "string",
+              "description": "The format to return the weather in, e.g. 'celsius' or 'fahrenheit'",
+              "enum": ["celsius", "fahrenheit"]
+            }
+          },
+          "required": ["location", "format"]
+        }
+      }
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "mistral:7b-instruct-v0.3-q4_K_M",
+  "created_at": "2024-07-22T20:33:28.123648Z",
+  "message": {
+    "role": "assistant",
+    "content": "",
+    "tool_calls": [
+      {
+        "function": {
+          "name": "get_current_weather",
+          "arguments": {
+            "format": "celsius",
+            "location": "Paris, FR"
+          }
+        }
+      }
+    ]
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 885095291,
+  "load_duration": 3753500,
+  "prompt_eval_count": 122,
+  "prompt_eval_duration": 328493000,
+  "eval_count": 33,
+  "eval_duration": 552222000
+}
+```
+
 ## Create a Model

 ```shell
@@ -1173,4 +1286,4 @@ curl http://localhost:11434/api/embeddings -d '{
    0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
  ]
 }
-```
+```
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -1,6 +1,7 @@
 # Ollama Model File

-> Note: `Modelfile` syntax is in development
+> [!NOTE]
+> `Modelfile` syntax is in development

 A model file is the blueprint to create and share models with Ollama.

--- a/docs/openai.md
+++ b/docs/openai.md
@@ -78,8 +78,8 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] Streaming
 - [x] JSON mode
 - [x] Reproducible outputs
+- [x] Tools (streaming support coming soon)
 - [ ] Vision
- [ ] Function calling
 - [ ] Logprobs

 #### Supported request fields
@@ -97,9 +97,9 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
- [ ] `logit_bias`
- [ ] `tools`
+- [x] `tools`
 - [ ] `tool_choice`
+- [ ] `logit_bias`
 - [ ] `user`
 - [ ] `n`

--- a/docs/template.md
+++ b/docs/template.md
@@ -0,0 +1,173 @@
+# Template
+
+Ollama provides a powerful templating engine backed by Go's built-in templating engine to construct prompts for your large language model. This feature is a valuable tool to get the most out of your models.
+
+## Basic Template Structure
+
+A basic Go template consists of three main parts:
+
+* **Layout**: The overall structure of the template.
+* **Variables**: Placeholders for dynamic data that will be replaced with actual values when the template is rendered.
+* **Functions**: Custom functions or logic that can be used to manipulate the template's content.
+
+Here's an example of a simple chat template:
+
+```gotmpl
+{{- range .Messages }}
+{{ .Role }}: {{ .Content }}
+{{- end }}
+```
+
+In this example, we have:
+
+* A basic messages structure (layout)
+* Three variables: `Messages`, `Role`, and `Content` (variables)
+* A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item
+
+## Adding templates to your model
+
+By default, models imported into Ollama have a default template of `{{ .Prompt }}`, i.e. user inputs are sent verbatim to the LLM. This is appropriate for text or code completion models but lacks essential markers for chat or instruction models.
+
+Omitting a template in these models puts the responsibility of correctly templating input onto the user. Adding a template allows users to easily get the best results from the model.
+
+To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
+
+```dockerfile
+FROM llama3
+
+TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>
+{{- end }}
+{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|>
+
+{{ .Content }}<|eot_id|>
+{{- end }}<|start_header_id|>assistant<|end_header_id|>
+
+"""
+```
+
+## Variables
+
+`System` (string): system prompt
+
+`Prompt` (string): user prompt
+
+`Response` (string): assistant response
+
+`Suffix` (string): text inserted after the assistant's response
+
+`Messages` (list): list of messages
+
+`Messages[].Role` (string): role which can be one of `system`, `user`, `assistant`, or `tool`
+
+`Messages[].Content` (string):  message content
+
+`Messages[].ToolCalls` (list): list of tools the model wants to call
+
+`Messages[].ToolCalls[].Function` (object): function to call
+
+`Messages[].ToolCalls[].Function.Name` (string): function name
+
+`Messages[].ToolCalls[].Function.Arguments` (map): mapping of argument name to argument value
+
+`Tools` (list): list of tools the model can access
+
+`Tools[].Type` (string): schema type. `type` is always `function`
+
+`Tools[].Function` (object): function definition
+
+`Tools[].Function.Name` (string): function name
+
+`Tools[].Function.Description` (string): function description
+
+`Tools[].Function.Parameters` (object): function parameters
+
+`Tools[].Function.Parameters.Type` (string): schema type. `type` is always `object`
+
+`Tools[].Function.Parameters.Required` (list): list of required properties
+
+`Tools[].Function.Parameters.Properties` (map): mapping of property name to property definition
+
+`Tools[].Function.Parameters.Properties[].Type` (string): property type
+
+`Tools[].Function.Parameters.Properties[].Description` (string): property description
+
+`Tools[].Function.Parameters.Properties[].Enum` (list): list of valid values
+
+## Tips and Best Practices
+
+Keep the following tips and best practices in mind when working with Go templates:
+
+* **Be mindful of dot**: Control flow structures like `range` and `with` changes the value `.`
+* **Out-of-scope variables**: Use `$.` to reference variables not currently in scope, starting from the root
+* **Whitespace control**: Use `-` to trim leading (`{{-`) and trailing (`-}}`) whitespace
+
+## Examples
+
+### Example Messages
+
+#### ChatML
+
+ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2.
+
+```gotmpl
+{{- if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}
+{{- range .Messages }}<|im_start|>{{ .Role }}
+{{ .Content }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ else }}
+{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+```
+
+### Example Tools
+
+Tools support can be added to a model by adding a `{{ .Tools }}` node to the template. This feature is useful for models trained to call external tools and can a powerful tool for retrieving real-time data or performing complex tasks.
+
+#### Mistral
+
+Mistral v0.3 and Mixtral 8x22B supports tool calling.
+
+```gotmpl
+{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}
+{{- if and (le (len (slice $.Messages $index)) 2) $.Tools }}[AVAILABLE_TOOLS] {{ json $.Tools }}[/AVAILABLE_TOOLS]
+{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}
+
+{{ end }}{{ .Content }}[/INST]
+{{- else if eq .Role "assistant" }}
+{{- if .Content }} {{ .Content }}</s>
+{{- else if .ToolCalls }}[TOOL_CALLS] [
+{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ json .Function.Arguments }}}
+{{- end }}]</s>
+{{- end }}
+{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS]
+{{- end }}
+{{- end }}
+```
+
+### Example Fill-in-Middle
+
+Fill-in-middle support can be added to a model by adding a `{{ .Suffix }}` node to the template. This feature is useful for models that are trained to generate text in the middle of user input, such as code completion models.
+
+#### CodeLlama
+
+CodeLlama [7B](https://ollama.com/library/codellama:7b-code) and [13B](https://ollama.com/library/codellama:13b-code) code completion models support fill-in-middle.
+
+```gotmpl
+<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
+```
+
+> [!NOTE]
+> CodeLlama 34B and 70B code completion and all instruct and Python fine-tuned models do not support fill-in-middle.
+
+#### Codestral
+
+Codestral [22B](https://ollama.com/library/codestral:22b) supports fill-in-middle.
+
+```gotmpl
+[SUFFIX]{{ .Suffix }}[PREFIX] {{ .Prompt }}
+```
--- a/server/download.go
+++ b/server/download.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"log/slog"
 	"math"
+	"math/rand/v2"
 	"net/http"
 	"net/url"
 	"os"
@@ -141,6 +142,32 @@ func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *regis
 	b.err = b.run(ctx, requestURL, opts)
 }

+func newBackoff(maxBackoff time.Duration) func(ctx context.Context) error {
+	var n int
+	return func(ctx context.Context) error {
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+
+		n++
+
+		// n^2 backoff timer is a little smoother than the
+		// common choice of 2^n.
+		d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff)
+		// Randomize the delay between 0.5-1.5 x msec, in order
+		// to prevent accidental "thundering herd" problems.
+		d = time.Duration(float64(d) * (rand.Float64() + 0.5))
+		t := time.NewTimer(d)
+		defer t.Stop()
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-t.C:
+			return nil
+		}
+	}
+}
+
 func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *registryOptions) error {
 	defer blobDownloadManager.Delete(b.Digest)
 	ctx, b.CancelFunc = context.WithCancel(ctx)
@@ -153,6 +180,52 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis

 	_ = file.Truncate(b.Total)

+	directURL, err := func() (*url.URL, error) {
+		ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()
+
+		backoff := newBackoff(10 * time.Second)
+		for {
+			// shallow clone opts to be used in the closure
+			// without affecting the outer opts.
+			newOpts := new(registryOptions)
+			*newOpts = *opts
+
+			newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error {
+				if len(via) > 10 {
+					return errors.New("maxium redirects exceeded (10) for directURL")
+				}
+
+				// if the hostname is the same, allow the redirect
+				if req.URL.Hostname() == requestURL.Hostname() {
+					return nil
+				}
+
+				// stop at the first redirect that is not
+				// the same hostname as the original
+				// request.
+				return http.ErrUseLastResponse
+			}
+
+			resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, nil, nil, newOpts)
+			if err != nil {
+				slog.Warn("failed to get direct URL; backing off and retrying", "err", err)
+				if err := backoff(ctx); err != nil {
+					return nil, err
+				}
+				continue
+			}
+			defer resp.Body.Close()
+			if resp.StatusCode != http.StatusTemporaryRedirect {
+				return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
+			}
+			return resp.Location()
+		}
+	}()
+	if err != nil {
+		return err
+	}
+
 	g, inner := errgroup.WithContext(ctx)
 	g.SetLimit(numDownloadParts)
 	for i := range b.Parts {
@@ -165,7 +238,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 			var err error
 			for try := 0; try < maxRetries; try++ {
 				w := io.NewOffsetWriter(file, part.StartsAt())
-				err = b.downloadChunk(inner, requestURL, w, part, opts)
+				err = b.downloadChunk(inner, directURL, w, part, opts)
 				switch {
 				case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
 					// return immediately if the context is canceled or the device is out of space
--- a/server/images.go
+++ b/server/images.go
@@ -54,6 +54,8 @@ type registryOptions struct {
 	Username string
 	Password string
 	Token    string
+
+	CheckRedirect func(req *http.Request, via []*http.Request) error
 }

 type Model struct {
@@ -1131,7 +1133,9 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 		req.ContentLength = contentLength
 	}

-	resp, err := http.DefaultClient.Do(req)
+	resp, err := (&http.Client{
+		CheckRedirect: regOpts.CheckRedirect,
+	}).Do(req)
 	if err != nil {
 		return nil, err
 	}
--- a/server/model.go
+++ b/server/model.go
@@ -263,13 +263,27 @@ func detectChatTemplate(layers []*layerGGML) ([]*layerGGML, error) {
 			if t, err := template.Named(s); err != nil {
 				slog.Debug("template detection", "error", err)
 			} else {
-				tmpl, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
+				layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
 				if err != nil {
 					return nil, err
 				}

-				tmpl.status = fmt.Sprintf("using autodetected template %s", t.Name)
-				layers = append(layers, &layerGGML{tmpl, nil})
+				layer.status = fmt.Sprintf("using autodetected template %s", t.Name)
+				layers = append(layers, &layerGGML{layer, nil})
+
+				if t.Parameters != nil {
+					var b bytes.Buffer
+					if err := json.NewEncoder(&b).Encode(t.Parameters); err != nil {
+						return nil, err
+					}
+
+					layer, err := NewLayer(&b, "application/vnd.ollama.image.params")
+					if err != nil {
+						return nil, err
+					}
+
+					layers = append(layers, &layerGGML{layer, nil})
+				}
 			}
 		}
 	}
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -599,9 +599,10 @@ func TestCreateDetectTemplate(t *testing.T) {
 		}

 		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
+			filepath.Join(p, "blobs", "sha256-0d79f567714c62c048378f2107fb332dabee0135d080c302d884317da9433cc5"),
 			filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
 			filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"),
-			filepath.Join(p, "blobs", "sha256-f836ee110db21567f826332e4cedd746c06d10664fd5a9ea3659e3683a944510"),
+			filepath.Join(p, "blobs", "sha256-ea34c57ba5b78b740aafe2aeb74dc6507fc3ad14170b64c26a04fb9e36c88d75"),
 		})
 	})

--- a/server/sched.go
+++ b/server/sched.go
@@ -132,6 +132,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
 				numParallel = 1
 				slog.Warn("multimodal models don't support parallel requests yet")
+			} else if strings.Contains(pending.model.Config.ModelFamily, "bert") {
+				numParallel = runtime.NumCPU()
 			}

 			for {
--- a/template/alfred.json
+++ b/template/alfred.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "<start_system>",
+    "<end_message>",
+    "<start_user>",
+    "<start_assistant>"
+  ]
+}
--- a/template/alpaca.json
+++ b/template/alpaca.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "### Instruction:",
+    "### Response"
+  ]
+}
--- a/template/chatml.json
+++ b/template/chatml.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ]
+}
--- a/template/chatqa.json
+++ b/template/chatqa.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "System:",
+    "User:",
+    "Assistant:",
+    "<|begin_of_text|>"
+  ]
+}
--- a/template/codellama-70b-instruct.json
+++ b/template/codellama-70b-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "Source:",
+    "Destination:",
+    "<step>"
+  ]
+}
--- a/template/falcon-instruct.json
+++ b/template/falcon-instruct.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "User:",
+    "Assistant:"
+  ]
+}
--- a/template/gemma-instruct.json
+++ b/template/gemma-instruct.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ]
+}
--- a/template/granite-instruct.json
+++ b/template/granite-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "System:",
+    "Question:",
+    "Answer:"
+  ]
+}
--- a/template/llama2-chat.json
+++ b/template/llama2-chat.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "[INST]",
+    "[/INST]",
+    "<<SYS>>",
+    "<</SYS>>"
+  ]
+}
--- a/template/llama3-instruct.json
+++ b/template/llama3-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "<|start_header_id|>",
+    "<|end_header_id|>",
+    "<|eot_id|>"
+  ]
+}
--- a/template/magicoder.json
+++ b/template/magicoder.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "@@ Instruction",
+    "@@ Response"
+  ]
+}
--- a/template/mistral-instruct.json
+++ b/template/mistral-instruct.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ]
+}
--- a/template/openchat.json
+++ b/template/openchat.json
@@ -0,0 +1,5 @@
+{
+  "stop": [
+    "<|end_of_turn|>"
+  ]
+}
--- a/template/phi-3.json
+++ b/template/phi-3.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "<|end|>",
+    "<|system|>",
+    "<|user|>",
+    "<|assistant|>"
+  ]
+}
--- a/template/solar-instruct.json
+++ b/template/solar-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "### System:",
+    "### User:",
+    "### Assistant"
+  ]
+}
--- a/template/starcoder2-instruct.json
+++ b/template/starcoder2-instruct.json
@@ -0,0 +1,7 @@
+{
+  "stop": [
+    "### Instruction",
+    "### Response",
+    "<|endoftext|>"
+  ]
+}
--- a/template/template.go
+++ b/template/template.go
@@ -23,6 +23,7 @@ import (
 var indexBytes []byte

 //go:embed *.gotmpl
+//go:embed *.json
 var templatesFS embed.FS

 var templatesOnce = sync.OnceValues(func() ([]*named, error) {
@@ -39,6 +40,15 @@ var templatesOnce = sync.OnceValues(func() ([]*named, error) {

 		// normalize line endings
 		t.Bytes = bytes.ReplaceAll(bts, []byte("\r\n"), []byte("\n"))
+
+		params, err := templatesFS.ReadFile(t.Name + ".json")
+		if err != nil {
+			continue
+		}
+
+		if err := json.Unmarshal(params, &t.Parameters); err != nil {
+			return nil, err
+		}
 	}

 	return templates, nil
@@ -48,6 +58,10 @@ type named struct {
 	Name     string `json:"name"`
 	Template string `json:"template"`
 	Bytes    []byte
+
+	Parameters *struct {
+		Stop []string `json:"stop"`
+	}
 }

 func (t named) Reader() io.Reader {
--- a/template/vicuna.json
+++ b/template/vicuna.json
@@ -0,0 +1,6 @@
+{
+  "stop": [
+    "USER:",
+    "ASSISTANT:"
+  ]
+}
--- a/template/zephyr.json
+++ b/template/zephyr.json
@@ -0,0 +1,8 @@
+{
+  "stop": [
+    "<|system|>",
+    "</s>",
+    "<|user|>",
+    "<|assistant|>"
+  ]
+}
Author	SHA1	Message	Date
Roy Han	2647a0e443	num parallel embed	2024-07-26 15:18:35 -07:00
Michael Yang	ec4c35fe99	Merge pull request #5512 from ollama/mxyng/detect-stop autodetect stop parameters from template	2024-07-26 13:48:23 -07:00
Jeffrey Morgan	f5e3939220	Update api.md (#5968 )	2024-07-25 23:10:18 -04:00
Jeffrey Morgan	ae27d9dcfd	Update openai.md	2024-07-25 20:27:33 -04:00
Michael Yang	37096790a7	Merge pull request #5552 from ollama/mxyng/messages-docs docs	2024-07-25 16:26:19 -07:00
Michael Yang	997c903884	Update docs/template.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>	2024-07-25 16:23:40 -07:00
Blake Mizerany	c8af3c2d96	server: reuse original download URL for images (#5962 ) This changes the registry client to reuse the original download URL it gets on the first redirect response for all subsequent requests, preventing thundering herd issues when hot new LLMs are released.	2024-07-25 15:58:30 -07:00
Jeffrey Morgan	455e61170d	Update openai.md	2024-07-25 18:34:47 -04:00
royjhan	4de1370a9d	openai tools doc (#5617 )	2024-07-25 18:34:06 -04:00
Michael Yang	9b60a038e5	update api.md	2024-07-22 13:49:51 -07:00
Michael Yang	83a0cb8d88	docs	2024-07-22 13:38:09 -07:00
Michael Yang	ebc529cbb3	autodetect stop parameters from template	2024-07-12 16:01:23 -07:00