Compare commits

..

6 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| ParthSareen | 2536ffe0ab | More cleanup | 2024-12-11 18:11:00 -08:00 |
| ParthSareen | 97abd7bfea | Code cleanup | 2024-12-11 18:04:16 -08:00 |
| Anuraag Agrawal | c6509bf76e | Merge branch 'main' of https://github.com/ollama/ollama into openai-stream-usage | 2024-12-06 12:05:25 +09:00 |
| Anuraag Agrawal | 7355ab3703 | Return empty choices on usage chunk | 2024-10-03 13:02:50 +09:00 |
| Anuraag Agrawal | 7ed81437fe | Document stream_options | 2024-09-17 15:25:31 +09:00 |
| Anuraag Agrawal | 220108d3f4 | openai: support include_usage stream option to return final usage chunk | 2024-09-13 12:32:05 +09:00 |
8 changed files with 179 additions and 192 deletions
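
The feature at the center of this compare is OpenAI-style `stream_options`: a streaming request that sets `{"include_usage": true}` gets one extra, final chunk reporting token usage. A minimal sketch of how a client would exercise it, assuming a recent `openai` Python package, the local OpenAI-compatible endpoint described in `docs/openai.md`, and a locally pulled model (`llama3.1` is only a placeholder):

```py
from openai import OpenAI

# Ollama's OpenAI-compatible endpoint; the api_key is required by the client
# but ignored by the server.
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

stream = client.chat.completions.create(
    model="llama3.1",  # placeholder; any locally pulled model
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    stream=True,
    stream_options={"include_usage": True},  # request the final usage chunk
)

for chunk in stream:
    if chunk.choices:
        # normal chunks carry content deltas
        print(chunk.choices[0].delta.content or "", end="")
    else:
        # the extra final chunk: empty choices, populated usage
        print("\nusage:", chunk.usage)
```

As the diff below shows, the usage arrives in a chunk whose `choices` array is empty, emitted just before `data: [DONE]`.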

View File

@@ -49,12 +49,12 @@ Here are some example models that can be downloaded:
 | Model              | Parameters | Size  | Download                         |
 | ------------------ | ---------- | ----- | -------------------------------- |
-| Llama 3.3          | 70B        | 43GB  | `ollama run llama3.3`            |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
 | Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
 | Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
 | Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
 | Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
 | Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`         |

View File

@@ -1036,10 +1036,6 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
     return nil
 }
-    if opts.Format == "json" {
-        opts.Format = `"` + opts.Format + `"`
-    }
     req := &api.ChatRequest{
         Model:    opts.Model,
         Messages: opts.Messages,
@@ -1125,10 +1121,6 @@ func generate(cmd *cobra.Command, opts runOptions) error {
         }
     }
-    if opts.Format == "json" {
-        opts.Format = `"` + opts.Format + `"`
-    }
     request := api.GenerateRequest{
         Model:  opts.Model,
         Prompt: opts.Prompt,

View File

@@ -45,7 +45,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 Advanced parameters (optional):
-- `format`: the format to return a response in. Format can be `json` or a JSON schema
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system message to (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
@@ -54,10 +54,6 @@ Advanced parameters (optional):
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
-#### Structured outputs
-Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
 #### JSON mode
 Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
@@ -189,52 +185,6 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```
-#### Request (Structured outputs)
-##### Request
-```shell
-curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{
-  "model": "llama3.1:8b",
-  "prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON",
-  "stream": false,
-  "format": {
-    "type": "object",
-    "properties": {
-      "age": {
-        "type": "integer"
-      },
-      "available": {
-        "type": "boolean"
-      }
-    },
-    "required": [
-      "age",
-      "available"
-    ]
-  }
-}'
-```
-##### Response
-```json
-{
-  "model": "llama3.1:8b",
-  "created_at": "2024-12-06T00:48:09.983619Z",
-  "response": "{\n \"age\": 22,\n \"available\": true\n}",
-  "done": true,
-  "done_reason": "stop",
-  "context": [1, 2, 3],
-  "total_duration": 1075509083,
-  "load_duration": 567678166,
-  "prompt_eval_count": 28,
-  "prompt_eval_duration": 236000000,
-  "eval_count": 16,
-  "eval_duration": 269000000
-}
-```
 #### Request (JSON mode)
 > [!IMPORTANT]
@@ -506,15 +456,11 @@ The `message` object has the following fields:
 Advanced parameters (optional):
-- `format`: the format to return a response in. Format can be `json` or a JSON schema.
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
-### Structured outputs
-Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
 ### Examples
 #### Chat Request (Streaming)
@@ -605,54 +551,6 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
-#### Chat request (Structured outputs)
-##### Request
-```shell
-curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
-  "model": "llama3.1",
-  "messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}],
-  "stream": false,
-  "format": {
-    "type": "object",
-    "properties": {
-      "age": {
-        "type": "integer"
-      },
-      "available": {
-        "type": "boolean"
-      }
-    },
-    "required": [
-      "age",
-      "available"
-    ]
-  },
-  "options": {
-    "temperature": 0
-  }
-}'
-```
-##### Response
-```json
-{
-  "model": "llama3.1",
-  "created_at": "2024-12-06T00:46:58.265747Z",
-  "message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" },
-  "done_reason": "stop",
-  "done": true,
-  "total_duration": 2254970291,
-  "load_duration": 574751416,
-  "prompt_eval_count": 34,
-  "prompt_eval_duration": 1502000000,
-  "eval_count": 12,
-  "eval_duration": 175000000
-}
-```
 #### Chat request (With History)
 Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
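
A minimal sketch of the multi-turn pattern described above against the native `/api/chat` endpoint, assuming the third-party `requests` package and a locally pulled model (the model name is a placeholder):

```py
import requests

# Earlier user/assistant turns are simply replayed in the messages array.
history = [
    {"role": "user", "content": "why is the sky blue?"},
    {"role": "assistant", "content": "due to rayleigh scattering."},
    {"role": "user", "content": "how is that different than mie scattering?"},
]

resp = requests.post(
    "http://localhost:11434/api/chat",
    json={"model": "llama3.2", "messages": history, "stream": False},
)
print(resp.json()["message"]["content"])
```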

View File

@@ -59,40 +59,6 @@ embeddings = client.embeddings.create(
     input=["why is the sky blue?", "why is the grass green?"],
 )
 ```
-#### Structured outputs
-```py
-from pydantic import BaseModel
-from openai import OpenAI
-client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
-# Define the schema for the response
-class FriendInfo(BaseModel):
-    name: str
-    age: int
-    is_available: bool
-class FriendList(BaseModel):
-    friends: list[FriendInfo]
-try:
-    completion = client.beta.chat.completions.parse(
-        temperature=0,
-        model="llama3.1:8b",
-        messages=[
-            {"role": "user", "content": "I have two friends. The first is Ollama 22 years old busy saving the world, and the second is Alonso 23 years old and wants to hang out. Return a list of friends in JSON format"}
-        ],
-        response_format=FriendList,
-    )
-    friends_response = completion.choices[0].message
-    if friends_response.parsed:
-        print(friends_response.parsed)
-    elif friends_response.refusal:
-        print(friends_response.refusal)
-except Exception as e:
-    print(f"Error: {e}")
-```
 ### OpenAI JavaScript library
@@ -233,6 +199,8 @@ curl http://localhost:11434/v1/embeddings \
 - [x] `seed`
 - [x] `stop`
 - [x] `stream`
+- [x] `stream_options`
+  - [x] `include_usage`
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
@@ -261,6 +229,8 @@ curl http://localhost:11434/v1/embeddings \
 - [x] `seed`
 - [x] `stop`
 - [x] `stream`
+- [x] `stream_options`
+  - [x] `include_usage`
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
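
The checklist above marks `stream_options` / `include_usage` for the legacy `/v1/completions` endpoint as well. A minimal sketch of the equivalent call, assuming the installed `openai` package exposes `stream_options` on `completions.create` and a locally pulled model (the model name is a placeholder):

```py
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

stream = client.completions.create(
    model="llama3.1",  # placeholder
    prompt="Say this is a test",
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].text, end="")
    else:
        # the final, usage-only chunk
        print("\nusage:", chunk.usage)
```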

View File

@@ -1,14 +1,3 @@
 # Examples
 This directory contains different examples of using Ollama.
-## Python examples
-Ollama Python examples at [ollama-python/examples](https://github.com/ollama/ollama-python/tree/main/examples)
-## JavaScript examples
-Ollama JavaScript examples at [ollama-js/examples](https://github.com/ollama/ollama-js/tree/main/examples)
-## OpenAI compatibility examples
-Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md)

View File

@@ -746,7 +746,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
     // TODO (parthsareen): Move conversion to grammar with sampling logic
     // API should do error handling for invalid formats
-    if req.Format != nil && strings.TrimSpace(string(req.Format)) != "null" {
+    if req.Format != nil {
         if strings.ToLower(strings.TrimSpace(string(req.Format))) == `"json"` {
             request["grammar"] = jsonGrammar
             if !strings.Contains(strings.ToLower(req.Prompt), "json") {

View File

@@ -75,10 +75,15 @@ type EmbedRequest struct {
     Model string `json:"model"`
 }

+type StreamOptions struct {
+    IncludeUsage bool `json:"include_usage"`
+}
+
 type ChatCompletionRequest struct {
     Model string `json:"model"`
     Messages []Message `json:"messages"`
     Stream bool `json:"stream"`
+    StreamOptions *StreamOptions `json:"stream_options"`
     MaxTokens *int `json:"max_tokens"`
     Seed *int `json:"seed"`
     Stop any `json:"stop"`
@@ -107,6 +112,7 @@ type ChatCompletionChunk struct {
     Model string `json:"model"`
     SystemFingerprint string `json:"system_fingerprint"`
     Choices []ChunkChoice `json:"choices"`
+    Usage *Usage `json:"usage,omitempty"`
 }

 // TODO (https://github.com/ollama/ollama/issues/5259): support []string, []int and [][]int
@@ -119,6 +125,7 @@ type CompletionRequest struct {
     Seed *int `json:"seed"`
     Stop any `json:"stop"`
     Stream bool `json:"stream"`
+    StreamOptions *StreamOptions `json:"stream_options"`
     Temperature *float32 `json:"temperature"`
     TopP float32 `json:"top_p"`
     Suffix string `json:"suffix"`
@@ -141,6 +148,7 @@ type CompletionChunk struct {
     Choices []CompleteChunkChoice `json:"choices"`
     Model string `json:"model"`
     SystemFingerprint string `json:"system_fingerprint"`
+    Usage *Usage `json:"usage,omitempty"`
 }

 type ToolCall struct {
@@ -197,6 +205,14 @@ func NewError(code int, message string) ErrorResponse {
     return ErrorResponse{Error{Type: etype, Message: message}}
 }

+func toUsage(r api.ChatResponse) Usage {
+    return Usage{
+        PromptTokens: r.PromptEvalCount,
+        CompletionTokens: r.EvalCount,
+        TotalTokens: r.PromptEvalCount + r.EvalCount,
+    }
+}
+
 func toolCallId() string {
     const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
     b := make([]byte, 8)
@@ -246,11 +262,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
                 return nil
             }(r.DoneReason),
         }},
-        Usage: Usage{
-            PromptTokens: r.PromptEvalCount,
-            CompletionTokens: r.EvalCount,
-            TotalTokens: r.PromptEvalCount + r.EvalCount,
-        },
+        Usage: toUsage(r),
     }
 }
@@ -275,6 +287,14 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
     }
 }

+func toUsageGenerate(r api.GenerateResponse) Usage {
+    return Usage{
+        PromptTokens: r.PromptEvalCount,
+        CompletionTokens: r.EvalCount,
+        TotalTokens: r.PromptEvalCount + r.EvalCount,
+    }
+}
+
 func toCompletion(id string, r api.GenerateResponse) Completion {
     return Completion{
         Id: id,
@@ -292,11 +312,7 @@ func toCompletion(id string, r api.GenerateResponse) Completion {
                 return nil
             }(r.DoneReason),
         }},
-        Usage: Usage{
-            PromptTokens: r.PromptEvalCount,
-            CompletionTokens: r.EvalCount,
-            TotalTokens: r.PromptEvalCount + r.EvalCount,
-        },
+        Usage: toUsageGenerate(r),
     }
 }
@@ -571,12 +587,14 @@ type BaseWriter struct {
 type ChatWriter struct {
     stream bool
+    streamOptions *StreamOptions
     id string
     BaseWriter
 }

 type CompleteWriter struct {
     stream bool
+    streamOptions *StreamOptions
     id string
     BaseWriter
 }
@@ -620,7 +638,11 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
     // chat chunk
     if w.stream {
-        d, err := json.Marshal(toChunk(w.id, chatResponse))
+        c := toChunk(w.id, chatResponse)
+        if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+            c.Usage = &Usage{}
+        }
+        d, err := json.Marshal(c)
         if err != nil {
             return 0, err
         }
@@ -632,6 +654,17 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
         }

         if chatResponse.Done {
+            if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+                u := toUsage(chatResponse)
+                d, err := json.Marshal(ChatCompletionChunk{Choices: []ChunkChoice{}, Usage: &u})
+                if err != nil {
+                    return 0, err
+                }
+                _, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
+                if err != nil {
+                    return 0, err
+                }
+            }
             _, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
             if err != nil {
                 return 0, err
@@ -669,7 +702,11 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
     // completion chunk
     if w.stream {
-        d, err := json.Marshal(toCompleteChunk(w.id, generateResponse))
+        c := toCompleteChunk(w.id, generateResponse)
+        if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+            c.Usage = &Usage{}
+        }
+        d, err := json.Marshal(c)
         if err != nil {
             return 0, err
         }
@@ -681,6 +718,17 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
         }

         if generateResponse.Done {
+            if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+                u := toUsageGenerate(generateResponse)
+                d, err := json.Marshal(CompletionChunk{Choices: []CompleteChunkChoice{}, Usage: &u})
+                if err != nil {
+                    return 0, err
+                }
+                _, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
+                if err != nil {
+                    return 0, err
+                }
+            }
             _, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
             if err != nil {
                 return 0, err
@@ -846,6 +894,7 @@ func CompletionsMiddleware() gin.HandlerFunc {
         BaseWriter: BaseWriter{ResponseWriter: c.Writer},
         stream: req.Stream,
         id: fmt.Sprintf("cmpl-%d", rand.Intn(999)),
+        streamOptions: req.StreamOptions,
     }

     c.Writer = w
@@ -928,6 +977,7 @@ func ChatMiddleware() gin.HandlerFunc {
         BaseWriter: BaseWriter{ResponseWriter: c.Writer},
         stream: req.Stream,
         id: fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
+        streamOptions: req.StreamOptions,
     }

     c.Writer = w
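
On the wire, the writer changes above produce one extra SSE event when `include_usage` is set: a chunk with an empty `choices` array carrying the usage totals, followed by `data: [DONE]`. A rough way to observe it, assuming the third-party `requests` package and a placeholder model name:

```py
import json
import requests

resp = requests.post(
    "http://localhost:11434/v1/chat/completions",
    json={
        "model": "llama3.1",  # placeholder
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "stream_options": {"include_usage": True},
    },
    stream=True,
)

for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    if not chunk["choices"]:  # the usage-only chunk emitted before [DONE]
        print("usage:", chunk["usage"])
```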

View File

@@ -112,6 +112,45 @@ func TestChatMiddleware(t *testing.T) {
             Stream: &True,
         },
     },
+    {
+        name: "chat handler with streaming usage",
+        body: `{
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "Hello"}
+            ],
+            "stream": true,
+            "stream_options": {"include_usage": true},
+            "max_tokens": 999,
+            "seed": 123,
+            "stop": ["\n", "stop"],
+            "temperature": 3.0,
+            "frequency_penalty": 4.0,
+            "presence_penalty": 5.0,
+            "top_p": 6.0,
+            "response_format": {"type": "json_object"}
+        }`,
+        req: api.ChatRequest{
+            Model: "test-model",
+            Messages: []api.Message{
+                {
+                    Role: "user",
+                    Content: "Hello",
+                },
+            },
+            Options: map[string]any{
+                "num_predict": 999.0, // float because JSON doesn't distinguish between float and int
+                "seed": 123.0,
+                "stop": []any{"\n", "stop"},
+                "temperature": 3.0,
+                "frequency_penalty": 4.0,
+                "presence_penalty": 5.0,
+                "top_p": 6.0,
+            },
+            Format: json.RawMessage(`"json"`),
+            Stream: &True,
+        },
+    },
     {
         name: "chat handler with image content",
         body: `{
@@ -363,6 +402,55 @@ func TestCompletionsMiddleware(t *testing.T) {
             Stream: &False,
         },
     },
+    {
+        name: "completions handler stream",
+        body: `{
+            "model": "test-model",
+            "prompt": "Hello",
+            "stream": true,
+            "temperature": 0.8,
+            "stop": ["\n", "stop"],
+            "suffix": "suffix"
+        }`,
+        req: api.GenerateRequest{
+            Model: "test-model",
+            Prompt: "Hello",
+            Options: map[string]any{
+                "frequency_penalty": 0.0,
+                "presence_penalty": 0.0,
+                "temperature": 0.8,
+                "top_p": 1.0,
+                "stop": []any{"\n", "stop"},
+            },
+            Suffix: "suffix",
+            Stream: &True,
+        },
+    },
+    {
+        name: "completions handler stream with usage",
+        body: `{
+            "model": "test-model",
+            "prompt": "Hello",
+            "stream": true,
+            "stream_options": {"include_usage": true},
+            "temperature": 0.8,
+            "stop": ["\n", "stop"],
+            "suffix": "suffix"
+        }`,
+        req: api.GenerateRequest{
+            Model: "test-model",
+            Prompt: "Hello",
+            Options: map[string]any{
+                "frequency_penalty": 0.0,
+                "presence_penalty": 0.0,
+                "temperature": 0.8,
+                "top_p": 1.0,
+                "stop": []any{"\n", "stop"},
+            },
+            Suffix: "suffix",
+            Stream: &True,
+        },
+    },
     {
         name: "completions handler error forwarding",
         body: `{