Compare commits: v0.5.1 ... parth/open

6 commits:

- 2536ffe0ab
- 97abd7bfea
- c6509bf76e
- 7355ab3703
- 7ed81437fe
- 220108d3f4
@@ -49,12 +49,12 @@ Here are some example models that can be downloaded:
 
 | Model              | Parameters | Size  | Download                         |
 | ------------------ | ---------- | ----- | -------------------------------- |
-| Llama 3.3          | 70B        | 43GB  | `ollama run llama3.3`            |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
 | Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
 | Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
 | Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
 | Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
 | Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`         |
@@ -1036,10 +1036,6 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
         return nil
     }
 
-    if opts.Format == "json" {
-        opts.Format = `"` + opts.Format + `"`
-    }
-
     req := &api.ChatRequest{
         Model:    opts.Model,
         Messages: opts.Messages,
@@ -1125,10 +1121,6 @@ func generate(cmd *cobra.Command, opts runOptions) error {
         }
     }
 
-    if opts.Format == "json" {
-        opts.Format = `"` + opts.Format + `"`
-    }
-
     request := api.GenerateRequest{
         Model:  opts.Model,
         Prompt: opts.Prompt,
docs/api.md

@@ -45,7 +45,7 @@ Generate a response for a given prompt with a provided model. This is a streaming
 
 Advanced parameters (optional):
 
-- `format`: the format to return a response in. Format can be `json` or a JSON schema
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system message (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)

@@ -54,10 +54,6 @@ Advanced parameters (optional):
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 
-#### Structured outputs
-
-Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
-
 #### JSON mode
 
 Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
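For context, the JSON mode described in the hunk above is exercised with a request like the following; this is a minimal sketch (the model name and prompt are illustrative), matching the JSON mode example the doc refers to:

```shell
# Illustrative JSON-mode request against /api/generate; "llama3.2" is an example model.
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "What color is the sky at different times of the day? Respond using JSON",
  "format": "json",
  "stream": false
}'
```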
@@ -189,52 +185,6 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```
 
-#### Request (Structured outputs)
-
-##### Request
-
-```shell
-curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{
-  "model": "llama3.1:8b",
-  "prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON",
-  "stream": false,
-  "format": {
-    "type": "object",
-    "properties": {
-      "age": {
-        "type": "integer"
-      },
-      "available": {
-        "type": "boolean"
-      }
-    },
-    "required": [
-      "age",
-      "available"
-    ]
-  }
-}'
-```
-
-##### Response
-
-```json
-{
-  "model": "llama3.1:8b",
-  "created_at": "2024-12-06T00:48:09.983619Z",
-  "response": "{\n  \"age\": 22,\n  \"available\": true\n}",
-  "done": true,
-  "done_reason": "stop",
-  "context": [1, 2, 3],
-  "total_duration": 1075509083,
-  "load_duration": 567678166,
-  "prompt_eval_count": 28,
-  "prompt_eval_duration": 236000000,
-  "eval_count": 16,
-  "eval_duration": 269000000
-}
-```
-
 #### Request (JSON mode)
 
 > [!IMPORTANT]
@@ -506,15 +456,11 @@ The `message` object has the following fields:
 
 Advanced parameters (optional):
 
-- `format`: the format to return a response in. Format can be `json` or a JSON schema.
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 
-### Structured outputs
-
-Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
-
 ### Examples
 
 #### Chat Request (Streaming)
@@ -605,54 +551,6 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
 
-#### Chat request (Structured outputs)
-
-##### Request
-
-```shell
-curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
-  "model": "llama3.1",
-  "messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}],
-  "stream": false,
-  "format": {
-    "type": "object",
-    "properties": {
-      "age": {
-        "type": "integer"
-      },
-      "available": {
-        "type": "boolean"
-      }
-    },
-    "required": [
-      "age",
-      "available"
-    ]
-  },
-  "options": {
-    "temperature": 0
-  }
-}'
-```
-
-##### Response
-
-```json
-{
-  "model": "llama3.1",
-  "created_at": "2024-12-06T00:46:58.265747Z",
-  "message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" },
-  "done_reason": "stop",
-  "done": true,
-  "total_duration": 2254970291,
-  "load_duration": 574751416,
-  "prompt_eval_count": 34,
-  "prompt_eval_duration": 1502000000,
-  "eval_count": 12,
-  "eval_duration": 175000000
-}
-```
-
 #### Chat request (With History)
 
 Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
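The history-based prompting mentioned in the context line above amounts to replaying earlier turns in the `messages` array. A minimal sketch (model name and messages are illustrative):

```shell
# Illustrative multi-turn /api/chat request; prior turns are passed back verbatim.
curl http://localhost:11434/api/chat -d '{
  "model": "llama3.2",
  "messages": [
    {"role": "user", "content": "why is the sky blue?"},
    {"role": "assistant", "content": "due to Rayleigh scattering."},
    {"role": "user", "content": "how is that different from Mie scattering?"}
  ]
}'
```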
@@ -59,40 +59,6 @@ embeddings = client.embeddings.create(
     input=["why is the sky blue?", "why is the grass green?"],
 )
 ```
-#### Structured outputs
-```py
-from pydantic import BaseModel
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
-
-# Define the schema for the response
-class FriendInfo(BaseModel):
-    name: str
-    age: int
-    is_available: bool
-
-class FriendList(BaseModel):
-    friends: list[FriendInfo]
-
-try:
-    completion = client.beta.chat.completions.parse(
-        temperature=0,
-        model="llama3.1:8b",
-        messages=[
-            {"role": "user", "content": "I have two friends. The first is Ollama 22 years old busy saving the world, and the second is Alonso 23 years old and wants to hang out. Return a list of friends in JSON format"}
-        ],
-        response_format=FriendList,
-    )
-
-    friends_response = completion.choices[0].message
-    if friends_response.parsed:
-        print(friends_response.parsed)
-    elif friends_response.refusal:
-        print(friends_response.refusal)
-except Exception as e:
-    print(f"Error: {e}")
-```
-
 ### OpenAI JavaScript library
@@ -233,6 +199,8 @@ curl http://localhost:11434/v1/embeddings \
 - [x] `seed`
 - [x] `stop`
 - [x] `stream`
+- [x] `stream_options`
+  - [x] `include_usage`
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`

@@ -261,6 +229,8 @@ curl http://localhost:11434/v1/embeddings \
 - [x] `seed`
 - [x] `stop`
 - [x] `stream`
+- [x] `stream_options`
+  - [x] `include_usage`
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
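The `stream_options` entries added above correspond to OpenAI-style usage reporting on streamed responses. A minimal sketch of how a client would opt in (endpoint shape follows the compatibility docs; the model name is illustrative):

```shell
# Illustrative request: with include_usage set, the final streamed chunk is expected to
# carry prompt/completion token counts and an empty choices array.
curl http://localhost:11434/v1/chat/completions -d '{
  "model": "llama3.2",
  "messages": [{"role": "user", "content": "Hello"}],
  "stream": true,
  "stream_options": {"include_usage": true}
}'
```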
@@ -1,14 +1,3 @@
 # Examples
 
 This directory contains different examples of using Ollama.
-
-## Python examples
-Ollama Python examples at [ollama-python/examples](https://github.com/ollama/ollama-python/tree/main/examples)
-
-## JavaScript examples
-Ollama JavaScript examples at [ollama-js/examples](https://github.com/ollama/ollama-js/tree/main/examples)
-
-## OpenAI compatibility examples
-Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md)
@@ -746,7 +746,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 
     // TODO (parthsareen): Move conversion to grammar with sampling logic
     // API should do error handling for invalid formats
-    if req.Format != nil && strings.TrimSpace(string(req.Format)) != "null" {
+    if req.Format != nil {
         if strings.ToLower(strings.TrimSpace(string(req.Format))) == `"json"` {
             request["grammar"] = jsonGrammar
             if !strings.Contains(strings.ToLower(req.Prompt), "json") {
@@ -75,10 +75,15 @@ type EmbedRequest struct {
     Model string `json:"model"`
 }
 
+type StreamOptions struct {
+    IncludeUsage bool `json:"include_usage"`
+}
+
 type ChatCompletionRequest struct {
     Model         string          `json:"model"`
     Messages      []Message       `json:"messages"`
     Stream        bool            `json:"stream"`
+    StreamOptions *StreamOptions  `json:"stream_options"`
     MaxTokens     *int            `json:"max_tokens"`
     Seed          *int            `json:"seed"`
     Stop          any             `json:"stop"`
@@ -107,6 +112,7 @@ type ChatCompletionChunk struct {
     Model             string        `json:"model"`
     SystemFingerprint string        `json:"system_fingerprint"`
     Choices           []ChunkChoice `json:"choices"`
+    Usage             *Usage        `json:"usage,omitempty"`
 }
 
 // TODO (https://github.com/ollama/ollama/issues/5259): support []string, []int and [][]int
@@ -119,6 +125,7 @@ type CompletionRequest struct {
     Seed          *int     `json:"seed"`
     Stop          any      `json:"stop"`
     Stream        bool     `json:"stream"`
+    StreamOptions *StreamOptions `json:"stream_options"`
     Temperature   *float32 `json:"temperature"`
     TopP          float32  `json:"top_p"`
     Suffix        string   `json:"suffix"`
@@ -141,6 +148,7 @@ type CompletionChunk struct {
     Choices           []CompleteChunkChoice `json:"choices"`
     Model             string                `json:"model"`
     SystemFingerprint string                `json:"system_fingerprint"`
+    Usage             *Usage                `json:"usage,omitempty"`
 }
 
 type ToolCall struct {
@@ -197,6 +205,14 @@ func NewError(code int, message string) ErrorResponse {
     return ErrorResponse{Error{Type: etype, Message: message}}
 }
 
+func toUsage(r api.ChatResponse) Usage {
+    return Usage{
+        PromptTokens:     r.PromptEvalCount,
+        CompletionTokens: r.EvalCount,
+        TotalTokens:      r.PromptEvalCount + r.EvalCount,
+    }
+}
+
 func toolCallId() string {
     const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
     b := make([]byte, 8)
@@ -246,11 +262,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
                 return nil
             }(r.DoneReason),
         }},
-        Usage: Usage{
-            PromptTokens:     r.PromptEvalCount,
-            CompletionTokens: r.EvalCount,
-            TotalTokens:      r.PromptEvalCount + r.EvalCount,
-        },
+        Usage: toUsage(r),
     }
 }
 
@@ -275,6 +287,14 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
     }
 }
 
+func toUsageGenerate(r api.GenerateResponse) Usage {
+    return Usage{
+        PromptTokens:     r.PromptEvalCount,
+        CompletionTokens: r.EvalCount,
+        TotalTokens:      r.PromptEvalCount + r.EvalCount,
+    }
+}
+
 func toCompletion(id string, r api.GenerateResponse) Completion {
     return Completion{
         Id: id,
@@ -292,11 +312,7 @@ func toCompletion(id string, r api.GenerateResponse) Completion {
                 return nil
             }(r.DoneReason),
         }},
-        Usage: Usage{
-            PromptTokens:     r.PromptEvalCount,
-            CompletionTokens: r.EvalCount,
-            TotalTokens:      r.PromptEvalCount + r.EvalCount,
-        },
+        Usage: toUsageGenerate(r),
     }
 }
 
@@ -571,12 +587,14 @@ type BaseWriter struct {
 
 type ChatWriter struct {
     stream        bool
+    streamOptions *StreamOptions
     id            string
     BaseWriter
 }
 
 type CompleteWriter struct {
     stream        bool
+    streamOptions *StreamOptions
     id            string
     BaseWriter
 }
@@ -620,7 +638,11 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
 
     // chat chunk
     if w.stream {
-        d, err := json.Marshal(toChunk(w.id, chatResponse))
+        c := toChunk(w.id, chatResponse)
+        if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+            c.Usage = &Usage{}
+        }
+        d, err := json.Marshal(c)
         if err != nil {
             return 0, err
         }
@@ -632,6 +654,17 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
         }
 
         if chatResponse.Done {
+            if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+                u := toUsage(chatResponse)
+                d, err := json.Marshal(ChatCompletionChunk{Choices: []ChunkChoice{}, Usage: &u})
+                if err != nil {
+                    return 0, err
+                }
+                _, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
+                if err != nil {
+                    return 0, err
+                }
+            }
             _, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
             if err != nil {
                 return 0, err
@@ -669,7 +702,11 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
 
     // completion chunk
     if w.stream {
-        d, err := json.Marshal(toCompleteChunk(w.id, generateResponse))
+        c := toCompleteChunk(w.id, generateResponse)
+        if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+            c.Usage = &Usage{}
+        }
+        d, err := json.Marshal(c)
         if err != nil {
             return 0, err
         }
@@ -681,6 +718,17 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
         }
 
         if generateResponse.Done {
+            if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+                u := toUsageGenerate(generateResponse)
+                d, err := json.Marshal(CompletionChunk{Choices: []CompleteChunkChoice{}, Usage: &u})
+                if err != nil {
+                    return 0, err
+                }
+                _, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
+                if err != nil {
+                    return 0, err
+                }
+            }
             _, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
             if err != nil {
                 return 0, err
@@ -846,6 +894,7 @@ func CompletionsMiddleware() gin.HandlerFunc {
             BaseWriter: BaseWriter{ResponseWriter: c.Writer},
             stream:     req.Stream,
             id:         fmt.Sprintf("cmpl-%d", rand.Intn(999)),
+            streamOptions: req.StreamOptions,
         }
 
         c.Writer = w
@@ -928,6 +977,7 @@ func ChatMiddleware() gin.HandlerFunc {
             BaseWriter: BaseWriter{ResponseWriter: c.Writer},
             stream:     req.Stream,
             id:         fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
+            streamOptions: req.StreamOptions,
         }
 
         c.Writer = w
@@ -112,6 +112,45 @@ func TestChatMiddleware(t *testing.T) {
                 Stream: &True,
             },
         },
+        {
+            name: "chat handler with streaming usage",
+            body: `{
+                "model": "test-model",
+                "messages": [
+                    {"role": "user", "content": "Hello"}
+                ],
+                "stream": true,
+                "stream_options": {"include_usage": true},
+                "max_tokens": 999,
+                "seed": 123,
+                "stop": ["\n", "stop"],
+                "temperature": 3.0,
+                "frequency_penalty": 4.0,
+                "presence_penalty": 5.0,
+                "top_p": 6.0,
+                "response_format": {"type": "json_object"}
+            }`,
+            req: api.ChatRequest{
+                Model: "test-model",
+                Messages: []api.Message{
+                    {
+                        Role:    "user",
+                        Content: "Hello",
+                    },
+                },
+                Options: map[string]any{
+                    "num_predict":       999.0, // float because JSON doesn't distinguish between float and int
+                    "seed":              123.0,
+                    "stop":              []any{"\n", "stop"},
+                    "temperature":       3.0,
+                    "frequency_penalty": 4.0,
+                    "presence_penalty":  5.0,
+                    "top_p":             6.0,
+                },
+                Format: json.RawMessage(`"json"`),
+                Stream: &True,
+            },
+        },
         {
             name: "chat handler with image content",
             body: `{
@@ -363,6 +402,55 @@ func TestCompletionsMiddleware(t *testing.T) {
                 Stream: &False,
             },
         },
+        {
+            name: "completions handler stream",
+            body: `{
+                "model": "test-model",
+                "prompt": "Hello",
+                "stream": true,
+                "temperature": 0.8,
+                "stop": ["\n", "stop"],
+                "suffix": "suffix"
+            }`,
+            req: api.GenerateRequest{
+                Model:  "test-model",
+                Prompt: "Hello",
+                Options: map[string]any{
+                    "frequency_penalty": 0.0,
+                    "presence_penalty":  0.0,
+                    "temperature":       0.8,
+                    "top_p":             1.0,
+                    "stop":              []any{"\n", "stop"},
+                },
+                Suffix: "suffix",
+                Stream: &True,
+            },
+        },
+        {
+            name: "completions handler stream with usage",
+            body: `{
+                "model": "test-model",
+                "prompt": "Hello",
+                "stream": true,
+                "stream_options": {"include_usage": true},
+                "temperature": 0.8,
+                "stop": ["\n", "stop"],
+                "suffix": "suffix"
+            }`,
+            req: api.GenerateRequest{
+                Model:  "test-model",
+                Prompt: "Hello",
+                Options: map[string]any{
+                    "frequency_penalty": 0.0,
+                    "presence_penalty":  0.0,
+                    "temperature":       0.8,
+                    "top_p":             1.0,
+                    "stop":              []any{"\n", "stop"},
+                },
+                Suffix: "suffix",
+                Stream: &True,
+            },
+        },
         {
             name: "completions handler error forwarding",
             body: `{