Compare commits

..

6 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| ParthSareen | 2536ffe0ab | More cleanup | 2024-12-11 18:11:00 -08:00 |
| ParthSareen | 97abd7bfea | Code cleanup | 2024-12-11 18:04:16 -08:00 |
| Anuraag Agrawal | c6509bf76e | Merge branch 'main' of https://github.com/ollama/ollama into openai-stream-usage | 2024-12-06 12:05:25 +09:00 |
| Anuraag Agrawal | 7355ab3703 | Return empty choices on usage chunk | 2024-10-03 13:02:50 +09:00 |
| Anuraag Agrawal | 7ed81437fe | Document stream_options | 2024-09-17 15:25:31 +09:00 |
| Anuraag Agrawal | 220108d3f4 | openai: support include_usage stream option to return final usage chunk | 2024-09-13 12:32:05 +09:00 |
8 changed files with 179 additions and 192 deletions
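
The feature at the center of this compare is OpenAI-style `stream_options`: a streaming request that sets `{"include_usage": true}` gets one extra, final chunk reporting token usage. A minimal sketch of how a client would exercise it, assuming a recent `openai` Python package, the local OpenAI-compatible endpoint described in `docs/openai.md`, and a locally pulled model (`llama3.1` is only a placeholder):

```py
from openai import OpenAI

# Ollama's OpenAI-compatible endpoint; the api_key is required by the client
# but ignored by the server.
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

stream = client.chat.completions.create(
    model="llama3.1",  # placeholder; any locally pulled model
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    stream=True,
    stream_options={"include_usage": True},  # request the final usage chunk
)

for chunk in stream:
    if chunk.choices:
        # normal chunks carry content deltas
        print(chunk.choices[0].delta.content or "", end="")
    else:
        # the extra final chunk: empty choices, populated usage
        print("\nusage:", chunk.usage)
```

As the diff below shows, the usage arrives in a chunk whose `choices` array is empty, emitted just before `data: [DONE]`.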

View File

@@ -49,12 +49,12 @@ Here are some example models that can be downloaded:
 | Model              | Parameters | Size  | Download                         |
 | ------------------ | ---------- | ----- | -------------------------------- |
-| Llama 3.3          | 70B        | 43GB  | `ollama run llama3.3`            |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
 | Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
 | Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
 | Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
 | Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
 | Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`         |

View File

@@ -1036,10 +1036,6 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
     return nil
 }
-    if opts.Format == "json" {
-        opts.Format = `"` + opts.Format + `"`
-    }
     req := &api.ChatRequest{
         Model:    opts.Model,
         Messages: opts.Messages,
@@ -1125,10 +1121,6 @@ func generate(cmd *cobra.Command, opts runOptions) error {
         }
     }
-    if opts.Format == "json" {
-        opts.Format = `"` + opts.Format + `"`
-    }
     request := api.GenerateRequest{
         Model:  opts.Model,
         Prompt: opts.Prompt,

View File

@@ -45,7 +45,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 Advanced parameters (optional):
-- `format`: the format to return a response in. Format can be `json` or a JSON schema
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system message to (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
@@ -54,10 +54,6 @@ Advanced parameters (optional):
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
-#### Structured outputs
-Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
 #### JSON mode
 Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
@@ -189,52 +185,6 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```
-#### Request (Structured outputs)
-##### Request
-```shell
-curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{
-  "model": "llama3.1:8b",
-  "prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON",
-  "stream": false,
-  "format": {
-    "type": "object",
-    "properties": {
-      "age": {
-        "type": "integer"
-      },
-      "available": {
-        "type": "boolean"
-      }
-    },
-    "required": [
-      "age",
-      "available"
-    ]
-  }
-}'
-```
-##### Response
-```json
-{
-  "model": "llama3.1:8b",
-  "created_at": "2024-12-06T00:48:09.983619Z",
-  "response": "{\n \"age\": 22,\n \"available\": true\n}",
-  "done": true,
-  "done_reason": "stop",
-  "context": [1, 2, 3],
-  "total_duration": 1075509083,
-  "load_duration": 567678166,
-  "prompt_eval_count": 28,
-  "prompt_eval_duration": 236000000,
-  "eval_count": 16,
-  "eval_duration": 269000000
-}
-```
 #### Request (JSON mode)
 > [!IMPORTANT]
@@ -506,15 +456,11 @@ The `message` object has the following fields:
 Advanced parameters (optional):
-- `format`: the format to return a response in. Format can be `json` or a JSON schema.
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
-### Structured outputs
-Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
 ### Examples
 #### Chat Request (Streaming)
@@ -605,54 +551,6 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
-#### Chat request (Structured outputs)
-##### Request
-```shell
-curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
-  "model": "llama3.1",
-  "messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}],
-  "stream": false,
-  "format": {
-    "type": "object",
-    "properties": {
-      "age": {
-        "type": "integer"
-      },
-      "available": {
-        "type": "boolean"
-      }
-    },
-    "required": [
-      "age",
-      "available"
-    ]
-  },
-  "options": {
-    "temperature": 0
-  }
-}'
-```
-##### Response
-```json
-{
-  "model": "llama3.1",
-  "created_at": "2024-12-06T00:46:58.265747Z",
-  "message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" },
-  "done_reason": "stop",
-  "done": true,
-  "total_duration": 2254970291,
-  "load_duration": 574751416,
-  "prompt_eval_count": 34,
-  "prompt_eval_duration": 1502000000,
-  "eval_count": 12,
-  "eval_duration": 175000000
-}
-```
 #### Chat request (With History)
 Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
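
A minimal sketch of the multi-turn pattern described above against the native `/api/chat` endpoint, assuming the third-party `requests` package and a locally pulled model (the model name is a placeholder):

```py
import requests

# Earlier user/assistant turns are simply replayed in the messages array.
history = [
    {"role": "user", "content": "why is the sky blue?"},
    {"role": "assistant", "content": "due to rayleigh scattering."},
    {"role": "user", "content": "how is that different than mie scattering?"},
]

resp = requests.post(
    "http://localhost:11434/api/chat",
    json={"model": "llama3.2", "messages": history, "stream": False},
)
print(resp.json()["message"]["content"])
```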

View File

@@ -59,40 +59,6 @@ embeddings = client.embeddings.create(
     input=["why is the sky blue?", "why is the grass green?"],
 )
 ```
-#### Structured outputs
-```py
-from pydantic import BaseModel
-from openai import OpenAI
-client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
-# Define the schema for the response
-class FriendInfo(BaseModel):
-    name: str
-    age: int
-    is_available: bool
-class FriendList(BaseModel):
-    friends: list[FriendInfo]
-try:
-    completion = client.beta.chat.completions.parse(
-        temperature=0,
-        model="llama3.1:8b",
-        messages=[
-            {"role": "user", "content": "I have two friends. The first is Ollama 22 years old busy saving the world, and the second is Alonso 23 years old and wants to hang out. Return a list of friends in JSON format"}
-        ],
-        response_format=FriendList,
-    )
-    friends_response = completion.choices[0].message
-    if friends_response.parsed:
-        print(friends_response.parsed)
-    elif friends_response.refusal:
-        print(friends_response.refusal)
-except Exception as e:
-    print(f"Error: {e}")
-```
 ### OpenAI JavaScript library
@@ -233,6 +199,8 @@ curl http://localhost:11434/v1/embeddings \
 - [x] `seed`
 - [x] `stop`
 - [x] `stream`
+- [x] `stream_options`
+  - [x] `include_usage`
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
@@ -261,6 +229,8 @@ curl http://localhost:11434/v1/embeddings \
 - [x] `seed`
 - [x] `stop`
 - [x] `stream`
+- [x] `stream_options`
+  - [x] `include_usage`
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
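
The checklist above marks `stream_options` / `include_usage` for the legacy `/v1/completions` endpoint as well. A minimal sketch of the equivalent call, assuming the installed `openai` package exposes `stream_options` on `completions.create` and a locally pulled model (the model name is a placeholder):

```py
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

stream = client.completions.create(
    model="llama3.1",  # placeholder
    prompt="Say this is a test",
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].text, end="")
    else:
        # the final, usage-only chunk
        print("\nusage:", chunk.usage)
```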

View File

@@ -1,14 +1,3 @@
 # Examples
 This directory contains different examples of using Ollama.
-## Python examples
-Ollama Python examples at [ollama-python/examples](https://github.com/ollama/ollama-python/tree/main/examples)
-## JavaScript examples
-Ollama JavaScript examples at [ollama-js/examples](https://github.com/ollama/ollama-js/tree/main/examples)
-## OpenAI compatibility examples
-Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md)

View File

@@ -746,7 +746,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
     // TODO (parthsareen): Move conversion to grammar with sampling logic
     // API should do error handling for invalid formats
-    if req.Format != nil && strings.TrimSpace(string(req.Format)) != "null" {
+    if req.Format != nil {
         if strings.ToLower(strings.TrimSpace(string(req.Format))) == `"json"` {
             request["grammar"] = jsonGrammar
             if !strings.Contains(strings.ToLower(req.Prompt), "json") {

View File

@@ -75,10 +75,15 @@ type EmbedRequest struct {
     Model string `json:"model"`
 }

+type StreamOptions struct {
+    IncludeUsage bool `json:"include_usage"`
+}
+
 type ChatCompletionRequest struct {
     Model string `json:"model"`
     Messages []Message `json:"messages"`
     Stream bool `json:"stream"`
+    StreamOptions *StreamOptions `json:"stream_options"`
     MaxTokens *int `json:"max_tokens"`
     Seed *int `json:"seed"`
     Stop any `json:"stop"`
@@ -107,6 +112,7 @@ type ChatCompletionChunk struct {
     Model string `json:"model"`
     SystemFingerprint string `json:"system_fingerprint"`
     Choices []ChunkChoice `json:"choices"`
+    Usage *Usage `json:"usage,omitempty"`
 }

 // TODO (https://github.com/ollama/ollama/issues/5259): support []string, []int and [][]int
@@ -119,6 +125,7 @@ type CompletionRequest struct {
     Seed *int `json:"seed"`
     Stop any `json:"stop"`
     Stream bool `json:"stream"`
+    StreamOptions *StreamOptions `json:"stream_options"`
     Temperature *float32 `json:"temperature"`
     TopP float32 `json:"top_p"`
     Suffix string `json:"suffix"`
@@ -141,6 +148,7 @@ type CompletionChunk struct {
     Choices []CompleteChunkChoice `json:"choices"`
     Model string `json:"model"`
     SystemFingerprint string `json:"system_fingerprint"`
+    Usage *Usage `json:"usage,omitempty"`
 }

 type ToolCall struct {
@@ -197,6 +205,14 @@ func NewError(code int, message string) ErrorResponse {
     return ErrorResponse{Error{Type: etype, Message: message}}
 }

+func toUsage(r api.ChatResponse) Usage {
+    return Usage{
+        PromptTokens: r.PromptEvalCount,
+        CompletionTokens: r.EvalCount,
+        TotalTokens: r.PromptEvalCount + r.EvalCount,
+    }
+}
+
 func toolCallId() string {
     const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
     b := make([]byte, 8)
@@ -246,11 +262,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
                 return nil
             }(r.DoneReason),
         }},
-        Usage: Usage{
-            PromptTokens: r.PromptEvalCount,
-            CompletionTokens: r.EvalCount,
-            TotalTokens: r.PromptEvalCount + r.EvalCount,
-        },
+        Usage: toUsage(r),
     }
 }
@@ -275,6 +287,14 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
     }
 }

+func toUsageGenerate(r api.GenerateResponse) Usage {
+    return Usage{
+        PromptTokens: r.PromptEvalCount,
+        CompletionTokens: r.EvalCount,
+        TotalTokens: r.PromptEvalCount + r.EvalCount,
+    }
+}
+
 func toCompletion(id string, r api.GenerateResponse) Completion {
     return Completion{
         Id: id,
@@ -292,11 +312,7 @@ func toCompletion(id string, r api.GenerateResponse) Completion {
                 return nil
             }(r.DoneReason),
         }},
-        Usage: Usage{
-            PromptTokens: r.PromptEvalCount,
-            CompletionTokens: r.EvalCount,
-            TotalTokens: r.PromptEvalCount + r.EvalCount,
-        },
+        Usage: toUsageGenerate(r),
     }
 }
@@ -571,12 +587,14 @@ type BaseWriter struct {
 type ChatWriter struct {
     stream bool
+    streamOptions *StreamOptions
     id string
     BaseWriter
 }

 type CompleteWriter struct {
     stream bool
+    streamOptions *StreamOptions
     id string
     BaseWriter
 }
@@ -620,7 +638,11 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
     // chat chunk
     if w.stream {
-        d, err := json.Marshal(toChunk(w.id, chatResponse))
+        c := toChunk(w.id, chatResponse)
+        if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+            c.Usage = &Usage{}
+        }
+        d, err := json.Marshal(c)
         if err != nil {
             return 0, err
         }
@@ -632,6 +654,17 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
         }

         if chatResponse.Done {
+            if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+                u := toUsage(chatResponse)
+                d, err := json.Marshal(ChatCompletionChunk{Choices: []ChunkChoice{}, Usage: &u})
+                if err != nil {
+                    return 0, err
+                }
+                _, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
+                if err != nil {
+                    return 0, err
+                }
+            }
             _, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
             if err != nil {
                 return 0, err
@@ -669,7 +702,11 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
     // completion chunk
     if w.stream {
-        d, err := json.Marshal(toCompleteChunk(w.id, generateResponse))
+        c := toCompleteChunk(w.id, generateResponse)
+        if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+            c.Usage = &Usage{}
+        }
+        d, err := json.Marshal(c)
         if err != nil {
             return 0, err
         }
@@ -681,6 +718,17 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
         }

         if generateResponse.Done {
+            if w.streamOptions != nil && w.streamOptions.IncludeUsage {
+                u := toUsageGenerate(generateResponse)
+                d, err := json.Marshal(CompletionChunk{Choices: []CompleteChunkChoice{}, Usage: &u})
+                if err != nil {
+                    return 0, err
+                }
+                _, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
+                if err != nil {
+                    return 0, err
+                }
+            }
             _, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
             if err != nil {
                 return 0, err
@@ -846,6 +894,7 @@ func CompletionsMiddleware() gin.HandlerFunc {
         BaseWriter: BaseWriter{ResponseWriter: c.Writer},
         stream: req.Stream,
         id: fmt.Sprintf("cmpl-%d", rand.Intn(999)),
+        streamOptions: req.StreamOptions,
     }

     c.Writer = w
@@ -928,6 +977,7 @@ func ChatMiddleware() gin.HandlerFunc {
         BaseWriter: BaseWriter{ResponseWriter: c.Writer},
         stream: req.Stream,
         id: fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
+        streamOptions: req.StreamOptions,
     }

     c.Writer = w
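
On the wire, the writer changes above produce one extra SSE event when `include_usage` is set: a chunk with an empty `choices` array carrying the usage totals, followed by `data: [DONE]`. A rough way to observe it, assuming the third-party `requests` package and a placeholder model name:

```py
import json
import requests

resp = requests.post(
    "http://localhost:11434/v1/chat/completions",
    json={
        "model": "llama3.1",  # placeholder
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "stream_options": {"include_usage": True},
    },
    stream=True,
)

for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    if not chunk["choices"]:  # the usage-only chunk emitted before [DONE]
        print("usage:", chunk["usage"])
```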

View File

@@ -112,6 +112,45 @@ func TestChatMiddleware(t *testing.T) {
             Stream: &True,
         },
     },
+    {
+        name: "chat handler with streaming usage",
+        body: `{
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "Hello"}
+            ],
+            "stream": true,
+            "stream_options": {"include_usage": true},
+            "max_tokens": 999,
+            "seed": 123,
+            "stop": ["\n", "stop"],
+            "temperature": 3.0,
+            "frequency_penalty": 4.0,
+            "presence_penalty": 5.0,
+            "top_p": 6.0,
+            "response_format": {"type": "json_object"}
+        }`,
+        req: api.ChatRequest{
+            Model: "test-model",
+            Messages: []api.Message{
+                {
+                    Role: "user",
+                    Content: "Hello",
+                },
+            },
+            Options: map[string]any{
+                "num_predict": 999.0, // float because JSON doesn't distinguish between float and int
+                "seed": 123.0,
+                "stop": []any{"\n", "stop"},
+                "temperature": 3.0,
+                "frequency_penalty": 4.0,
+                "presence_penalty": 5.0,
+                "top_p": 6.0,
+            },
+            Format: json.RawMessage(`"json"`),
+            Stream: &True,
+        },
+    },
     {
         name: "chat handler with image content",
         body: `{
@@ -363,6 +402,55 @@ func TestCompletionsMiddleware(t *testing.T) {
             Stream: &False,
         },
     },
+    {
+        name: "completions handler stream",
+        body: `{
+            "model": "test-model",
+            "prompt": "Hello",
+            "stream": true,
+            "temperature": 0.8,
+            "stop": ["\n", "stop"],
+            "suffix": "suffix"
+        }`,
+        req: api.GenerateRequest{
+            Model: "test-model",
+            Prompt: "Hello",
+            Options: map[string]any{
+                "frequency_penalty": 0.0,
+                "presence_penalty": 0.0,
+                "temperature": 0.8,
+                "top_p": 1.0,
+                "stop": []any{"\n", "stop"},
+            },
+            Suffix: "suffix",
+            Stream: &True,
+        },
+    },
+    {
+        name: "completions handler stream with usage",
+        body: `{
+            "model": "test-model",
+            "prompt": "Hello",
+            "stream": true,
+            "stream_options": {"include_usage": true},
+            "temperature": 0.8,
+            "stop": ["\n", "stop"],
+            "suffix": "suffix"
+        }`,
+        req: api.GenerateRequest{
+            Model: "test-model",
+            Prompt: "Hello",
+            Options: map[string]any{
+                "frequency_penalty": 0.0,
+                "presence_penalty": 0.0,
+                "temperature": 0.8,
+                "top_p": 1.0,
+                "stop": []any{"\n", "stop"},
+            },
+            Suffix: "suffix",
+            Stream: &True,
+        },
+    },
     {
         name: "completions handler error forwarding",
         body: `{