Compare commits


16 Commits

| Author | SHA1 | Message | Date |
|---|---|---|---|
| ParthSareen | b4de2e9189 | change name to context_length | 2025-02-07 11:50:38 -08:00 |
| ParthSareen | 61a5254115 | context_window and addressing comments | 2025-02-05 11:26:55 -08:00 |
| ParthSareen | 53d2cf37d2 | update docs | 2025-02-04 15:17:16 -08:00 |
| ParthSareen | 75f88e7aac | Update docs | 2025-02-04 10:47:32 -08:00 |
| ParthSareen | 4982089c84 | Fix formatting | 2025-01-30 13:53:24 -08:00 |
| Parth Sareen | 8c231b0826 | Update openai/openai.go (Co-authored-by: Michael Yang <mxyng@pm.me>) | 2025-01-30 13:50:25 -08:00 |
| ParthSareen | 16abd181a9 | remove context shifting with max tokens and update docs | 2025-01-30 13:48:24 -08:00 |
| ParthSareen | 5c2f35d846 | Add tests | 2025-01-30 13:16:15 -08:00 |
| ParthSareen | 6de3227841 | Cleanup api | 2025-01-30 13:15:57 -08:00 |
| ParthSareen | 35e97db03b | set num_ctx through extra body | 2025-01-29 13:13:11 -08:00 |
| Xiaofu Huang | 2ef3c803a1 | readme: add AI Toolkit for VSCode to community integrations (#8604) | 2025-01-27 00:36:23 -08:00 |
| Matěj Štágl | 453e4d090b | readme: add LlmTornado to community integrations (#8551) | 2025-01-25 01:04:07 -08:00 |
| Daniel Jalkut | ca2f9843c8 | docs: remove reference to the deleted examples folder (#8524) | 2025-01-22 22:52:15 -08:00 |
| frob | 294b6f5a22 | docs: remove tfs_z option from documentation (#8515) | 2025-01-21 09:28:59 -08:00 |
| EndoTheDev | 7bb356c680 | docs: update suspend header in gpu.md (#8487) | 2025-01-19 18:45:35 -08:00 |
| Jannik Maierhöfer | 021817e59a | readme: add link to Langfuse (#8455) | 2025-01-16 22:41:12 -08:00 |
7 changed files with 116 additions and 49 deletions

View File

@@ -369,6 +369,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
- [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft's official VSCode extension to chat with, test, and evaluate models with Ollama support, and to use them in your AI applications)
### Cloud
@@ -481,6 +482,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A TypeScript/JavaScript library allowing access to different LLMs through a unified API)
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
### Mobile
@@ -539,4 +541,5 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Observability
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.

View File

@@ -38,7 +38,7 @@ Numeric IDs may be used, however ordering may vary, so UUIDs are more reliable.
You can discover the UUID of your GPUs by running `nvidia-smi -L`. If you want to
ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1").
### Laptop Suspend Resume
### Linux Suspend Resume
On Linux, after a suspend/resume cycle, sometimes Ollama will fail to discover
your NVIDIA GPU and fall back to running on the CPU. You can work around this

View File

@@ -67,8 +67,6 @@ To use this:
3. `ollama run choose-a-model-name`
4. Start using the model!
More examples are available in the [examples directory](../examples).
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
```bash
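# Illustrative only; <model-name> is a placeholder for any locally available model
ollama show --modelfile <model-name>
```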
@@ -155,7 +153,6 @@ PARAMETER <parameter> <parametervalue>
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation) | int | num_predict 42 |
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |

View File

@@ -204,6 +204,45 @@ curl http://localhost:11434/v1/embeddings \
}'
```
## Extra arguments
### Setting context length
- The `context_length` parameter can be used to set the context length for the model
#### OpenAI Python library
- The OpenAI Python library does not support setting the context length directly; however, it can be set for Ollama through the `extra_body` parameter
```py
completion = client.chat.completions.create(
model="llama3.1:8b",
messages=[{"role": "user", "content": "Say this is a test"}],
extra_body={"context_length": 4096},
)
```
#### OpenAI JavaScript library
- The OpenAI JavaScript library does not support setting the context length directly; however, it can be set for Ollama by passing `context_length` as an undocumented parameter with a `@ts-expect-error` comment. [See documentation here](https://github.com/openai/openai-node?tab=readme-ov-file#making-customundocumented-requests)
```ts
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama3.2',
// @ts-expect-error context_length is an additional parameter
context_length: 4096,
})
```
#### `curl`
```shell
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3.2",
"messages": [{"role": "user", "content": "Say this is a test"}],
"context_length": 4096
}'
```
## Endpoints
### `/v1/chat/completions`
@@ -213,6 +252,7 @@ curl http://localhost:11434/v1/embeddings \
- [x] Chat completions
- [x] Streaming
- [x] JSON mode
- [x] Structured outputs
- [x] Reproducible outputs
- [x] Vision
- [x] Tools
@@ -339,27 +379,3 @@ curl http://localhost:11434/v1/chat/completions \
}'
```
### Setting the context size
The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
```modelfile
FROM <some model>
PARAMETER num_ctx <context size>
```
Use the `ollama create mymodel` command to create a new model with the updated context size. Call the API with the updated model name:
```shell
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "mymodel",
"messages": [
{
"role": "user",
"content": "Hello!"
}
]
}'
```

View File

@@ -80,10 +80,12 @@ type StreamOptions struct {
}
type ChatCompletionRequest struct {
Model string `json:"model"`
Messages []Message `json:"messages"`
Stream bool `json:"stream"`
StreamOptions *StreamOptions `json:"stream_options"`
Model string `json:"model"`
Messages []Message `json:"messages"`
Stream bool `json:"stream"`
StreamOptions *StreamOptions `json:"stream_options"`
MaxCompletionTokens *int `json:"max_completion_tokens"`
// Deprecated: Use [ChatCompletionRequest.MaxCompletionTokens]
MaxTokens *int `json:"max_tokens"`
Seed *int `json:"seed"`
Stop any `json:"stop"`
@@ -93,6 +95,7 @@ type ChatCompletionRequest struct {
TopP *float64 `json:"top_p"`
ResponseFormat *ResponseFormat `json:"response_format"`
Tools []api.Tool `json:"tools"`
ContextLength *int `json:"context_length"`
}
type ChatCompletion struct {
@@ -475,8 +478,17 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
options["stop"] = stops
}
if r.ContextLength != nil {
options["num_ctx"] = *r.ContextLength
}
// Deprecated: MaxTokens is deprecated, use MaxCompletionTokens instead
if r.MaxTokens != nil {
options["num_predict"] = *r.MaxTokens
r.MaxCompletionTokens = r.MaxTokens
}
if r.MaxCompletionTokens != nil {
options["num_predict"] = *r.MaxCompletionTokens
}
if r.Temperature != nil {
@@ -962,6 +974,7 @@ func ChatMiddleware() gin.HandlerFunc {
c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
return
}
slog.Info("num_ctx", "num_ctx", chatReq.Options["num_ctx"])
if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))

View File

@@ -7,7 +7,6 @@ import (
"io"
"net/http"
"net/http/httptest"
"reflect"
"strings"
"testing"
"time"
@@ -315,6 +314,42 @@ func TestChatMiddleware(t *testing.T) {
Stream: &True,
},
},
{
name: "chat handler with context_length",
body: `{
"model": "test-model",
"messages": [{"role": "user", "content": "Hello"}],
"context_length": 4096
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{{Role: "user", Content: "Hello"}},
Options: map[string]any{
"num_ctx": 4096.0, // float because JSON doesn't distinguish between float and int
"temperature": 1.0,
"top_p": 1.0,
},
Stream: &False,
},
},
{
name: "chat handler with max_completion_tokens",
body: `{
"model": "test-model",
"messages": [{"role": "user", "content": "Hello"}],
"max_completion_tokens": 2
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{{Role: "user", Content: "Hello"}},
Options: map[string]any{
"num_predict": 2.0, // float because JSON doesn't distinguish between float and int
"temperature": 1.0,
"top_p": 1.0,
},
Stream: &False,
},
},
{
name: "chat handler error forwarding",
body: `{
@@ -359,7 +394,7 @@ func TestChatMiddleware(t *testing.T) {
return
}
if diff := cmp.Diff(&tc.req, capturedRequest); diff != "" {
t.Fatalf("requests did not match: %+v", diff)
t.Fatalf("requests did not match (-want +got):\n%s", diff)
}
if diff := cmp.Diff(tc.err, errResp); diff != "" {
t.Fatalf("errors did not match for %s:\n%s", tc.name, diff)
@@ -493,12 +528,14 @@ func TestCompletionsMiddleware(t *testing.T) {
}
}
if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
t.Fatal("requests did not match")
if capturedRequest != nil {
if diff := cmp.Diff(tc.req, *capturedRequest); diff != "" {
t.Fatalf("requests did not match (-want +got):\n%s", diff)
}
}
if !reflect.DeepEqual(tc.err, errResp) {
t.Fatal("errors did not match")
if diff := cmp.Diff(tc.err, errResp); diff != "" {
t.Fatalf("errors did not match (-want +got):\n%s", diff)
}
capturedRequest = nil
@@ -577,12 +614,14 @@ func TestEmbeddingsMiddleware(t *testing.T) {
}
}
if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
t.Fatal("requests did not match")
if capturedRequest != nil {
if diff := cmp.Diff(tc.req, *capturedRequest); diff != "" {
t.Fatalf("requests did not match (-want +got):\n%s", diff)
}
}
if !reflect.DeepEqual(tc.err, errResp) {
t.Fatal("errors did not match")
if diff := cmp.Diff(tc.err, errResp); diff != "" {
t.Fatalf("errors did not match (-want +got):\n%s", diff)
}
capturedRequest = nil
@@ -656,8 +695,8 @@ func TestListMiddleware(t *testing.T) {
t.Fatalf("failed to unmarshal actual response: %v", err)
}
if !reflect.DeepEqual(expected, actual) {
t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual)
if diff := cmp.Diff(expected, actual); diff != "" {
t.Errorf("responses did not match (-want +got):\n%s", diff)
}
}
}
@@ -722,8 +761,8 @@ func TestRetrieveMiddleware(t *testing.T) {
t.Fatalf("failed to unmarshal actual response: %v", err)
}
if !reflect.DeepEqual(expected, actual) {
t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual)
if diff := cmp.Diff(expected, actual); diff != "" {
t.Errorf("responses did not match (-want +got):\n%s", diff)
}
}
}

View File

@@ -490,7 +490,6 @@ func TestParseFileParameters(t *testing.T) {
"top_k 1": {"top_k", "1"},
"top_p 1.0": {"top_p", "1.0"},
"min_p 0.05": {"min_p", "0.05"},
"tfs_z 1.0": {"tfs_z", "1.0"},
"typical_p 1.0": {"typical_p", "1.0"},
"repeat_last_n 1": {"repeat_last_n", "1"},
"temperature 1.0": {"temperature", "1.0"},