Compare commits
16 Commits
v0.5.7
...
parth/set-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b4de2e9189 | ||
|
|
61a5254115 | ||
|
|
53d2cf37d2 | ||
|
|
75f88e7aac | ||
|
|
4982089c84 | ||
|
|
8c231b0826 | ||
|
|
16abd181a9 | ||
|
|
5c2f35d846 | ||
|
|
6de3227841 | ||
|
|
35e97db03b | ||
|
|
2ef3c803a1 | ||
|
|
453e4d090b | ||
|
|
ca2f9843c8 | ||
|
|
294b6f5a22 | ||
|
|
7bb356c680 | ||
|
|
021817e59a |
@@ -369,6 +369,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
|
||||
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
|
||||
- [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
|
||||
- [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
|
||||
|
||||
### Cloud
|
||||
|
||||
@@ -481,6 +482,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [GoLamify](https://github.com/prasad89/golamify)
|
||||
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
|
||||
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
|
||||
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
|
||||
|
||||
### Mobile
|
||||
|
||||
@@ -539,4 +541,5 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
### Observability
|
||||
|
||||
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
|
||||
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
|
||||
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
|
||||
- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
|
||||
|
||||
@@ -38,7 +38,7 @@ Numeric IDs may be used, however ordering may vary, so UUIDs are more reliable.
|
||||
You can discover the UUID of your GPUs by running `nvidia-smi -L` If you want to
|
||||
ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1")
|
||||
|
||||
### Laptop Suspend Resume
|
||||
### Linux Suspend Resume
|
||||
|
||||
On linux, after a suspend/resume cycle, sometimes Ollama will fail to discover
|
||||
your NVIDIA GPU, and fallback to running on the CPU. You can workaround this
|
||||
|
||||
@@ -67,8 +67,6 @@ To use this:
|
||||
3. `ollama run choose-a-model-name`
|
||||
4. Start using the model!
|
||||
|
||||
More examples are available in the [examples directory](../examples).
|
||||
|
||||
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
|
||||
|
||||
```bash
|
||||
@@ -155,7 +153,6 @@ PARAMETER <parameter> <parametervalue>
|
||||
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
|
||||
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
|
||||
| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |
|
||||
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
|
||||
| num_predict | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation) | int | num_predict 42 |
|
||||
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
|
||||
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |
|
||||
|
||||
@@ -204,6 +204,45 @@ curl http://localhost:11434/v1/embeddings \
|
||||
}'
|
||||
```
|
||||
|
||||
## Extra arguments
|
||||
|
||||
### Setting context length
|
||||
- `context_length` parameter can be used to set the context length for the model
|
||||
|
||||
#### OpenAI python library
|
||||
- OpenAI python library does not support setting context length, however this can be set for Ollama through the `extra_body` parameter
|
||||
|
||||
```py
|
||||
completion = client.chat.completions.create(
|
||||
model="llama3.1:8b",
|
||||
messages=[{"role": "user", "content": "Say this is a test"}],
|
||||
extra_body={"context_length": 4096},
|
||||
)
|
||||
```
|
||||
|
||||
#### OpenAI JavaScript library
|
||||
- OpenAI JavaScript library does not support setting context length, however this can be set for Ollama by passing `context_length` directly with a `@ts-expect-error` as an undocumented parameter in the OpenAI JavaScript library. [See documentation here](https://github.com/openai/openai-node?tab=readme-ov-file#making-customundocumented-requests)
|
||||
|
||||
```ts
|
||||
const chatCompletion = await openai.chat.completions.create({
|
||||
messages: [{ role: 'user', content: 'Say this is a test' }],
|
||||
model: 'llama3.2',
|
||||
// @ts-expect-error context_length is an additional parameter
|
||||
context_length: 4096,
|
||||
})
|
||||
```
|
||||
|
||||
#### `curl`
|
||||
```shell
|
||||
curl http://localhost:11434/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "llama3.2",
|
||||
"messages": [{"role": "user", "content": "Say this is a test"}],
|
||||
"context_length": 4096
|
||||
}'
|
||||
```
|
||||
|
||||
## Endpoints
|
||||
|
||||
### `/v1/chat/completions`
|
||||
@@ -213,6 +252,7 @@ curl http://localhost:11434/v1/embeddings \
|
||||
- [x] Chat completions
|
||||
- [x] Streaming
|
||||
- [x] JSON mode
|
||||
- [x] Structured outputs
|
||||
- [x] Reproducible outputs
|
||||
- [x] Vision
|
||||
- [x] Tools
|
||||
@@ -339,27 +379,3 @@ curl http://localhost:11434/v1/chat/completions \
|
||||
}'
|
||||
```
|
||||
|
||||
### Setting the context size
|
||||
|
||||
The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
|
||||
|
||||
```modelfile
|
||||
FROM <some model>
|
||||
PARAMETER num_ctx <context size>
|
||||
```
|
||||
|
||||
Use the `ollama create mymodel` command to create a new model with the updated context size. Call the API with the updated model name:
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "mymodel",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello!"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -80,10 +80,12 @@ type StreamOptions struct {
|
||||
}
|
||||
|
||||
type ChatCompletionRequest struct {
|
||||
Model string `json:"model"`
|
||||
Messages []Message `json:"messages"`
|
||||
Stream bool `json:"stream"`
|
||||
StreamOptions *StreamOptions `json:"stream_options"`
|
||||
Model string `json:"model"`
|
||||
Messages []Message `json:"messages"`
|
||||
Stream bool `json:"stream"`
|
||||
StreamOptions *StreamOptions `json:"stream_options"`
|
||||
MaxCompletionTokens *int `json:"max_completion_tokens"`
|
||||
// Deprecated: Use [ChatCompletionRequest.MaxCompletionTokens]
|
||||
MaxTokens *int `json:"max_tokens"`
|
||||
Seed *int `json:"seed"`
|
||||
Stop any `json:"stop"`
|
||||
@@ -93,6 +95,7 @@ type ChatCompletionRequest struct {
|
||||
TopP *float64 `json:"top_p"`
|
||||
ResponseFormat *ResponseFormat `json:"response_format"`
|
||||
Tools []api.Tool `json:"tools"`
|
||||
ContextLength *int `json:"context_length"`
|
||||
}
|
||||
|
||||
type ChatCompletion struct {
|
||||
@@ -475,8 +478,17 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
options["stop"] = stops
|
||||
}
|
||||
|
||||
if r.ContextLength != nil {
|
||||
options["num_ctx"] = *r.ContextLength
|
||||
}
|
||||
|
||||
// Deprecated: MaxTokens is deprecated, use MaxCompletionTokens instead
|
||||
if r.MaxTokens != nil {
|
||||
options["num_predict"] = *r.MaxTokens
|
||||
r.MaxCompletionTokens = r.MaxTokens
|
||||
}
|
||||
|
||||
if r.MaxCompletionTokens != nil {
|
||||
options["num_predict"] = *r.MaxCompletionTokens
|
||||
}
|
||||
|
||||
if r.Temperature != nil {
|
||||
@@ -962,6 +974,7 @@ func ChatMiddleware() gin.HandlerFunc {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
|
||||
return
|
||||
}
|
||||
slog.Info("num_ctx", "num_ctx", chatReq.Options["num_ctx"])
|
||||
|
||||
if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
|
||||
|
||||
@@ -7,7 +7,6 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -315,6 +314,42 @@ func TestChatMiddleware(t *testing.T) {
|
||||
Stream: &True,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "chat handler with context_length",
|
||||
body: `{
|
||||
"model": "test-model",
|
||||
"messages": [{"role": "user", "content": "Hello"}],
|
||||
"context_length": 4096
|
||||
}`,
|
||||
req: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{{Role: "user", Content: "Hello"}},
|
||||
Options: map[string]any{
|
||||
"num_ctx": 4096.0, // float because JSON doesn't distinguish between float and int
|
||||
"temperature": 1.0,
|
||||
"top_p": 1.0,
|
||||
},
|
||||
Stream: &False,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "chat handler with max_completion_tokens",
|
||||
body: `{
|
||||
"model": "test-model",
|
||||
"messages": [{"role": "user", "content": "Hello"}],
|
||||
"max_completion_tokens": 2
|
||||
}`,
|
||||
req: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{{Role: "user", Content: "Hello"}},
|
||||
Options: map[string]any{
|
||||
"num_predict": 2.0, // float because JSON doesn't distinguish between float and int
|
||||
"temperature": 1.0,
|
||||
"top_p": 1.0,
|
||||
},
|
||||
Stream: &False,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "chat handler error forwarding",
|
||||
body: `{
|
||||
@@ -359,7 +394,7 @@ func TestChatMiddleware(t *testing.T) {
|
||||
return
|
||||
}
|
||||
if diff := cmp.Diff(&tc.req, capturedRequest); diff != "" {
|
||||
t.Fatalf("requests did not match: %+v", diff)
|
||||
t.Fatalf("requests did not match (-want +got):\n%s", diff)
|
||||
}
|
||||
if diff := cmp.Diff(tc.err, errResp); diff != "" {
|
||||
t.Fatalf("errors did not match for %s:\n%s", tc.name, diff)
|
||||
@@ -493,12 +528,14 @@ func TestCompletionsMiddleware(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
|
||||
t.Fatal("requests did not match")
|
||||
if capturedRequest != nil {
|
||||
if diff := cmp.Diff(tc.req, *capturedRequest); diff != "" {
|
||||
t.Fatalf("requests did not match (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(tc.err, errResp) {
|
||||
t.Fatal("errors did not match")
|
||||
if diff := cmp.Diff(tc.err, errResp); diff != "" {
|
||||
t.Fatalf("errors did not match (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
capturedRequest = nil
|
||||
@@ -577,12 +614,14 @@ func TestEmbeddingsMiddleware(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
|
||||
t.Fatal("requests did not match")
|
||||
if capturedRequest != nil {
|
||||
if diff := cmp.Diff(tc.req, *capturedRequest); diff != "" {
|
||||
t.Fatalf("requests did not match (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(tc.err, errResp) {
|
||||
t.Fatal("errors did not match")
|
||||
if diff := cmp.Diff(tc.err, errResp); diff != "" {
|
||||
t.Fatalf("errors did not match (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
capturedRequest = nil
|
||||
@@ -656,8 +695,8 @@ func TestListMiddleware(t *testing.T) {
|
||||
t.Fatalf("failed to unmarshal actual response: %v", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(expected, actual) {
|
||||
t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual)
|
||||
if diff := cmp.Diff(expected, actual); diff != "" {
|
||||
t.Errorf("responses did not match (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -722,8 +761,8 @@ func TestRetrieveMiddleware(t *testing.T) {
|
||||
t.Fatalf("failed to unmarshal actual response: %v", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(expected, actual) {
|
||||
t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual)
|
||||
if diff := cmp.Diff(expected, actual); diff != "" {
|
||||
t.Errorf("responses did not match (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -490,7 +490,6 @@ func TestParseFileParameters(t *testing.T) {
|
||||
"top_k 1": {"top_k", "1"},
|
||||
"top_p 1.0": {"top_p", "1.0"},
|
||||
"min_p 0.05": {"min_p", "0.05"},
|
||||
"tfs_z 1.0": {"tfs_z", "1.0"},
|
||||
"typical_p 1.0": {"typical_p", "1.0"},
|
||||
"repeat_last_n 1": {"repeat_last_n", "1"},
|
||||
"temperature 1.0": {"temperature", "1.0"},
|
||||
|
||||
Reference in New Issue
Block a user