Compare commits

...

34 Commits

Author SHA1 Message Date
Jeffrey Morgan
aed1419c64 ci: skip go build for tests (#7899) 2024-12-04 21:22:36 -08:00
Parth Sareen
c6c526275d api: add generate endpoint for structured outputs (#7939) 2024-12-04 17:37:12 -08:00
Parth Sareen
630e7dc6ff api: structured outputs - chat endpoint (#7900)
Adds structured outputs to chat endpoint
---------

Co-authored-by: Michael Yang <mxyng@pm.me>
Co-authored-by: Hieu Nguyen <hieunguyen1053@outlook.com>
2024-12-04 16:31:19 -08:00
Michael Yang
eb8366d658 Merge pull request #7932 from ollama/mxyng/fix-merges 2024-12-04 10:04:52 -08:00
Michael Yang
4456012956 fix unmarshaling merges 2024-12-04 09:21:56 -08:00
Sam
539be43640 llm: normalise kvct parameter handling (#7926) 2024-12-03 16:30:40 -08:00
Sam
1bdab9fdb1 llm: introduce k/v context quantization (vRAM improvements) (#6279) 2024-12-03 15:57:19 -08:00
owboson
2b82c5a8a1 docs: correct default num_predict value in modelfile.md (#7693) 2024-12-03 15:00:05 -08:00
Tigran
55c3efa900 docs: remove extra quote in modelfile.md (#7908) 2024-12-02 09:28:56 -08:00
David Mayboroda
1aedffad93 readme: add minima to community integrations (#7906) 2024-12-02 01:14:47 -08:00
Jeffrey Morgan
ff6c2d6dc8 cmd: don't rely on reading repo file for test (#7898) 2024-11-30 14:12:53 -08:00
Jeffrey Morgan
d543b282a7 server: add warning message for deprecated context field (#7878) 2024-11-30 14:05:50 -08:00
Parth Sareen
5f8051180e Enable index tracking for tools - openai api support (#7888) 2024-11-29 20:00:09 -08:00
Jeffrey Morgan
39e29ae5dd llama: fix typo and formatting in readme (#7876) 2024-11-28 17:27:11 -08:00
TheCookingSenpai
30a9f063c9 readme: add SpaceLlama, YouLama, and DualMind to community integrations (#7216) 2024-11-28 15:16:27 -08:00
Parth Sareen
ce7455a8e1 api: enable tool streaming (#7836) 2024-11-27 13:40:57 -08:00
ItzCrazyKns
e3936d4fb3 Support Multiple LoRa Adapters (#7667)
Closes #7627
2024-11-27 11:00:04 -08:00
Bruce MacDonald
940e62772e openai: remove unused error code (#7850)
The writeError takes a code argument which is no longer used. Remove it for clarity.
2024-11-26 16:08:09 -08:00
Jesse Gross
71e6a0d0d1 runner.go: Don't try to extract image tags for text models
When processing a prompt, we look for image tags of the form
[img-0], which are inserted by the Ollama server process.
However, this can cause errors if the original prompt has these
tags - typically an image not found error is returned.

This changes tag searching behavior to be similar to the 0.3.x
series, which will largely avoid these problems. However,they can
still happen when input text with these tags is used with image
models. The correct solution is to escape the tags but this is a
larger issue with special sequences in general so this is an
incremental fix that should avoid the problem for the majority
of cases.
2024-11-26 13:23:24 -08:00
Jesse Gross
2cd11ae365 runner.go: Add unit tests for context shifting
This also makes it easier to truncate long inputs the same as
shifting but does not actually implement it. This type of
truncation has a trade off between quality and time to first
token.
2024-11-26 11:21:35 -08:00
jake83741
52bbad12f9 readme: update description for vnc-lm community integration (#7832) 2024-11-25 17:56:30 -08:00
frob
30e88d7f31 cmd: don't submit svg files as images for now (#7830) 2024-11-25 16:43:29 -08:00
Blake Mizerany
2b7ed61ca2 server: fix Transport override (#7834)
This changes makeRequest to update the http client Transport if and only
if testMakeRequestDialContext is set. This is to avoid overriding the
default Transport when testMakeRequestDialContext is nil, which broke
existing behavior, included proxies, timeouts, and other behaviors.

Fixes #7829
Fixes #7788
2024-11-25 15:08:34 -08:00
Shikhar Bakhda
647513a7d4 readme: add HoneyHive to community integrations (#7831) 2024-11-25 09:55:33 -08:00
Bruce MacDonald
a210ec74d2 cmd: print location of model after pushing (#7695)
After a user pushes their model it is not clear what to do next. Add a link
to the output of `ollama push` that tells the user where their model can now
be found.
2024-11-25 09:40:16 -08:00
Simon Schampijer
cfb1ddd6fc examples: update langchain-python-simple (#3591)
- better formatting of input prompt
- use invoke instead of predict
2024-11-24 16:06:22 -08:00
reid41
3987acd7ec readme: add descriptions for QA-Pilot and shell-pilot community integrations (#4303) 2024-11-24 15:55:09 -08:00
frob
fda1e6b563 llm: bring fileTypes into alignment with llama.cpp (#7819) 2024-11-24 10:33:33 -08:00
Adarsh Mishra
3440ffb37b readme: add description for OpenTalkGpt in community integrations (#7818) 2024-11-24 10:32:23 -08:00
Patcher
a820d2b267 readme: add observability section with OpenLIT to community-integrations 2024-11-23 18:03:12 -08:00
Meng Zhuo
2ebdb54fb3 all: update math32 go mod to v1.11.0 (#6627) 2024-11-23 15:21:54 -08:00
josc146
bb52abfa55 readme: add ChatGPTBox and RWKV-Runner to community integrations (#4118) 2024-11-23 13:31:27 -08:00
oza6ut0ne
31cb1ca9e5 openai: accept X-Stainless-Retry-Count header (#6910) 2024-11-23 12:39:05 -08:00
Rodrigo Ribeiro Gomes
78f779a323 readme: add powershai, a powershell module with ollama support to community integrations (#7438) 2024-11-23 10:08:59 -08:00
36 changed files with 1109 additions and 160 deletions

View File

@@ -243,7 +243,7 @@ jobs:
$env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
make -j 4
make -j 4
- name: 'Build Unix Go Runners'
if: ${{ ! startsWith(matrix.os, 'windows-') }}
run: make -j 4
@@ -310,8 +310,7 @@ jobs:
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
- run: go build
- run: go test -v ./...
- run: go test ./...
patches:
needs: [changes]
@@ -323,4 +322,4 @@ jobs:
submodules: recursive
- name: Verify patches carry all the changes
run: |
make apply-patches sync && git diff --compact-summary --exit-code llama
make apply-patches sync && git diff --compact-summary --exit-code llama

View File

@@ -298,7 +298,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
- [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
@@ -308,6 +308,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
- [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
- [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
- [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
@@ -345,16 +346,20 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
- [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.)
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [SpaceLlama](https://github.com/tcsenpai/spacellama) (Firefox and Chrome extension to quickly summarize web pages with ollama in a sidebar)
- [YouLama](https://github.com/tcsenpai/youlama) (Webapp to quickly summarize any YouTube video, supporting Invidious as well)
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt)
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
- [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama)
- [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama)
- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux)
- [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support)
- [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
### Cloud
@@ -376,7 +381,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
- [cmdh](https://github.com/pgibler/cmdh)
- [ooo](https://github.com/npahlfer/ooo)
- [shell-pilot](https://github.com/reid41/shell-pilot)
- [shell-pilot](https://github.com/reid41/shell-pilot)(Interact with models via pure shell scripts on Linux or macOS)
- [tenere](https://github.com/pythops/tenere)
- [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
- [typechat-cli](https://github.com/anaisbetts/typechat-cli)
@@ -392,6 +397,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [bb7](https://github.com/drunkwcodes/bb7)
- [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage)
- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
### Apple Vision Pro
@@ -496,12 +502,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
@@ -511,3 +518,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
### Observability
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.

View File

@@ -67,7 +67,7 @@ type GenerateRequest struct {
Raw bool `json:"raw,omitempty"`
// Format specifies the format to return a response in.
Format string `json:"format"`
Format json.RawMessage `json:"format,omitempty"`
// KeepAlive controls how long the model will stay loaded in memory following
// this request.
@@ -94,7 +94,7 @@ type ChatRequest struct {
Stream *bool `json:"stream,omitempty"`
// Format is the format to return the response in (e.g. "json").
Format string `json:"format"`
Format json.RawMessage `json:"format,omitempty"`
// KeepAlive controls how long the model will stay loaded into memory
// following the request.
@@ -146,6 +146,7 @@ type ToolCall struct {
}
type ToolCallFunction struct {
Index int `json:"index,omitempty"`
Name string `json:"name"`
Arguments ToolCallFunctionArguments `json:"arguments"`
}

View File

@@ -8,6 +8,7 @@ import (
"crypto/ed25519"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"encoding/pem"
"errors"
"fmt"
@@ -39,6 +40,7 @@ import (
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
@@ -558,6 +560,8 @@ func PushHandler(cmd *cobra.Command, args []string) error {
}
request := api.PushRequest{Name: args[0], Insecure: insecure}
n := model.ParseName(args[0])
if err := client.Push(cmd.Context(), &request, fn); err != nil {
if spinner != nil {
spinner.Stop()
@@ -568,7 +572,16 @@ func PushHandler(cmd *cobra.Command, args []string) error {
return err
}
p.Stop()
spinner.Stop()
destination := n.String()
if strings.HasSuffix(n.Host, ".ollama.ai") || strings.HasSuffix(n.Host, ".ollama.com") {
destination = "https://ollama.com/" + strings.TrimSuffix(n.DisplayShortest(), ":latest")
}
fmt.Printf("\nYou can find your model at:\n\n")
fmt.Printf("\t%s\n", destination)
return nil
}
@@ -1026,7 +1039,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
req := &api.ChatRequest{
Model: opts.Model,
Messages: opts.Messages,
Format: opts.Format,
Format: json.RawMessage(opts.Format),
Options: opts.Options,
}
@@ -1113,7 +1126,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
Prompt: opts.Prompt,
Context: generateContext,
Images: opts.Images,
Format: opts.Format,
Format: json.RawMessage(opts.Format),
System: opts.System,
Options: opts.Options,
KeepAlive: opts.KeepAlive,
@@ -1433,6 +1446,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_SCHED_SPREAD"],
envVars["OLLAMA_TMPDIR"],
envVars["OLLAMA_FLASH_ATTENTION"],
envVars["OLLAMA_KV_CACHE_TYPE"],
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],

View File

@@ -4,10 +4,10 @@ import (
"bytes"
"context"
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
@@ -179,18 +179,14 @@ Weigh anchor!
t.Run("license", func(t *testing.T) {
var b bytes.Buffer
license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
if err != nil {
t.Fatal(err)
}
license := "MIT License\nCopyright (c) Ollama\n"
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
License: string(license),
License: license,
}, &b); err != nil {
t.Fatal(err)
}
@@ -369,3 +365,127 @@ func TestGetModelfileName(t *testing.T) {
})
}
}
func TestPushHandler(t *testing.T) {
tests := []struct {
name string
modelName string
serverResponse map[string]func(w http.ResponseWriter, r *http.Request)
expectedError string
expectedOutput string
}{
{
name: "successful push",
modelName: "test-model",
serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
"/api/push": func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
t.Errorf("expected POST request, got %s", r.Method)
}
var req api.PushRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
if req.Name != "test-model" {
t.Errorf("expected model name 'test-model', got %s", req.Name)
}
// Simulate progress updates
responses := []api.ProgressResponse{
{Status: "preparing manifest"},
{Digest: "sha256:abc123456789", Total: 100, Completed: 50},
{Digest: "sha256:abc123456789", Total: 100, Completed: 100},
}
for _, resp := range responses {
if err := json.NewEncoder(w).Encode(resp); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.(http.Flusher).Flush()
}
},
},
expectedOutput: "\nYou can find your model at:\n\n\thttps://ollama.com/test-model\n",
},
{
name: "unauthorized push",
modelName: "unauthorized-model",
serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
"/api/push": func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusUnauthorized)
err := json.NewEncoder(w).Encode(map[string]string{
"error": "access denied",
})
if err != nil {
t.Fatal(err)
}
},
},
expectedError: "you are not authorized to push to this namespace, create the model under a namespace you own",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if handler, ok := tt.serverResponse[r.URL.Path]; ok {
handler(w, r)
return
}
http.Error(w, "not found", http.StatusNotFound)
}))
defer mockServer.Close()
t.Setenv("OLLAMA_HOST", mockServer.URL)
cmd := &cobra.Command{}
cmd.Flags().Bool("insecure", false, "")
cmd.SetContext(context.TODO())
// Redirect stderr to capture progress output
oldStderr := os.Stderr
r, w, _ := os.Pipe()
os.Stderr = w
// Capture stdout for the "Model pushed" message
oldStdout := os.Stdout
outR, outW, _ := os.Pipe()
os.Stdout = outW
err := PushHandler(cmd, []string{tt.modelName})
// Restore stderr
w.Close()
os.Stderr = oldStderr
// drain the pipe
if _, err := io.ReadAll(r); err != nil {
t.Fatal(err)
}
// Restore stdout and get output
outW.Close()
os.Stdout = oldStdout
stdout, _ := io.ReadAll(outR)
if tt.expectedError == "" {
if err != nil {
t.Errorf("expected no error, got %v", err)
}
if tt.expectedOutput != "" {
if got := string(stdout); got != tt.expectedOutput {
t.Errorf("expected output %q, got %q", tt.expectedOutput, got)
}
}
} else {
if err == nil || !strings.Contains(err.Error(), tt.expectedError) {
t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
}
}
})
}
}

View File

@@ -514,7 +514,7 @@ func extractFileNames(input string) []string {
// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
// and followed by more characters and a file extension
// This will capture non filename strings, but we'll check for file existence to remove mismatches
regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
re := regexp.MustCompile(regexPattern)
return re.FindAllString(input, -1)

View File

@@ -12,44 +12,45 @@ import (
func TestExtractFilenames(t *testing.T) {
// Unix style paths
input := ` some preamble
./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2
/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg`
./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
res := extractFileNames(input)
assert.Len(t, res, 5)
assert.Contains(t, res[0], "one.png")
assert.Contains(t, res[1], "two.jpg")
assert.Contains(t, res[2], "three.jpeg")
assert.Contains(t, res[3], "four.png")
assert.Contains(t, res[4], "five.svg")
assert.Contains(t, res[4], "five.JPG")
assert.NotContains(t, res[4], '"')
assert.NotContains(t, res, "inbtween")
assert.NotContains(t, res, "inbetween1")
assert.NotContains(t, res, "./1.svg")
// Windows style paths
input = ` some preamble
c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2
/absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6
d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending
./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
`
res = extractFileNames(input)
assert.Len(t, res, 10)
assert.NotContains(t, res, "inbtween")
assert.NotContains(t, res, "inbetween2")
assert.Contains(t, res[0], "one.png")
assert.Contains(t, res[0], "c:")
assert.Contains(t, res[1], "two.jpg")
assert.Contains(t, res[1], "c:")
assert.Contains(t, res[2], "three.jpeg")
assert.Contains(t, res[3], "four.png")
assert.Contains(t, res[4], "five.svg")
assert.Contains(t, res[4], "five.JPG")
assert.Contains(t, res[5], "six.png")
assert.Contains(t, res[6], "seven.svg")
assert.Contains(t, res[6], "seven.JPEG")
assert.Contains(t, res[6], "d:")
assert.Contains(t, res[7], "eight.png")
assert.Contains(t, res[7], "c:")
assert.Contains(t, res[8], "nine.png")
assert.Contains(t, res[8], "d:")
assert.Contains(t, res[9], "ten.svg")
assert.Contains(t, res[9], "ten.PNG")
assert.Contains(t, res[9], "E:")
}

View File

@@ -10,6 +10,7 @@ import (
"log/slog"
"os"
"slices"
"strings"
"golang.org/x/exp/maps"
)
@@ -60,7 +61,25 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
addedTokens[t.Content] = t
}
t.Merges = tt.Model.Merges
if len(tt.Model.Merges) == 0 {
// noop; merges is empty
} else if err := json.Unmarshal(tt.Model.Merges, &t.Merges); err == nil {
// noop; merges is []string
} else if merges, err := func() ([][]string, error) {
var merges [][]string
if err := json.Unmarshal(tt.Model.Merges, &merges); err != nil {
return nil, err
}
return merges, nil
}(); err == nil {
t.Merges = make([]string, len(merges))
for i := range merges {
t.Merges[i] = strings.Join(merges[i], " ")
}
} else {
return nil, fmt.Errorf("could not parse tokenizer merges. expected []string or [][]string: %w", err)
}
sha256sum := sha256.New()
for _, pt := range tt.PreTokenizer.PreTokenizers {
@@ -156,9 +175,9 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
type tokenizer struct {
AddedTokens []token `json:"added_tokens"`
Model struct {
Type string `json:"type"`
Vocab map[string]int `json:"vocab"`
Merges []string `json:"merges"`
Type string `json:"type"`
Vocab map[string]int `json:"vocab"`
Merges json.RawMessage `json:"merges"`
} `json:"model"`
PreTokenizer struct {

View File

@@ -191,6 +191,62 @@ func TestParseTokenizer(t *testing.T) {
Pre: "default",
},
},
{
name: "list string merges",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"model": {
"merges": [
"a b",
"c d",
"e f"
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
},
Merges: []string{
"a b",
"c d",
"e f",
},
Pre: "default",
},
},
{
name: "list list string merges",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"model": {
"merges": [
[
"a", "b"
],
[
"c", "d"
],
[
"e", "f"
]
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
},
Merges: []string{
"a b",
"c d",
"e f",
},
Pre: "default",
},
},
}
for _, tt := range cases {

View File

@@ -183,3 +183,17 @@ func (si SystemInfo) GetOptimalThreadCount() int {
return coreCount
}
// For each GPU, check if it does NOT support flash attention
func (l GpuInfoList) FlashAttentionSupported() bool {
for _, gpu := range l {
supportsFA := gpu.Library == "metal" ||
(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
gpu.Library == "rocm"
if !supportsFA {
return false
}
}
return true
}

View File

@@ -49,10 +49,10 @@ Advanced parameters (optional):
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system message to (overrides what is defined in the `Modelfile`)
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
- `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
#### JSON mode

View File

@@ -151,7 +151,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:
```
```nginx
server {
listen 80;
server_name example.com; # Replace with your domain or IP
@@ -285,4 +285,28 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
## How does Ollama load models on multiple GPUs?
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
## How can I enable Flash Attention?
Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server.
## How can I set the quantization type for the K/V cache?
The K/V context cache can be quantized to significantly reduce memory usage when Flash Attention is enabled.
To use quantized K/V cache with Ollama you can set the following environment variable:
- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`.
> Note: Currently this is a global option - meaning all models will run with the specified quantization type.
The currently available K/V cache quantization types are:
- `f16` - high precision and memory usage (default).
- `q8_0` - 8-bit quantization, uses approximately 1/2 the memory of `f16` with a very small loss in precision, this usually has no noticeable impact on the model's quality (recommended if not using f16).
- `q4_0` - 4-bit quantization, uses approximately 1/4 the memory of `f16` with a small-medium loss in precision that may be more noticeable at higher context sizes.
How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
You may need to experiment with different quantization types to find the best balance between memory usage and quality.

View File

@@ -63,7 +63,7 @@ SYSTEM You are Mario from super mario bros, acting as an assistant.
To use this:
1. Save it as a file (e.g. `Modelfile`)
2. `ollama create choose-a-model-name -f <location of the file e.g. ./Modelfile>'`
2. `ollama create choose-a-model-name -f <location of the file e.g. ./Modelfile>`
3. `ollama run choose-a-model-name`
4. Start using the model!
@@ -156,7 +156,7 @@ PARAMETER <parameter> <parametervalue>
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation) | int | num_predict 42 |
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |
| min_p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float | min_p 0.05 |

View File

@@ -153,6 +153,8 @@ var (
Debug = Bool("OLLAMA_DEBUG")
// FlashAttention enables the experimental flash attention feature.
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
// KvCacheType is the quantization type for the K/V cache.
KvCacheType = String("OLLAMA_KV_CACHE_TYPE")
// NoHistory disables readline history.
NoHistory = Bool("OLLAMA_NOHISTORY")
// NoPrune disables pruning of model blobs on startup.
@@ -234,6 +236,7 @@ func AsMap() map[string]EnvVar {
ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
"OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},

View File

@@ -1,6 +1,6 @@
from langchain.llms import Ollama
input = input("What is your question?")
input = input("What is your question?\n> ")
llm = Ollama(model="llama3.2")
res = llm.predict(input)
res = llm.invoke(input)
print (res)

2
go.mod
View File

@@ -29,7 +29,7 @@ require (
github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/chewxy/hm v1.0.0 // indirect
github.com/chewxy/math32 v1.10.1 // indirect
github.com/chewxy/math32 v1.11.0 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect

10
go.sum
View File

@@ -21,8 +21,8 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
github.com/chewxy/math32 v1.10.1 h1:LFpeY0SLJXeaiej/eIp2L40VYfscTvKh/FSEZ68uMkU=
github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
github.com/chewxy/math32 v1.11.0 h1:8sek2JWqeaKkVnHa7bPVqCEOUPbARo4SGxs6toKyAOo=
github.com/chewxy/math32 v1.11.0/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
@@ -231,8 +231,6 @@ golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+o
golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/image v0.14.0 h1:tNgSxAFe3jC4uYqvZdTr84SZoM1KfwdC9SKIFrLjFn4=
golang.org/x/image v0.14.0/go.mod h1:HUYqC05R2ZcZ3ejNQsIHQDQiwWM4JBqmm6MKANTp4LE=
golang.org/x/image v0.22.0 h1:UtK5yLUzilVrkjMAZAZ34DXGpASN8i8pj8g+O+yd10g=
golang.org/x/image v0.22.0/go.mod h1:9hPFhljd4zZ1GNSIZJ49sqbp45GKK9t6w+iXvGqZUz4=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
@@ -268,8 +266,6 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -296,8 +292,6 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

View File

@@ -93,7 +93,7 @@ make -j
## Vendoring
Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
@@ -105,35 +105,35 @@ make apply-patches
**Pin to new base commit**
To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring.env`
To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring`
#### Applying patches
When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
```
make apply-patches
```
If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
```
make create-patches sync
```
Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
### Generating Patches
When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
```
make apply-patches
```
Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
```
make sync
@@ -142,9 +142,9 @@ go build .
```
> [!IMPORTANT]
> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
```
make create-patches
@@ -157,4 +157,4 @@ In your `./vendor/` directory, create a branch, and cherry-pick the new commit t
Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches.
After your PR upstream is merged, follow the **Updating Base Commit** instructions above, however first remove your patch before running `apply-patches` since the new base commit contains your change already.
After your PR upstream is merged, follow the **Updating Base Commit** instructions above, however first remove your patch before running `apply-patches` since the new base commit contains your change already.

View File

@@ -85,9 +85,12 @@ COMPILER inline get_compiler() {
import "C"
import (
"bytes"
_ "embed"
"encoding/json"
"errors"
"fmt"
"log/slog"
"runtime"
"runtime/cgo"
"slices"
@@ -140,7 +143,7 @@ type ContextParams struct {
c C.struct_llama_context_params
}
func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool) ContextParams {
func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
params := C.llama_context_default_params()
params.n_ctx = C.uint(numCtx)
params.n_batch = C.uint(batchSize)
@@ -149,9 +152,28 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
params.n_threads_batch = params.n_threads
params.embeddings = C.bool(true)
params.flash_attn = C.bool(flashAttention)
params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
return ContextParams{c: params}
}
// kvCacheTypeFromStr converts a string cache type to the corresponding GGML type value
func kvCacheTypeFromStr(s string) C.enum_ggml_type {
if s == "" {
return C.GGML_TYPE_F16
}
switch s {
case "q8_0":
return C.GGML_TYPE_Q8_0
case "q4_0":
return C.GGML_TYPE_Q4_0
default:
return C.GGML_TYPE_F16
}
}
type Context struct {
c *C.struct_llama_context
numThreads int
@@ -680,3 +702,33 @@ func (s *SamplingContext) Sample(llamaContext *Context, idx int) int {
func (s *SamplingContext) Accept(id int, applyGrammar bool) {
C.gpt_sampler_caccept(s.c, C.llama_token(id), C.bool(applyGrammar))
}
type JsonSchema struct {
Defs map[string]any `json:"$defs,omitempty"`
Properties map[string]any `json:"properties,omitempty"`
Required []string `json:"required,omitempty"`
Title string `json:"title,omitempty"`
Type string `json:"type,omitempty"`
}
func (js JsonSchema) AsGrammar() string {
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(js); err != nil {
return ""
}
cStr := C.CString(b.String())
defer C.free(unsafe.Pointer(cStr))
// Allocate buffer for grammar output with reasonable size
const maxLen = 32768 // 32KB
buf := make([]byte, maxLen)
// Call C function to convert schema to grammar
length := C.schema_to_grammar(cStr, (*C.char)(unsafe.Pointer(&buf[0])), C.size_t(maxLen))
if length == 0 {
slog.Warn("unable to convert schema to grammar")
}
return string(buf[:length])
}

View File

@@ -1 +1,70 @@
package llama
import (
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestJsonSchema(t *testing.T) {
testCases := []struct {
name string
schema JsonSchema
expected string
}{
{
name: "empty schema",
schema: JsonSchema{
Type: "object",
},
expected: `array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null`,
},
{
name: "invalid schema with circular reference",
schema: JsonSchema{
Type: "object",
Properties: map[string]any{
"self": map[string]any{
"$ref": "#", // Self reference
},
},
},
expected: "", // Should return empty string for invalid schema
},
{
name: "schema with invalid type",
schema: JsonSchema{
Type: "invalid_type", // Invalid type
Properties: map[string]any{
"foo": map[string]any{
"type": "string",
},
},
},
expected: "", // Should return empty string for invalid schema
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
result := tc.schema.AsGrammar()
if !strings.EqualFold(strings.TrimSpace(result), strings.TrimSpace(tc.expected)) {
if diff := cmp.Diff(tc.expected, result); diff != "" {
t.Fatalf("grammar mismatch (-want +got):\n%s", diff)
}
}
})
}
}

View File

@@ -199,6 +199,20 @@ func countCommonPrefix(a []input, b []input) int {
return count
}
func (c *InputCache) ShiftDiscard(inputLen int, numKeep int) int {
targetFree := (c.numCtx - numKeep) / 2
targetFree = max(targetFree, 1)
currentFree := c.numCtx - inputLen
discard := targetFree - currentFree
if discard < 0 {
discard = 0
}
return discard
}
// Frees up space in the KV cache by deleting the oldest half of history and shifting
// the newest half into that space (saving numKeep inputs at the beginning).
//
@@ -208,11 +222,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error {
return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
}
targetFree := (c.numCtx - numKeep) / 2
targetFree = max(targetFree, 1)
currentFree := c.numCtx - len(slot.Inputs)
discard := targetFree - currentFree
discard := c.ShiftDiscard(len(slot.Inputs), numKeep)
if discard <= 0 {
return nil

View File

@@ -227,3 +227,66 @@ func TestFindCacheSlot(t *testing.T) {
})
}
}
func TestShiftDiscard(t *testing.T) {
tests := []struct {
name string
numCtx int
numKeep int
inputLen int
expected int
}{
{
name: "Shift",
numCtx: 2048,
numKeep: 5,
inputLen: 2048,
expected: 1021,
},
{
name: "Max Keep",
numCtx: 2048,
numKeep: 2047,
inputLen: 2048,
expected: 1,
},
{
name: "No Keep",
numCtx: 2048,
numKeep: 0,
inputLen: 2048,
expected: 1024,
},
{
name: "Truncate",
numCtx: 2048,
numKeep: 5,
inputLen: 5000,
expected: 3973,
},
{
name: "Truncate Keep",
numCtx: 2048,
numKeep: 2047,
inputLen: 5000,
expected: 2953,
},
{
name: "No Op",
numCtx: 2048,
numKeep: 5,
inputLen: 512,
expected: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
c := InputCache{numCtx: tt.numCtx}
result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
if result != tt.expected {
t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
}
})
}
}

View File

@@ -122,9 +122,11 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
params.numKeep = min(params.numKeep, s.cache.numCtx-1)
if len(inputs) > s.cache.numCtx {
slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
discard := len(inputs) - s.cache.numCtx
newInputs := inputs[:params.numKeep]
newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
newInputs = append(newInputs, inputs[params.numKeep+discard:]...)
slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", params.numKeep, "new", len(newInputs))
inputs = newInputs
}
@@ -162,10 +164,16 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
// generating image embeddings for each image
func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
var inputs []input
var parts []string
var matches [][]string
re := regexp.MustCompile(`\[img-(\d+)\]`)
parts := re.Split(prompt, -1)
matches := re.FindAllStringSubmatch(prompt, -1)
if s.image != nil {
re := regexp.MustCompile(`\[img-(\d+)\]`)
parts = re.Split(prompt, -1)
matches = re.FindAllStringSubmatch(prompt, -1)
} else {
parts = []string{prompt}
}
for i, part := range parts {
// text - tokenize
@@ -825,12 +833,24 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
}
}
type multiLPath []string
func (m *multiLPath) Set(value string) error {
*m = append(*m, value)
return nil
}
func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
func (s *Server) loadModel(
params llama.ModelParams,
mpath string,
lpath string,
lpath multiLPath,
ppath string,
kvSize int,
kvCacheType string,
flashAttention bool,
threads int,
multiUserCache bool,
@@ -843,16 +863,18 @@ func (s *Server) loadModel(
panic(err)
}
ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention)
ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType)
s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
if err != nil {
panic(err)
}
if lpath != "" {
err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
if err != nil {
panic(err)
if lpath.String() != "" {
for _, path := range lpath {
err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
if err != nil {
panic(err)
}
}
}
@@ -882,7 +904,7 @@ func main() {
mainGpu := flag.Int("main-gpu", 0, "Main GPU")
flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
lpath := flag.String("lora", "", "Path to lora layer file")
kvCacheType := flag.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := flag.Int("port", 8080, "Port to expose the server on")
threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
@@ -892,6 +914,9 @@ func main() {
multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
requirements := flag.Bool("requirements", false, "print json requirement information")
var lpaths multiLPath
flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
flag.Parse()
if *requirements {
printRequirements(os.Stdout)
@@ -938,7 +963,7 @@ func main() {
params := llama.ModelParams{
NumGpuLayers: *nGpuLayers,
MainGpu: *mainGpu,
UseMmap: !*noMmap && *lpath == "",
UseMmap: !*noMmap && lpaths.String() == "",
UseMlock: *mlock,
TensorSplit: tensorSplitFloats,
Progress: func(progress float32) {
@@ -947,7 +972,7 @@ func main() {
}
server.ready.Add(1)
go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)
server.cond = sync.NewCond(&server.mu)

View File

@@ -1,11 +1,13 @@
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
#include "sampling.h"
#include "sampling_ext.h"
#include "json-schema-to-grammar.h"
struct gpt_sampler *gpt_sampler_cinit(
const struct llama_model *model, struct gpt_sampler_cparams *params)
{
try {
try
{
gpt_sampler_params sparams;
sparams.top_k = params->top_k;
sparams.top_p = params->top_p;
@@ -24,7 +26,9 @@ struct gpt_sampler *gpt_sampler_cinit(
sparams.seed = params->seed;
sparams.grammar = params->grammar;
return gpt_sampler_init(model, sparams);
} catch (const std::exception & err) {
}
catch (const std::exception &err)
{
return nullptr;
}
}
@@ -54,3 +58,24 @@ void gpt_sampler_caccept(
{
gpt_sampler_accept(sampler, id, apply_grammar);
}
int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len)
{
try
{
nlohmann::json schema = nlohmann::json::parse(json_schema);
std::string grammar_str = json_schema_to_grammar(schema);
size_t len = grammar_str.length();
if (len >= max_len)
{
len = max_len - 1;
}
strncpy(grammar, grammar_str.c_str(), len);
return len;
}
catch (const std::exception &e)
{
strncpy(grammar, "", max_len - 1);
return 0;
}
}

View File

@@ -47,6 +47,8 @@ extern "C"
llama_token id,
bool apply_grammar);
int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len);
#ifdef __cplusplus
}
#endif

View File

@@ -32,9 +32,10 @@ const (
fileTypeIQ1_S
fileTypeIQ4_NL
fileTypeIQ3_S
fileTypeIQ3_M
fileTypeIQ2_S
fileTypeIQ4_XS
fileTypeIQ2_M
fileTypeIQ4_XS
fileTypeIQ1_M
fileTypeBF16
@@ -93,6 +94,8 @@ func ParseFileType(s string) (fileType, error) {
return fileTypeIQ4_NL, nil
case "IQ3_S":
return fileTypeIQ3_S, nil
case "IQ3_M":
return fileTypeIQ3_M, nil
case "IQ2_S":
return fileTypeIQ2_S, nil
case "IQ4_XS":
@@ -160,6 +163,8 @@ func (t fileType) String() string {
return "IQ4_NL"
case fileTypeIQ3_S:
return "IQ3_S"
case fileTypeIQ3_M:
return "IQ3_M"
case fileTypeIQ2_S:
return "IQ2_S"
case fileTypeIQ4_XS:

View File

@@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
}, offset, nil
}
func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
@@ -372,7 +372,8 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa
layers := llm.Tensors().Layers()
kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
switch llm.KV().Architecture() {
case "llama":
@@ -527,3 +528,34 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa
return
}
// SupportsKVCacheType checks if the requested cache type is supported
func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
return slices.Contains(validKVCacheTypes, cacheType)
}
// SupportsFlashAttention checks if the model supports flash attention
func (ggml GGML) SupportsFlashAttention() bool {
_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
if isEmbedding {
return false
}
// Check head counts match and are non-zero
headCountK := ggml.KV().EmbeddingHeadCountK()
headCountV := ggml.KV().EmbeddingHeadCountV()
return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}
// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
switch cacheType {
case "q8_0":
return 1 // 1/2 of fp16
case "q4_0":
return 0.5 // 1/4 of fp16
default:
return 2 // f16 (default)
}
}

View File

@@ -123,7 +123,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
slog.Warn("model missing blk.0 layer size")
}
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
fa := envconfig.FlashAttention() &&
discover.GetGPUInfo().FlashAttentionSupported() &&
ggml.SupportsFlashAttention()
var kvct string
if fa {
requested := strings.ToLower(envconfig.KvCacheType())
if requested != "" && ggml.SupportsKVCacheType(requested) {
kvct = requested
}
}
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
if graphPartialOffload == 0 {
graphPartialOffload = ggml.KV().GQA() * kv / 6
}
@@ -131,9 +147,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
graphFullOffload = graphPartialOffload
}
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
// on metal there's no partial offload overhead
if gpus[0].Library == "metal" {
graphPartialOffload = graphFullOffload

View File

@@ -15,6 +15,7 @@ import (
func TestEstimateGPULayers(t *testing.T) {
t.Setenv("OLLAMA_DEBUG", "1")
t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16
modelName := "dummy"
f, err := os.CreateTemp(t.TempDir(), modelName)

View File

@@ -144,10 +144,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
if len(adapters) > 1 {
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
}
rDir, err := runners.Refresh(build.EmbedFS)
if err != nil {
return nil, err
@@ -201,8 +197,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
}
if len(adapters) > 0 {
// TODO: applying multiple adapters is not supported by the llama.cpp server yet
params = append(params, "--lora", adapters[0])
for _, adapter := range adapters {
params = append(params, "--lora", adapter)
}
}
if len(projectors) > 0 {
@@ -217,15 +214,36 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
params = append(params, "--threads", strconv.Itoa(defaultThreads))
}
flashAttnEnabled := envconfig.FlashAttention()
fa := envconfig.FlashAttention()
if fa && !gpus.FlashAttentionSupported() {
slog.Warn("flash attention enabled but not supported by gpu")
fa = false
}
for _, g := range gpus {
// only cuda (compute capability 7+) and metal support flash attention
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
flashAttnEnabled = false
if fa && !ggml.SupportsFlashAttention() {
slog.Warn("flash attention enabled but not supported by model")
fa = false
}
kvct := strings.ToLower(envconfig.KvCacheType())
if fa {
slog.Info("enabling flash attention")
params = append(params, "--flash-attn")
// Flash Attention also supports kv cache quantization
// Enable if the requested and kv cache type is supported by the model
if kvct != "" && ggml.SupportsKVCacheType(kvct) {
params = append(params, "--kv-cache-type", kvct)
} else {
slog.Warn("kv cache type not supported by model", "type", kvct)
}
} else if kvct != "" && kvct != "f16" {
slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
}
// mmap has issues with partial offloading on metal
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
@@ -234,10 +252,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
}
}
if flashAttnEnabled {
params = append(params, "--flash-attn")
}
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
@@ -620,27 +634,22 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
@@ -670,7 +679,7 @@ type completion struct {
type CompletionRequest struct {
Prompt string
Format string
Format json.RawMessage
Images []ImageData
Options *api.Options
}
@@ -735,10 +744,22 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
return fmt.Errorf("unexpected server status: %s", status.ToString())
}
if req.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(req.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
// TODO (parthsareen): Move conversion to grammar with sampling logic
// API should do error handling for invalid formats
if req.Format != nil {
if strings.ToLower(strings.TrimSpace(string(req.Format))) == `"json"` {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(req.Prompt), "json") {
slog.Warn("prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
} else if schema, err := func() (llama.JsonSchema, error) {
var schema llama.JsonSchema
err := json.Unmarshal(req.Format, &schema)
return schema, err
}(); err == nil {
request["grammar"] = schema.AsGrammar()
} else {
slog.Warn(`format is neither a schema or "json"`, "format", req.Format)
}
}

View File

@@ -62,7 +62,12 @@ type Usage struct {
}
type ResponseFormat struct {
Type string `json:"type"`
Type string `json:"type"`
JsonSchema *JsonSchema `json:"json_schema,omitempty"`
}
type JsonSchema struct {
Schema map[string]any `json:"schema"`
}
type EmbedRequest struct {
@@ -140,6 +145,7 @@ type CompletionChunk struct {
type ToolCall struct {
ID string `json:"id"`
Index int `json:"index"`
Type string `json:"type"`
Function struct {
Name string `json:"name"`
@@ -200,12 +206,13 @@ func toolCallId() string {
return "call_" + strings.ToLower(string(b))
}
func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
toolCalls := make([]ToolCall, len(r.Message.ToolCalls))
for i, tc := range r.Message.ToolCalls {
func toToolCalls(tc []api.ToolCall) []ToolCall {
toolCalls := make([]ToolCall, len(tc))
for i, tc := range tc {
toolCalls[i].ID = toolCallId()
toolCalls[i].Type = "function"
toolCalls[i].Function.Name = tc.Function.Name
toolCalls[i].Index = tc.Function.Index
args, err := json.Marshal(tc.Function.Arguments)
if err != nil {
@@ -215,7 +222,11 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
toolCalls[i].Function.Arguments = string(args)
}
return toolCalls
}
func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
toolCalls := toToolCalls(r.Message.ToolCalls)
return ChatCompletion{
Id: id,
Object: "chat.completion",
@@ -244,6 +255,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
}
func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
toolCalls := toToolCalls(r.Message.ToolCalls)
return ChatCompletionChunk{
Id: id,
Object: "chat.completion.chunk",
@@ -252,7 +264,7 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
SystemFingerprint: "fp_ollama",
Choices: []ChunkChoice{{
Index: 0,
Delta: Message{Role: "assistant", Content: r.Message.Content},
Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls},
FinishReason: func(reason string) *string {
if len(reason) > 0 {
return &reason
@@ -475,9 +487,21 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
options["top_p"] = 1.0
}
var format string
if r.ResponseFormat != nil && r.ResponseFormat.Type == "json_object" {
format = "json"
var format json.RawMessage
if r.ResponseFormat != nil {
switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
// Support the old "json_object" type for OpenAI compatibility
case "json_object":
format = json.RawMessage(`"json"`)
case "json_schema":
if r.ResponseFormat.JsonSchema != nil {
schema, err := json.Marshal(r.ResponseFormat.JsonSchema.Schema)
if err != nil {
return nil, fmt.Errorf("failed to marshal json schema: %w", err)
}
format = schema
}
}
}
return &api.ChatRequest{
@@ -571,7 +595,7 @@ type EmbedWriter struct {
model string
}
func (w *BaseWriter) writeError(code int, data []byte) (int, error) {
func (w *BaseWriter) writeError(data []byte) (int, error) {
var serr api.StatusError
err := json.Unmarshal(data, &serr)
if err != nil {
@@ -630,7 +654,7 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
func (w *ChatWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
return w.writeError(data)
}
return w.writeResponse(data)
@@ -679,7 +703,7 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
func (w *CompleteWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
return w.writeError(data)
}
return w.writeResponse(data)
@@ -704,7 +728,7 @@ func (w *ListWriter) writeResponse(data []byte) (int, error) {
func (w *ListWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
return w.writeError(data)
}
return w.writeResponse(data)
@@ -730,7 +754,7 @@ func (w *RetrieveWriter) writeResponse(data []byte) (int, error) {
func (w *RetrieveWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
return w.writeError(data)
}
return w.writeResponse(data)
@@ -755,7 +779,7 @@ func (w *EmbedWriter) writeResponse(data []byte) (int, error) {
func (w *EmbedWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
return w.writeError(data)
}
return w.writeResponse(data)

View File

@@ -13,6 +13,7 @@ import (
"time"
"github.com/gin-gonic/gin"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
@@ -107,7 +108,7 @@ func TestChatMiddleware(t *testing.T) {
"presence_penalty": 5.0,
"top_p": 6.0,
},
Format: "json",
Format: json.RawMessage(`"json"`),
Stream: &True,
},
},
@@ -195,7 +196,86 @@ func TestChatMiddleware(t *testing.T) {
Stream: &False,
},
},
{
name: "chat handler with streaming tools",
body: `{
"model": "test-model",
"messages": [
{"role": "user", "content": "What's the weather like in Paris?"}
],
"stream": true,
"tools": [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"required": ["location"],
"properties": {
"location": {
"type": "string",
"description": "The city and state"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
}
}
}
}]
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's the weather like in Paris?",
},
},
Tools: []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get the current weather",
Parameters: struct {
Type string `json:"type"`
Required []string `json:"required"`
Properties map[string]struct {
Type string `json:"type"`
Description string `json:"description"`
Enum []string `json:"enum,omitempty"`
} `json:"properties"`
}{
Type: "object",
Required: []string{"location"},
Properties: map[string]struct {
Type string `json:"type"`
Description string `json:"description"`
Enum []string `json:"enum,omitempty"`
}{
"location": {
Type: "string",
Description: "The city and state",
},
"unit": {
Type: "string",
Enum: []string{"celsius", "fahrenheit"},
},
},
},
},
},
},
Options: map[string]any{
"temperature": 1.0,
"top_p": 1.0,
},
Stream: &True,
},
},
{
name: "chat handler error forwarding",
body: `{
@@ -237,13 +317,13 @@ func TestChatMiddleware(t *testing.T) {
if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
t.Fatal(err)
}
return
}
if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
t.Fatal("requests did not match")
if diff := cmp.Diff(&tc.req, capturedRequest); diff != "" {
t.Fatalf("requests did not match: %+v", diff)
}
if !reflect.DeepEqual(tc.err, errResp) {
t.Fatal("errors did not match")
if diff := cmp.Diff(tc.err, errResp); diff != "" {
t.Fatalf("errors did not match for %s:\n%s", tc.name, diff)
}
})
}

View File

@@ -1076,17 +1076,15 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
req.ContentLength = contentLength
}
resp, err := (&http.Client{
Transport: &http.Transport{
DialContext: testMakeRequestDialContext,
},
c := &http.Client{
CheckRedirect: regOpts.CheckRedirect,
}).Do(req)
if err != nil {
return nil, err
}
return resp, nil
if testMakeRequestDialContext != nil {
tr := http.DefaultTransport.(*http.Transport).Clone()
tr.DialContext = testMakeRequestDialContext
c.Transport = tr
}
return c.Do(req)
}
func getValue(header, key string) string {

View File

@@ -39,6 +39,7 @@ func TestExecuteWithTools(t *testing.T) {
{"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]
The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true},
{"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"To }]`, false},
{"mistral", `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function:
[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},

View File

@@ -148,10 +148,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
if req.Format != "" && req.Format != "json" {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""})
return
} else if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
return
}
@@ -251,6 +248,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
var b bytes.Buffer
if req.Context != nil {
slog.Warn("the context field is deprecated and will be removed in a future version of Ollama")
s, err := r.Detokenize(c.Request.Context(), req.Context)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -1141,7 +1139,7 @@ func (s *Server) GenerateRoutes() http.Handler {
config.AllowWildcard = true
config.AllowBrowserExtensions = true
config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"}
openAIProperties := []string{"lang", "package-version", "os", "arch", "runtime", "runtime-version", "async"}
openAIProperties := []string{"lang", "package-version", "os", "arch", "retry-count", "runtime", "runtime-version", "async"}
for _, prop := range openAIProperties {
config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop)
}
@@ -1458,6 +1456,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -1467,6 +1466,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
ch := make(chan any)
go func() {
defer close(ch)
var sb strings.Builder
var toolCallIndex int = 0
if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
Prompt: prompt,
Images: images,
@@ -1492,7 +1493,37 @@ func (s *Server) ChatHandler(c *gin.Context) {
res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
}
ch <- res
// TODO: tool call checking and filtering should be moved outside of this callback once streaming
// however this was a simple change for now without reworking streaming logic of this (and other)
// handlers
if req.Stream != nil && !*req.Stream || len(req.Tools) == 0 {
ch <- res
return
}
// Streaming tool calls:
// If tools are recognized, use a flag to track the sending of a tool downstream
// This ensures that content is cleared from the message on the last chunk sent
sb.WriteString(r.Content)
if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
res.Message.ToolCalls = toolCalls
for i := range toolCalls {
toolCalls[i].Function.Index = toolCallIndex
toolCallIndex++
}
res.Message.Content = ""
sb.Reset()
ch <- res
return
}
if r.Done {
// Send any remaining content if no tool calls were detected
if toolCallIndex == 0 {
res.Message.Content = sb.String()
}
ch <- res
}
}); err != nil {
ch <- gin.H{"error": err.Error()}
}

View File

@@ -8,6 +8,7 @@ import (
"io"
"net/http"
"strings"
"sync"
"testing"
"time"
@@ -25,10 +26,14 @@ type mockRunner struct {
// CompletionRequest is only valid until the next call to Completion
llm.CompletionRequest
llm.CompletionResponse
CompletionFn func(context.Context, llm.CompletionRequest, func(llm.CompletionResponse)) error
}
func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
func (m *mockRunner) Completion(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
m.CompletionRequest = r
if m.CompletionFn != nil {
return m.CompletionFn(ctx, r, fn)
}
fn(m.CompletionResponse)
return nil
}
@@ -88,9 +93,14 @@ func TestGenerateChat(t *testing.T) {
Model: "test",
Modelfile: fmt.Sprintf(`FROM %s
TEMPLATE """
{{- if .System }}System: {{ .System }} {{ end }}
{{- if .Prompt }}User: {{ .Prompt }} {{ end }}
{{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
{{- if .Tools }}
{{ .Tools }}
{{ end }}
{{- range .Messages }}
{{- .Role }}: {{ .Content }}
{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
{{- end }}
{{ end }}"""
`, createBinFile(t, llm.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
@@ -263,7 +273,7 @@ func TestGenerateChat(t *testing.T) {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! "); diff != "" {
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "user: Hello!\n"); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
@@ -292,7 +302,7 @@ func TestGenerateChat(t *testing.T) {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" {
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You are a helpful assistant.\nuser: Hello!\n"); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
@@ -314,7 +324,7 @@ func TestGenerateChat(t *testing.T) {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" {
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You can perform magic tricks.\nuser: Hello!\n"); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
@@ -337,12 +347,242 @@ func TestGenerateChat(t *testing.T) {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! Assistant: I can help you with that. System: You can perform magic tricks. User: Help me write tests. "); diff != "" {
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You are a helpful assistant.\nuser: Hello!\nassistant: I can help you with that.\nsystem: You can perform magic tricks.\nuser: Help me write tests.\n"); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkChatResponse(t, w.Body, "test-system", "Abra kadabra!")
})
t.Run("messages with tools (non-streaming)", func(t *testing.T) {
if w.Code != http.StatusOK {
t.Fatalf("failed to create test-system model: %d", w.Code)
}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get the current weather",
Parameters: struct {
Type string `json:"type"`
Required []string `json:"required"`
Properties map[string]struct {
Type string `json:"type"`
Description string `json:"description"`
Enum []string `json:"enum,omitempty"`
} `json:"properties"`
}{
Type: "object",
Required: []string{"location"},
Properties: map[string]struct {
Type string `json:"type"`
Description string `json:"description"`
Enum []string `json:"enum,omitempty"`
}{
"location": {
Type: "string",
Description: "The city and state",
},
"unit": {
Type: "string",
Enum: []string{"celsius", "fahrenheit"},
},
},
},
},
},
}
mock.CompletionResponse = llm.CompletionResponse{
Content: `{"name":"get_weather","arguments":{"location":"Seattle, WA","unit":"celsius"}}`,
Done: true,
DoneReason: "done",
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
}
streamRequest := true
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-system",
Messages: []api.Message{
{Role: "user", Content: "What's the weather in Seattle?"},
},
Tools: tools,
Stream: &streamRequest,
})
if w.Code != http.StatusOK {
var errResp struct {
Error string `json:"error"`
}
if err := json.NewDecoder(w.Body).Decode(&errResp); err != nil {
t.Logf("Failed to decode error response: %v", err)
} else {
t.Logf("Error response: %s", errResp.Error)
}
}
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
var resp api.ChatResponse
if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
t.Fatal(err)
}
if resp.Message.ToolCalls == nil {
t.Error("expected tool calls, got nil")
}
expectedToolCall := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "Seattle, WA",
"unit": "celsius",
},
},
}
if diff := cmp.Diff(resp.Message.ToolCalls[0], expectedToolCall); diff != "" {
t.Errorf("tool call mismatch (-got +want):\n%s", diff)
}
})
t.Run("messages with tools (streaming)", func(t *testing.T) {
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get the current weather",
Parameters: struct {
Type string `json:"type"`
Required []string `json:"required"`
Properties map[string]struct {
Type string `json:"type"`
Description string `json:"description"`
Enum []string `json:"enum,omitempty"`
} `json:"properties"`
}{
Type: "object",
Required: []string{"location"},
Properties: map[string]struct {
Type string `json:"type"`
Description string `json:"description"`
Enum []string `json:"enum,omitempty"`
}{
"location": {
Type: "string",
Description: "The city and state",
},
"unit": {
Type: "string",
Enum: []string{"celsius", "fahrenheit"},
},
},
},
},
},
}
// Simulate streaming response with multiple chunks
var wg sync.WaitGroup
wg.Add(1)
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
defer wg.Done()
// Send chunks with small delays to simulate streaming
responses := []llm.CompletionResponse{
{
Content: `{"name":"get_`,
Done: false,
PromptEvalCount: 1,
PromptEvalDuration: 1,
},
{
Content: `weather","arguments":{"location":"Seattle`,
Done: false,
PromptEvalCount: 2,
PromptEvalDuration: 1,
},
{
Content: `, WA","unit":"celsius"}}`,
Done: true,
DoneReason: "tool_call",
PromptEvalCount: 3,
PromptEvalDuration: 1,
},
}
for _, resp := range responses {
select {
case <-ctx.Done():
return ctx.Err()
default:
fn(resp)
time.Sleep(10 * time.Millisecond) // Small delay between chunks
}
}
return nil
}
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-system",
Messages: []api.Message{
{Role: "user", Content: "What's the weather in Seattle?"},
},
Tools: tools,
Stream: &stream,
})
wg.Wait()
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
// Read and validate the streamed responses
decoder := json.NewDecoder(w.Body)
var finalToolCall api.ToolCall
for {
var resp api.ChatResponse
if err := decoder.Decode(&resp); err == io.EOF {
break
} else if err != nil {
t.Fatal(err)
}
if resp.Done {
if len(resp.Message.ToolCalls) != 1 {
t.Errorf("expected 1 tool call in final response, got %d", len(resp.Message.ToolCalls))
}
finalToolCall = resp.Message.ToolCalls[0]
}
}
expectedToolCall := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "Seattle, WA",
"unit": "celsius",
},
},
}
if diff := cmp.Diff(finalToolCall, expectedToolCall); diff != "" {
t.Errorf("final tool call mismatch (-got +want):\n%s", diff)
}
})
}
func TestGenerate(t *testing.T) {