Compare commits: v0.6.7 ... parth/pyth
2 commits:
- 23e8ac9428
- 611d3a17ed
README.md (35 lines changed)
@@ -285,7 +285,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [TagSpaces](https://www.tagspaces.org) (A platform for file based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
@@ -325,14 +325,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support and multiple large language models.)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
@@ -341,16 +341,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for linux and macos made with GTK4 and Adwaita)
- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
@@ -368,7 +368,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
@@ -386,7 +386,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
- [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models)
@@ -399,7 +399,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)

### Cloud

@@ -441,7 +440,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
- [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis.
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal.
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)

### Apple Vision Pro

@@ -516,7 +515,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
@@ -525,11 +524,11 @@ See the [API documentation](./docs/api.md) for all endpoints.

### Mobile

- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad)
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS and iPad)
- [Enchanted](https://github.com/AugustDev/enchanted)
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)

@@ -553,7 +552,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -563,8 +562,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)

@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],
envVars["OLLAMA_CONTEXT_LENGTH"],
})
default:
appendEnvDocs(cmd, envs)

@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).

## How can I specify the context window size?

By default, Ollama uses a context window size of 4096 tokens.
By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.

This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
To change this when using `ollama run`, use `/set parameter`:

```shell
/set parameter num_ctx 4096
/set parameter num_ctx 8192
```

When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
"num_ctx": 8192
}
}'
```

@@ -169,7 +169,7 @@ var (
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
)

func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
}
}

func Int64(key string, defaultValue int64) func() int64 {
return func() int64 {
if s := Var(key); s != "" {
if n, err := strconv.ParseInt(s, 10, 64); err != nil {
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
} else {
return n
}
}

return defaultValue
}
}

// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)

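A brief aside on the sentinel: with the default changed from 4096 to -1, an unset `OLLAMA_CONTEXT_LENGTH` can be told apart from an explicit value, and the scheduler (see the sched.go hunks further down) resolves -1 to 4096, or to 2048 on a single small GPU, at load time. A minimal, hypothetical sketch of how the accessor behaves on this branch, assuming the `github.com/ollama/ollama/envconfig` import path used elsewhere in the repository:

```go
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/envconfig"
)

func main() {
	// Unset: Int64 falls through to the sentinel default (-1); the scheduler
	// later resolves that to 4096, or 2048 on a single small GPU.
	os.Unsetenv("OLLAMA_CONTEXT_LENGTH")
	fmt.Println(envconfig.ContextLength()) // -1

	// Set to a parseable integer: that value is returned as-is.
	os.Setenv("OLLAMA_CONTEXT_LENGTH", "8192")
	fmt.Println(envconfig.ContextLength()) // 8192

	// Unparseable values log a warning and fall back to the default.
	os.Setenv("OLLAMA_CONTEXT_LENGTH", "two thousand")
	fmt.Println(envconfig.ContextLength()) // -1
}
```
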
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

// Informational

@@ -278,9 +278,9 @@ func TestVar(t *testing.T) {
}

func TestContextLength(t *testing.T) {
cases := map[string]uint{
"": 4096,
"2048": 2048,
cases := map[string]int64{
"": -1,
"4096": 4096,
}

for k, v := range cases {

@@ -531,12 +531,11 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {

var s uint64
for _, t := range ts {
t.Offset = s
t.Offset = s + uint64(ggufPadding(int64(s), int64(alignment)))
if err := ggufWriteTensorInfo(ws, t); err != nil {
return err
}
s += t.Size()
s += uint64(ggufPadding(int64(s), int64(alignment)))
}

for _, t := range ts {

@@ -1,63 +0,0 @@
package ggml

import (
"bytes"
"os"
"slices"
"testing"

"github.com/google/go-cmp/cmp"
)

func TestWriteGGUF(t *testing.T) {
w, err := os.CreateTemp(t.TempDir(), "*.bin")
if err != nil {
t.Fatal(err)
}
defer w.Close()

if err := WriteGGUF(w, KV{
"general.alignment": uint32(16),
}, []Tensor{
{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
}); err != nil {
t.Fatal(err)
}

r, err := os.Open(w.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()

ff, _, err := Decode(r, 0)
if err != nil {
t.Fatal(err)
}

if diff := cmp.Diff(ff.KV(), KV{
"general.alignment": uint32(16),
"general.parameter_count": uint64(36),
}); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}

if diff := cmp.Diff(ff.Tensors(), Tensors{
Offset: 336,
items: []*Tensor{
{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
},
}, cmp.AllowUnexported(Tensors{})); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
}

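For reference, the offsets asserted in the TestWriteGGUF quoted above follow directly from the alignment arithmetic: each test tensor holds 2×3 float32 values (24 bytes), and with `general.alignment` set to 16 every subsequent tensor starts at the next multiple of 16. A small illustrative sketch of that arithmetic (not part of the diff):

```go
package main

import "fmt"

// alignUp rounds n up to the next multiple of align.
func alignUp(n, align uint64) uint64 {
	return (n + align - 1) / align * align
}

func main() {
	const size, align = 2 * 3 * 4, 16 // 24-byte tensors, 16-byte alignment

	var offset uint64
	for i := 0; i < 6; i++ {
		fmt.Printf("test.%d at offset %d\n", i, offset) // 0, 32, 64, 96, 128, 160
		offset = alignUp(offset+size, align)
	}
}
```
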
go.mod (12 lines changed)
@@ -11,7 +11,7 @@ require (
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.12.0
golang.org/x/sync v0.11.0
)

require (
@@ -70,12 +70,12 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.36.0
golang.org/x/crypto v0.33.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
golang.org/x/net v0.38.0 // indirect
golang.org/x/sys v0.31.0
golang.org/x/term v0.30.0
golang.org/x/text v0.23.0
golang.org/x/net v0.35.0 // indirect
golang.org/x/sys v0.30.0
golang.org/x/term v0.29.0
golang.org/x/text v0.22.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)

go.sum (24 lines changed)
@@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

@@ -34,15 +34,13 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()

req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
}

res, err := embeddingTestHelper(ctx, client, t, req)
res, err := embeddingTestHelper(ctx, t, req)

if err != nil {
t.Fatalf("error: %v", err)
@@ -64,15 +62,13 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()

req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}

res, err := embedTestHelper(ctx, client, t, req)
res, err := embedTestHelper(ctx, t, req)

if err != nil {
t.Fatalf("error: %v", err)
@@ -102,15 +98,13 @@ func TestAllMiniLMEmbed(t *testing.T) {
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()

req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}

res, err := embedTestHelper(ctx, client, t, req)
res, err := embedTestHelper(ctx, t, req)

if err != nil {
t.Fatalf("error: %v", err)
@@ -150,8 +144,6 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()

truncTrue, truncFalse := true, false

@@ -190,7 +182,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
res := make(map[string]*api.EmbedResponse)

for _, req := range reqs {
response, err := embedTestHelper(ctx, client, t, req.Request)
response, err := embedTestHelper(ctx, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
@@ -206,7 +198,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}

// check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
@@ -218,7 +210,9 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
}

func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
@@ -232,7 +226,9 @@ func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T,
return response, nil
}

func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}

@@ -329,13 +329,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}

ggmlPaths := []string{discover.LibOllamaPath}
if len(compatible) > 0 {
c := compatible[0]
if libpath, ok := libs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append(libraryPaths, libpath)
ggmlPaths = append(ggmlPaths, libpath)
}
}

@@ -371,8 +369,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr

s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))

envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
@@ -410,8 +406,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "OLLAMA_") ||
strings.HasPrefix(ev, "CUDA_") ||
if strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||

@@ -57,20 +57,26 @@ var OnceLoad = sync.OnceFunc(func() {
exe = "."
}

var value string
// PATH, LD_LIBRARY_PATH, and DYLD_LIBRARY_PATH are often
// set by the parent process, however, use a default value
// if the environment variable is not set.
var name, value string
switch runtime.GOOS {
case "darwin":
// On macOS, DYLD_LIBRARY_PATH is often not set, so
// we use the directory of the executable as the default.
name = "DYLD_LIBRARY_PATH"
value = filepath.Dir(exe)
case "windows":
name = "PATH"
value = filepath.Join(filepath.Dir(exe), "lib", "ollama")
default:
name = "LD_LIBRARY_PATH"
value = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
}

// Avoid potentially loading incompatible GGML libraries
paths, ok := os.LookupEnv("OLLAMA_LIBRARY_PATH")
paths, ok := os.LookupEnv(name)
if !ok {
slog.Debug("OLLAMA_LIBRARY_PATH not set, falling back to default", "search", value)
paths = value
}

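To make the fallback above concrete: when the variable named for the current OS is unset, the default search path is derived from the executable's location. A purely illustrative sketch, assuming a hypothetical install at /usr/local/bin/ollama (the paths below are examples, not values from the diff):

```go
package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// Assume, for illustration only, that the server binary lives here:
	exeDir := filepath.Dir("/usr/local/bin/ollama") // "/usr/local/bin"

	// darwin: DYLD_LIBRARY_PATH defaults to the executable's directory.
	fmt.Println(exeDir) // /usr/local/bin

	// windows: PATH defaults to <exe dir>/lib/ollama.
	fmt.Println(filepath.Join(exeDir, "lib", "ollama")) // /usr/local/bin/lib/ollama

	// linux and others: LD_LIBRARY_PATH defaults to <exe dir>/../lib/ollama,
	// which filepath.Join resolves to the sibling lib directory.
	fmt.Println(filepath.Join(exeDir, "..", "lib", "ollama")) // /usr/local/lib/ollama
}
```
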
@@ -723,9 +723,7 @@ func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}

// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
// to the GPU
/*func (s *Server) reserveWorstCaseGraph() error {
func (s *Server) reserveWorstCaseGraph() error {
ctx := s.model.Backend().NewContext()
defer ctx.Close()

@@ -768,7 +766,7 @@ func (m *multiLPath) String() string {
}

return nil
}*/
}

func (s *Server) loadModel(
ctx context.Context,
@@ -805,10 +803,10 @@ func (s *Server) loadModel(
s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))

/*err = s.reserveWorstCaseGraph()
err = s.reserveWorstCaseGraph()
if err != nil {
panic(err)
}*/
}

s.status = llm.ServerStatusReady
s.ready.Done()

server/python_tools.go (new file, 226 lines)
@@ -0,0 +1,226 @@
package server

import (
"fmt"
"regexp"
"strconv"
"strings"

"github.com/ollama/ollama/api"
)

var (
pythonFuncRegex = regexp.MustCompile(`(\w+)\((.*?)\)`)
braces = map[rune]rune{
'[': ']',
'{': '}',
'(': ')',
'"': '"',
'\'': '\'',
}
)

// parsePythonValue converts a Python value string to its appropriate Go type
func parsePythonValue(value string) (any, error) {
value = strings.TrimSpace(value)

// string
if (strings.HasPrefix(value, "\"") && strings.HasSuffix(value, "\"")) ||
(strings.HasPrefix(value, "'") && strings.HasSuffix(value, "'")) {
// Remove quotes
result := value[1 : len(value)-1]
return result, nil
}

// bool
switch strings.ToLower(value) {
case "true":
return true, nil
case "false":
return false, nil
case "none":
return nil, nil
}

// int
if i, err := strconv.Atoi(value); err == nil {
return i, nil
}

// float
if f, err := strconv.ParseFloat(value, 64); err == nil {
return f, nil
}

// list
if strings.HasPrefix(value, "[") && strings.HasSuffix(value, "]") {
listStr := value[1 : len(value)-1]
var list []any
stack := []rune{}
start := 0

for i, char := range listStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}

if len(stack) == 0 && (char == ',' || i == len(listStr)-1) {
end := i
if i == len(listStr)-1 {
end = i + 1
}
item := strings.TrimSpace(listStr[start:end])
if val, err := parsePythonValue(item); err == nil {
list = append(list, val)
} else {
return nil, fmt.Errorf("invalid list item: %s", item)
}
start = i + 1
}
}
return list, nil
}

// dictionary
if strings.HasPrefix(value, "{") && strings.HasSuffix(value, "}") && strings.Contains(value, ":") {
dictStr := value[1 : len(value)-1]
dict := make(map[any]any)
stack := []rune{}
start := 0
for i, char := range dictStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(dictStr)-1) {
end := i
if i == len(dictStr)-1 {
end = i + 1
}
item := strings.TrimSpace(dictStr[start:end])
kv := strings.SplitN(item, ":", 2)
if len(kv) != 2 {
return nil, fmt.Errorf("invalid dictionary key-value pair: %s", item)
}

key, err := parsePythonValue(strings.TrimSpace(kv[0]))
if err != nil {
return nil, fmt.Errorf("invalid dictionary key: %s", kv[0])
}

val, err := parsePythonValue(strings.TrimSpace(kv[1]))
if err != nil {
return nil, fmt.Errorf("invalid dictionary value: %s", kv[1])
}

dict[key] = val
start = i + 1
}
}
return dict, nil
}

// sets (stored as lists)
if strings.HasPrefix(value, "{") && strings.HasSuffix(value, "}") {
setStr := value[1 : len(value)-1]
var list []any
stack := []rune{}
start := 0
for i, char := range setStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(setStr)-1) {
end := i
if i == len(setStr)-1 {
end = i + 1
}
item := strings.TrimSpace(setStr[start:end])
if val, err := parsePythonValue(item); err == nil {
list = append(list, val)
} else {
return nil, fmt.Errorf("invalid set item: %s", item)
}
start = i + 1
}
}
return list, nil
}

return nil, fmt.Errorf("invalid Python value: %s", value)
}

// parsePythonToolCall parses Python function calls from a string
// it supports keyword arguments, as well as multiple functions in a single string
func parsePythonToolCall(s string) ([]api.ToolCall, error) {
matches := pythonFuncRegex.FindAllStringSubmatchIndex(s, -1)
if len(matches) == 0 {
return nil, fmt.Errorf("no Python function calls found")
}

var toolCalls []api.ToolCall
for _, match := range matches {
name := s[match[2]:match[3]]
args := s[match[4]:match[5]]
var arguments api.ToolCallFunctionArguments
if len(args) == 0 {
toolCalls = append(toolCalls, api.ToolCall{
Function: api.ToolCallFunction{
Name: name,
},
})
continue
}

start := 0
stack := []rune{}
for i, char := range args {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(args)-1) {
end := i
if i == len(args)-1 {
end = i + 1
}
kv := strings.SplitN(args[start:end], "=", 2)
if len(kv) == 2 {
key := strings.TrimSpace(kv[0])
valueStr := strings.TrimSpace(kv[1])

// Parse the value into appropriate type
value, err := parsePythonValue(valueStr)
if err != nil {
return nil, fmt.Errorf("failed to parse value for key %q: %v", key, err)
}

arguments[key] = value
} else {
return nil, fmt.Errorf("invalid argument format: %q", args[start:end])
}
start = i + 1
}
}

if len(arguments) > 0 {
toolCalls = append(toolCalls, api.ToolCall{
Function: api.ToolCallFunction{
Name: name,
Arguments: arguments,
},
})
}
}

if len(toolCalls) > 0 {
return toolCalls, nil
}
return nil, fmt.Errorf("failed to parse any valid tool calls")
}

server/python_tools_test.go (new file, 269 lines)
@@ -0,0 +1,269 @@
package server

import (
"testing"

"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)

func TestParsePythonFunctionCall(t *testing.T) {
t1 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "San Francisco, CA",
"format": "fahrenheit",
},
},
}

t2 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_forecast",
Arguments: api.ToolCallFunctionArguments{
"days": 5,
"location": "Seattle",
},
},
}

t3 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"list": []any{1, 2, 3},
"int": -1,
"float": 1.23,
"string": "hello",
},
},
}
t4 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
},
}

cases := []struct {
name string
input string
want []api.ToolCall
err bool
}{
{
name: "malformed function call - missing closing paren",
input: "get_current_weather(location=\"San Francisco\"",
err: true,
},
{
name: "empty function call",
input: "get_current_weather()",
want: []api.ToolCall{t4},
err: false,
},
{
name: "single valid function call",
input: "get_current_weather(location=\"San Francisco, CA\", format=\"fahrenheit\")",
want: []api.ToolCall{t1},
},
{
name: "multiple valid function calls",
input: "get_current_weather(location=\"San Francisco, CA\", format=\"fahrenheit\") get_forecast(days=5, location=\"Seattle\")",
want: []api.ToolCall{t1, t2},
},
{
name: "multiple valid function calls with list",
input: "get_current_weather(list=[1,2,3], int=-1, float=1.23, string=\"hello\")",
want: []api.ToolCall{t3},
},
{
name: "positional arguments not supported",
input: "get_current_weather(1, 2, 3)",
err: true,
},
{
name: "invalid argument format without equals",
input: "get_current_weather(\"San Francisco\")",
err: true,
},
{
name: "nested lists",
input: "get_current_weather(data=[[1,2],[3,4]])",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"data": []any{[]any{1, 2}, []any{3, 4}},
},
},
}},
},
{
name: "boolean and none values",
input: "get_current_weather(active=true, enabled=false, value=None)",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"active": true,
"enabled": false,
"value": nil,
},
},
}},
},
{
name: "single vs double quotes",
input: "get_current_weather(str1='single', str2=\"double\")",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"str1": "single",
"str2": "double",
},
},
}},
},
{
name: "whitespace handling",
input: "get_current_weather( location = \"San Francisco\" , temp = 72 )",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "San Francisco",
"temp": 72,
},
},
}},
},
}

for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
got, err := parsePythonToolCall(tt.input)
if (err != nil) != tt.err {
t.Fatalf("expected error: %v, got error: %v", tt.err, err)
}
if tt.err {
return
}
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
}

func TestParsePythonValue(t *testing.T) {
cases := []struct {
name string
input string
want any
err bool
}{
{
name: "string with double quotes",
input: "\"hello\"",
want: "hello",
},
{
name: "string with single quotes",
input: "'world'",
want: "world",
},
{
name: "integer",
input: "42",
want: 42,
},
{
name: "float",
input: "3.14",
want: 3.14,
},
{
name: "boolean true",
input: "True",
want: true,
},
{
name: "boolean false",
input: "False",
want: false,
},
{
name: "none/null",
input: "None",
want: nil,
},
{
name: "simple list",
input: "[1, 2, 3]",
want: []any{1, 2, 3},
},
{
name: "nested list",
input: "[1, [2, 3], 4]",
want: []any{1, []any{2, 3}, 4},
},
{
name: "mixed type list",
input: "[1, \"two\", 3.0, true]",
want: []any{1, "two", 3.0, true},
},
{
name: "invalid list",
input: "[1, 2,",
want: nil,
err: true,
},
{
name: "dictionaries",
input: "{'a': 1, 'b': 2}",
want: map[any]any{"a": 1, "b": 2},
err: false,
},
{
name: "int dictionary",
input: "{1: 2}",
want: map[any]any{1: 2},
err: false,
},
{
name: "mixed type dictionary",
input: "{'a': 1, 'b': 2.0, 'c': True}",
want: map[any]any{"a": 1, "b": 2.0, "c": true},
err: false,
},
{
name: "invalid dictionary - missing closing brace",
input: "{'a': 1, 'b': 2",
want: nil,
err: true,
},
{
name: "sets",
input: "{1, 2, 3}",
want: []any{1, 2, 3},
err: false,
},
}

for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
got, err := parsePythonValue(tt.input)
if (err != nil) != tt.err {
t.Fatalf("expected error: %v, got error: %v", tt.err, err)
}
if tt.err {
return
}
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
}

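For orientation, a hypothetical sketch of how the new parser might be driven from code in the same package, assuming the parsePythonToolCall function and the api types shown in the diff above; it is not part of the change itself:

```go
package server

import "fmt"

// demoPythonToolCalls shows the intended flow: take the raw text a model
// emitted, parse any Python-style calls out of it, and hand each one on.
func demoPythonToolCalls(raw string) error {
	calls, err := parsePythonToolCall(raw)
	if err != nil {
		return err // e.g. no function-call pattern was found in the text
	}
	for _, call := range calls {
		fmt.Println(call.Function.Name, call.Function.Arguments)
	}
	return nil
}
```

Feeding it `get_current_weather(location="San Francisco, CA", format="fahrenheit")` should produce the single tool call asserted in the first valid test case above.
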
@@ -18,7 +18,6 @@ import (
"os"
"os/signal"
"path/filepath"
"regexp"
"slices"
"strings"
"syscall"
@@ -1513,7 +1512,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
if req.Messages[0].Role != "system" && m.System != "" {
msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
}
msgs = filterThinkTags(msgs, m)

prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
if err != nil {
@@ -1642,23 +1640,3 @@ func handleScheduleError(c *gin.Context, name string, err error) {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
}

var thinkTagRegexp = regexp.MustCompile(`<think>(?s).*?</think>(\n)*`)

func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" {
finalUserIndex := -1
for i, msg := range msgs {
if msg.Role == "user" {
finalUserIndex = i
}
}

for i, msg := range msgs {
if msg.Role == "assistant" && i < finalUserIndex {
msgs[i].Content = thinkTagRegexp.ReplaceAllString(msg.Content, "")
}
}
}
return msgs
}

@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})

if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})

if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Help me write tests."},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})

if w.Code != http.StatusOK {

@@ -15,7 +15,6 @@ import (
"net/http/httptest"
"os"
"path/filepath"
"reflect"
"sort"
"strings"
"testing"
@@ -747,128 +746,3 @@ func TestNormalize(t *testing.T) {
})
}
}

func TestFilterThinkTags(t *testing.T) {
type testCase struct {
msgs []api.Message
want []api.Message
model *Model
}
testCases := []testCase{
{
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... about the answer</think>abc"},
{Role: "user", Content: "What is the answer?"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "abc"},
{Role: "user", Content: "What is the answer?"},
},
model: &Model{
Config: ConfigV2{
ModelFamily: "qwen3",
},
},
},
// with newlines inside the think tag aned newlines after
{
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... \n\nabout \nthe answer</think>\n\nabc\ndef"},
{Role: "user", Content: "What is the answer?"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "abc\ndef"},
{Role: "user", Content: "What is the answer?"},
},
model: &Model{
Config: ConfigV2{
ModelFamily: "qwen3",
},
},
},
// should leave thinking tags if it's after the last user message
{
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking...</think>after"},
{Role: "user", Content: "What is the answer?"},
{Role: "assistant", Content: "<think>thinking again</think>hjk"},
{Role: "assistant", Content: "<think>thinking yet again</think>hjk"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "after"},
{Role: "user", Content: "What is the answer?"},
{Role: "assistant", Content: "<think>thinking again</think>hjk"},
{Role: "assistant", Content: "<think>thinking yet again</think>hjk"},
},
model: &Model{
Config: ConfigV2{
ModelFamily: "qwen3",
},
},
},
{
// shouldn't strip anything because the model family isn't one of the hardcoded ones
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... about the answer</think>abc"},
{Role: "user", Content: "What is the answer?"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... about the answer</think>abc"},
{Role: "user", Content: "What is the answer?"},
},
model: &Model{
Config: ConfigV2{
ModelFamily: "llama3",
},
},
},
{
// deepseek-r1:-prefixed model
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... about the answer</think>abc"},
{Role: "user", Content: "What is the answer?"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "abc"},
{Role: "user", Content: "What is the answer?"},
},
model: &Model{
Name: "registry.ollama.ai/library/deepseek-r1:latest",
ShortName: "deepseek-r1:7b",
Config: ConfigV2{},
},
},
}

for i, tc := range testCases {
filtered := filterThinkTags(tc.msgs, tc.model)

if !reflect.DeepEqual(filtered, tc.want) {
t.Errorf("messages differ for case %d:", i)
for i := range tc.want {
if i >= len(filtered) {
t.Errorf(" missing message %d: %+v", i, tc.want[i])
continue
}
if !reflect.DeepEqual(filtered[i], tc.want[i]) {
t.Errorf(" message %d:\n want: %+v\n got: %+v", i, tc.want[i], filtered[i])
}
}
if len(filtered) > len(tc.want) {
for i := len(tc.want); i < len(filtered); i++ {
t.Errorf(" extra message %d: %+v", i, filtered[i])
}
}
}
}
}

@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {

// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 {
opts.NumCtx = 4
}

req := &LlmRequest{
ctx: c,
model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
}()
}

const (
defaultContextLength = 4096
smallGpuContextLength = 2048
)

func (s *Scheduler) processPending(ctx context.Context) {
for {
select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
gpus = s.getGpuFn()
}

if pending.origNumCtx == -1 {
if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
pending.opts.NumCtx = smallGpuContextLength
pending.origNumCtx = smallGpuContextLength
} else {
pending.opts.NumCtx = defaultContextLength
pending.origNumCtx = defaultContextLength
}
}

if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs

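Put differently: when a request arrives with the sentinel NumCtx of -1, this is where it gets resolved; a single non-CPU GPU with at most 4096*1024*1024 bytes (4 GiB) of total VRAM gets the 2048-token default, everything else gets 4096. A hypothetical, standalone condensation of that branch (the helper and the gpu struct below are stand-ins for illustration, not names from the diff):

```go
package main

import "fmt"

const (
	defaultContextLength  = 4096
	smallGpuContextLength = 2048
)

// gpu mirrors just the two fields the scheduler checks.
type gpu struct {
	Library     string
	TotalMemory uint64
}

// resolveDefaultNumCtx applies the same rule as the hunk above.
func resolveDefaultNumCtx(gpus []gpu) int {
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4<<30 { // 4 GiB
		return smallGpuContextLength
	}
	return defaultContextLength
}

func main() {
	fmt.Println(resolveDefaultNumCtx([]gpu{{Library: "cuda", TotalMemory: 4 << 30}}))  // 2048
	fmt.Println(resolveDefaultNumCtx([]gpu{{Library: "cuda", TotalMemory: 24 << 30}})) // 4096
}
```
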
@@ -441,9 +453,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
estimatedVRAM: llama.EstimatedVRAM(),
estimatedTotal: llama.EstimatedTotal(),
loading: true,
refCount: 1,
}
runner.numParallel = numParallel
runner.refMu.Lock() // hold lock until running or aborted
runner.refMu.Lock()

s.loadedMu.Lock()
s.loaded[req.model.ModelPath] = runner
@@ -454,13 +467,13 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
defer runner.refMu.Unlock()
if err = llama.WaitUntilRunning(req.ctx); err != nil {
slog.Error("error loading llama server", "error", err)
runner.refCount--
req.errCh <- err
slog.Debug("triggering expiration for failed load", "model", runner.modelPath)
s.expiredCh <- runner
return
}
slog.Debug("finished setting up runner", "model", req.model.ModelPath)
runner.refCount++
runner.loading = false
go func() {
<-req.ctx.Done()
@@ -478,12 +491,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
}
predMap := map[predKey]uint64{} // Sum up the total predicted usage per GPU for all runners
s.loadedMu.Lock()
runners := make([]*runnerRef, 0, len(s.loaded))
for _, r := range s.loaded {
runners = append(runners, r)
}
s.loadedMu.Unlock()
for _, r := range runners {
r.refMu.Lock()
if r.llama != nil {
for _, gpu := range allGpus {
@@ -494,6 +502,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
}
r.refMu.Unlock()
}
s.loadedMu.Unlock()

// Now that we've summed up all the GPU usage predictions across all the loaded runners, update the gpu list
for i := range allGpus {
@@ -540,8 +549,10 @@ func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList)

// TODO consolidate sched_types.go
type runnerRef struct {
refMu sync.Mutex
refMu sync.Mutex
// refCond sync.Cond // Signaled on transition from 1 -> 0 refCount
refCount uint // prevent unloading if > 0
// unloading bool // set to true when we are trying to unload the runner

llama llm.LlamaServer
loading bool // True only during initial load, then false forever
@@ -812,8 +823,8 @@ func (s *Scheduler) unloadAllRunners() {

func (s *Scheduler) expireRunner(model *Model) {
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
runner, ok := s.loaded[model.ModelPath]
s.loadedMu.Unlock()
if ok {
runner.refMu.Lock()
runner.expiresAt = time.Now()

@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
b.req.opts.NumCtx = 4096
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return b
}