Compare commits

...

106 Commits

Author SHA1 Message Date
Jeffrey Morgan
1d78d96fc6 remove .First 2023-11-15 18:07:13 -05:00
Michael Yang
686f85d6ca Merge pull request #1132 from jmorganca/mxyng/human-bytes
replace go-humanize with format.HumanBytes
2023-11-15 09:46:21 -08:00
bnodnarb
85951d25ef Created tutorial for running Ollama on NVIDIA Jetson devices (#1098) 2023-11-15 12:32:37 -05:00
Michael Yang
01ea6002c4 replace go-humanize with format.HumanBytes 2023-11-14 14:57:41 -08:00
Jeffrey Morgan
423862042a treat ollama run model < file as entire prompt, not prompt-per-line (#1126)
Previously, `ollama run` treated a non-terminal stdin (such as `ollama run model < file`) as containing one prompt per line. To run inference on a multi-line prompt, the only non-API workaround was to run `ollama run` interactively and wrap the prompt in `"""..."""`.

Now, `ollama run` treats a non-terminal stdin as containing a single prompt. For example, if `myprompt.txt` is a multi-line file, then `ollama run model < myprompt.txt` would treat `myprompt.txt`'s entire contents as the prompt.

Co-authored-by: Quinn Slack <quinn@slack.org>
2023-11-14 16:42:21 -05:00
Bruce MacDonald
df18486c35 Move /generate format to optional parameters (#1127)
This field is optional and should be under the `Advanced parameters` header
2023-11-14 16:12:30 -05:00
Jeffrey Morgan
4e612a2e92 use stdout fd for terminal size (#1125) 2023-11-14 16:09:09 -05:00
Jeffrey Morgan
6e0f686afa --format json should work in interactive mode 2023-11-14 10:22:03 -05:00
Jeffrey Morgan
c1844bbee2 add json mode to cli (#1095) 2023-11-13 21:54:02 -05:00
Huy Le
cb745965ce adding ollama.nvim for visibility (#1115) 2023-11-13 17:00:17 -05:00
Enrico Ros
8d29b6a2b6 New big-AGI integration (#1078)
* New big-AGI integration

Ollama works great in big-AGI, and this document explains how to link the two projects.

* Update README.md
2023-11-13 16:59:00 -05:00
Ilya Breitburg
724aa64bee Add Dart library to README.md (#1106) 2023-11-13 14:50:42 -05:00
Michael Yang
d91c103e74 Merge pull request #1055 from dansreis/946-fix-incorrect-base-model-name
Fixed incorrect base model name
2023-11-13 08:42:55 -08:00
Kevin Hermawan
98ec7d81e3 Add OllamaKit to the community integrations (#1085) 2023-11-11 14:41:42 -08:00
Daniel Reis
7c438f2c53 Replaced method 2023-11-10 20:22:03 +00:00
Daniel Reis
6e46338d44 Reverting previous changes 2023-11-10 20:21:35 +00:00
Jeffrey Morgan
cdddd3df65 add format to example python client 2023-11-10 10:22:21 -08:00
Daniel Hiltgen
afa61bdf45 Merge pull request #1075 from jmorganca/dhiltgen/unexpected-eof
Resume chunk download on UnexpectedEOF errors
2023-11-10 08:48:27 -08:00
Daniel Hiltgen
cc54a416c6 Resume chunk download on UnexpectedEOF errors
If the chunk download is interrupted, resume from where we left off
2023-11-10 08:29:42 -08:00
Matt Williams
c819d7f68a Merge pull request #955 from jmorganca/mattw/example-bash-compare
docs: add examples using bash to compare models
2023-11-10 08:59:32 -06:00
Jeffrey Morgan
5cba29b9d6 JSON mode: add `"format" as an api parameter (#1051)
* add `"format": "json"` as an API parameter
---------
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2023-11-09 16:44:02 -08:00
Daniel Reis
d17730356a Removed inline parse model path 2023-11-09 22:44:26 +00:00
Daniel Reis
32d79a6eea Using 'GetShortTagname' method instead 2023-11-09 22:40:37 +00:00
Bruce MacDonald
5b39503bcd document specifying multiple stop params (#1061) 2023-11-09 13:16:26 -08:00
Bruce MacDonald
1ae84bc2a2 skip gpu if less than 2GB VRAM are available (#1059) 2023-11-09 13:16:16 -08:00
Bruce MacDonald
db8bf336fc Update README.md 2023-11-09 12:53:24 -08:00
Nick Anderson
d77e094a90 Added gptel to list of integrations (#1062) 2023-11-09 12:52:36 -08:00
Matt Williams
dd3dc47ddb Merge pull request #992 from aashish2057/aashish2057/langchainjs_doc_update 2023-11-09 05:08:31 -08:00
Michael Yang
c5e1bbabda instead of static number of parameters for each model family, get the real number from the tensors (#1022)
* parse tensor info

* refactor decoder

* return actual parameter count

* explicit rounding

* s/Human/HumanNumber/
2023-11-08 17:55:46 -08:00
Bruce MacDonald
a49d6acc1e add a complete /generate options example (#1035) 2023-11-08 16:44:36 -08:00
Moritz Poldrack
6e9bcdb9b3 progressbar: make start and end seamless (#1042) 2023-11-08 16:42:40 -08:00
Matt Williams
13086363bd Update as per bmacd
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-08 18:09:05 -06:00
Bruce MacDonald
ec2a31e9b3 support raw generation requests (#952)
- add the optional `raw` generate request parameter to bypass prompt formatting and response context
-add raw request to docs
2023-11-08 14:05:02 -08:00
Amith Koujalgi
ec84c02d54 Add Ollama4j Java library to the list of community libraries (#1044) 2023-11-08 11:04:32 -08:00
Kevin Hermawan
2a88b66bc9 Add Ollamac to community integrations (#1043) 2023-11-08 11:01:09 -08:00
Jeffrey Morgan
2d0faea96c clean up README.md 2023-11-08 00:03:29 -08:00
Jeffrey Morgan
637142181a clean up README.md 2023-11-07 23:52:31 -08:00
Matt Williams
bcbff421c9 Merge pull request #1023 from jmorganca/mattw/wherearemodelsfaq 2023-11-07 17:59:54 -08:00
thealhu
1359d6cf3b Fix sudo variable in install.sh (#1034)
It was forgotten to replace sudo at one place with the variable for sudo.
2023-11-07 09:59:57 -08:00
Omar Magdy
6e2d0224d9 Added logseq ollama plugin (#1029) 2023-11-07 09:58:13 -08:00
Ikko Eltociear Ashimine
921406f721 Update client.py (#1026)
recieve -> receive
2023-11-07 09:55:47 -08:00
Michael Yang
c7047d7353 Merge pull request #959 from jmorganca/mxyng/example-k8s 2023-11-07 10:43:21 -06:00
Matt Williams
1d155caba3 docs: clarify where the models are stored in the faq
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-06 14:38:49 -08:00
Michael Yang
866324b9a5 Merge pull request #943 from tjbck/patch-1
doc: categorised community integrations + added ollama-webui
2023-11-06 11:35:39 -08:00
Michael Yang
145e060855 Apply suggestions from code review
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2023-11-06 11:32:23 -08:00
Michael Yang
146072113d Merge pull request #993 from jmorganca/mxyng/cleanup
cleanup upload and download errors
2023-11-06 11:32:12 -08:00
Timothy Jaeryang Baek
33d31d1b56 Merge branch 'main' into patch-1 2023-11-06 14:27:02 -05:00
Dr. David A. Kunz
274c6cbf4c Added gen.nvim to community integrations (#996) 2023-11-06 10:51:41 -08:00
Elton Renda
7ebbd89bbf add hass-ollama-conversation (#999) 2023-11-06 10:50:35 -08:00
Lars Grammel
9079b1bb6d Add ModelFusion community integration (#1020) 2023-11-06 10:46:16 -08:00
Timothy Jaeryang Baek
6febde7200 Merge branch 'main' into patch-1 2023-11-04 19:12:18 -05:00
pepperoni21
325cfcd9ff Added ollama-rs to community integrations (#995)
Co-authored-by: pepperoni21 <pepperoni2100@gmail.com>
2023-11-04 14:51:29 -07:00
Jeffrey Morgan
639d0fd070 Update README.md 2023-11-04 12:24:24 -07:00
Jeffrey Morgan
e21579a0f1 Restore system prompt on requests 2023-11-03 17:26:45 -07:00
Jeffrey Morgan
c44b619428 remove unused fmt.Println 2023-11-03 17:24:58 -07:00
Michael Yang
434a6f9d46 return last error 2023-11-03 16:49:51 -07:00
aashish2057
b13586cc72 update langchainjs doc 2023-11-03 18:45:19 -05:00
Jeffrey Morgan
17678b7225 Restore system prompt on requests and default num_keep to 0 2023-11-03 13:25:25 -07:00
Michael Yang
84725ec7e3 refactor part reset 2023-11-03 09:20:32 -07:00
Bruce MacDonald
6109bebba6 reformat api docs for more examples (#972) 2023-11-03 10:57:00 -04:00
Noah Gitsham
8ae8c9fa8c Remove duplicate "install" in GPU support warning (#984) 2023-11-03 00:45:14 -07:00
Noah Gitsham
f39daff461 Add missing "be" to GPU support warning message (#983) 2023-11-02 18:37:12 -07:00
Jeffrey Morgan
c50b01bc21 check request.Context for initial system prompt 2023-11-02 18:17:00 -07:00
Bruce MacDonald
b9dc875401 remove modelfile context deprecated in v0.0.7 (#974) 2023-11-02 20:52:56 -04:00
Jeffrey Morgan
06589a3b30 Set NumKeep to 4 by default (#982) 2023-11-02 17:26:11 -07:00
Michael Yang
1fd511e661 Merge pull request #975 from jmorganca/mxyng/downloads
update downloads to use retry wrapper
2023-11-02 16:12:48 -07:00
Michael Yang
c01bbe94fd Merge pull request #979 from jmorganca/mxyng/num-keep
update default NumKeep
2023-11-02 15:48:44 -07:00
Jeffrey Morgan
1beb5645a9 only use system prompt if context is not provided (#978) 2023-11-02 15:48:02 -07:00
Michael Yang
6db3691b8f update default NumKeep 2023-11-02 15:47:35 -07:00
Michael Yang
fe5a872444 fix upload 2023-11-02 13:25:58 -07:00
Michael Yang
d39709260f download with retry 2023-11-02 13:16:11 -07:00
Michael Yang
60bb3c03a1 use http.Method 2023-11-02 13:12:45 -07:00
Jeffrey Morgan
2e53704685 default rope params to 0 for new models (#968) 2023-11-02 08:41:30 -07:00
Michael Yang
527f9a7975 Merge pull request #966 from jmorganca/mxyng/fix-log 2023-11-01 17:49:10 -07:00
Michael Yang
c4cc738cbf fix log 2023-11-01 17:18:11 -07:00
Michael Yang
2c6189f4fe Merge pull request #750 from jmorganca/mxyng/concurrent-uploads
concurrent uploads
2023-11-01 15:00:01 -07:00
Michael Yang
dccac8c8fa k8s example 2023-11-01 14:52:58 -07:00
Michael Yang
c05ab9a86e Merge pull request #965 from jmorganca/mxyng/go-mod-tidy
go mod tidy
2023-11-01 11:55:43 -07:00
Michael Yang
f42f3d9b27 go fmt 2023-11-01 11:55:08 -07:00
Michael Yang
341fb7e35f go mod tidy 2023-11-01 11:54:25 -07:00
Michael
f31961637f Update README.md 2023-11-01 12:20:55 -04:00
Michael Yang
ec3614812a Merge pull request #960 from jmorganca/mxyng/fix-tautology 2023-11-01 08:30:49 -07:00
Michael Yang
f14969314a Merge pull request #958 from jmorganca/mxyng/append-ld-library-path 2023-11-01 08:30:38 -07:00
Bruce MacDonald
1fb9288661 notify that the ollama api is available after linux install (#954) 2023-11-01 11:28:26 -04:00
Matt Williams
01a03caa20 Merge pull request #956 from jmorganca/mattw/apidocupdate 2023-10-31 21:43:11 -07:00
Michael Yang
bf6786bb39 fix tautology 2023-10-31 20:49:48 -07:00
Michael Yang
642128b75a append LD_LIBRARY_PATH 2023-10-31 15:54:49 -07:00
Matt Williams
f21bd6210d docs: clarify and clean up API docs
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-10-31 13:11:33 -07:00
Matt Williams
80362fedce better readme
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-10-31 12:40:46 -07:00
Matt Williams
5757925060 add a gif
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-10-31 11:52:01 -07:00
Michael
4512301756 Update README.md 2023-10-31 13:25:36 -04:00
Matt Williams
2236a93efc docs: add examples using bash to compare models
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-10-31 09:12:39 -07:00
Matt Williams
ad88799411 Merge pull request #949 from jmorganca/matt/fixPrivateGPT
fix: private gpt example was broken due to changes in chroma
2023-10-30 17:17:00 -07:00
Bruce MacDonald
0818b5e318 readline windows terminal support (#950)
- update the readline package to have basic support on windows, this is not full feature parity with the unix cli yet
2023-10-30 16:18:12 -04:00
Matt Williams
1df6100c77 Update examples/langchain-python-rag-privategpt/privateGPT.py
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2023-10-30 12:48:17 -07:00
Matt Williams
5c48fe1fb0 Update examples/langchain-python-rag-privategpt/constants.py
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2023-10-30 12:47:56 -07:00
Dirk Loss
874bb31986 Fix conversion command for gptneox (#948) 2023-10-30 14:34:29 -04:00
Matt Williams
f7856a57eb fix: private gpt example was broken due to changes in chroma
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-10-30 10:56:25 -07:00
Bruce MacDonald
f9a4281124 clean up: remove server functions from client (#937) 2023-10-30 11:10:18 -04:00
Timothy Jaeryang Baek
96da0792e6 doc: OllamaSharp for .NET moved to libraries 2023-10-28 16:18:38 -05:00
Timothy Jaeryang Baek
95d24262fc doc: categorised community integrations + added web-ui 2023-10-28 16:02:13 -05:00
Jeffrey Morgan
8d03bd7b54 remove +build directive in term.go 2023-10-28 09:56:03 -07:00
Michael Yang
115fc56eb7 calculate and verify md5 checksum 2023-10-27 17:07:33 -07:00
Michael Yang
186f685224 retry PUT 2023-10-27 17:07:33 -07:00
Michael Yang
12efcbb057 comments 2023-10-27 17:07:33 -07:00
Michael Yang
4e09aab8b9 concurrent uploads 2023-10-27 17:07:33 -07:00
45 changed files with 1301 additions and 2471 deletions

View File

@@ -29,8 +29,7 @@ curl https://ollama.ai/install.sh | sh
### Docker
The official [Ollama Docker image `ollama/ollama`](https://hub.docker.com/r/ollama/ollama)
is available on Docker Hub.
The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub.
## Quickstart
@@ -160,7 +159,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
### Pass in prompt as arguments
```
$ ollama run llama2 "summarize this file:" "$(cat README.md)"
$ ollama run llama2 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
@@ -217,21 +216,44 @@ See the [API documentation](./docs/api.md) for all endpoints.
## Community Integrations
### Web & Desktop
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Web UI](https://github.com/ollama-webui/ollama-webui)
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-agi/blob/main/docs/config-ollama.md)
### Terminal
- [oterm](https://github.com/ggozad/oterm)
- [Ellama Emacs client](https://github.com/s-kostyaev/ellama)
- [Emacs client](https://github.com/zweifisch/ollama)
- [gen.nvim](https://github.com/David-Kunz/gen.nvim)
- [ollama.nvim](https://github.com/nomnivore/ollama.nvim)
- [gptel Emacs client](https://github.com/karthink/gptel)
### Libraries
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
- [Ollama for Dart](https://github.com/breitburg/dart-ollama)
### Extensions & Plugins
- [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama)
- [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel)
- [Continue](https://github.com/continuedev/continue)
- [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
- [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
- [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Dumbar](https://github.com/JerrySievert/Dumbar)
- [Emacs client](https://github.com/zweifisch/ollama)
- [oterm](https://github.com/ggozad/oterm)
- [Ellama Emacs client](https://github.com/s-kostyaev/ellama)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)

View File

@@ -72,7 +72,7 @@ func ClientFromEnvironment() (*Client, error) {
},
}
mockRequest, err := http.NewRequest("HEAD", client.base.String(), nil)
mockRequest, err := http.NewRequest(http.MethodHead, client.base.String(), nil)
if err != nil {
return nil, err
}
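
For reference, a minimal sketch of how the Go client entry point touched above is used outside the CLI (not part of this changeset; the tab-separated output is purely illustrative). It lists local models and prints their sizes with the `format.HumanBytes` helper that replaces go-humanize elsewhere in this comparison.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
)

func main() {
	// Build a client from OLLAMA_HOST (or the default http://localhost:11434).
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// List locally available models, as `ollama list` does.
	models, err := client.List(context.Background())
	if err != nil {
		log.Fatal(err)
	}

	for _, m := range models.Models {
		// e.g. "llama2:latest	3.8 GB"
		fmt.Printf("%s\t%s\n", m.Name, format.HumanBytes(m.Size))
	}
}
```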

View File

@@ -7,7 +7,7 @@ BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
# The final response object will include statistics and additional data from the request. Use the callback function to override
# the default handler.
def generate(model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
def generate(model_name, prompt, system=None, template=None, format="", context=None, options=None, callback=None):
try:
url = f"{BASE_URL}/api/generate"
payload = {
@@ -16,7 +16,8 @@ def generate(model_name, prompt, system=None, template=None, context=None, optio
"system": system,
"template": template,
"context": context,
"options": options
"options": options,
"format": format,
}
# Remove keys with None values

View File

@@ -37,10 +37,56 @@ type GenerateRequest struct {
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Stream *bool `json:"stream,omitempty"`
Raw bool `json:"raw,omitempty"`
Format string `json:"format"`
Options map[string]interface{} `json:"options"`
}
// Options specified in GenerateRequest, if you add a new option here add it to the API docs also
type Options struct {
Runner
// Predict options used at runtime
NumKeep int `json:"num_keep,omitempty"`
Seed int `json:"seed,omitempty"`
NumPredict int `json:"num_predict,omitempty"`
TopK int `json:"top_k,omitempty"`
TopP float32 `json:"top_p,omitempty"`
TFSZ float32 `json:"tfs_z,omitempty"`
TypicalP float32 `json:"typical_p,omitempty"`
RepeatLastN int `json:"repeat_last_n,omitempty"`
Temperature float32 `json:"temperature,omitempty"`
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
PresencePenalty float32 `json:"presence_penalty,omitempty"`
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
Mirostat int `json:"mirostat,omitempty"`
MirostatTau float32 `json:"mirostat_tau,omitempty"`
MirostatEta float32 `json:"mirostat_eta,omitempty"`
PenalizeNewline bool `json:"penalize_newline,omitempty"`
Stop []string `json:"stop,omitempty"`
}
// Runner options which must be set when the model is loaded into memory
type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGQA int `json:"num_gqa,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
EmbeddingOnly bool `json:"embedding_only,omitempty"`
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
NumThread int `json:"num_thread,omitempty"`
}
type EmbeddingRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
@@ -161,49 +207,6 @@ func (r *GenerateResponse) Summary() {
}
}
// Runner options which must be set when the model is loaded into memory
type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGQA int `json:"num_gqa,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
EmbeddingOnly bool `json:"embedding_only,omitempty"`
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
NumThread int `json:"num_thread,omitempty"`
}
type Options struct {
Runner
// Predict options used at runtime
NumKeep int `json:"num_keep,omitempty"`
Seed int `json:"seed,omitempty"`
NumPredict int `json:"num_predict,omitempty"`
TopK int `json:"top_k,omitempty"`
TopP float32 `json:"top_p,omitempty"`
TFSZ float32 `json:"tfs_z,omitempty"`
TypicalP float32 `json:"typical_p,omitempty"`
RepeatLastN int `json:"repeat_last_n,omitempty"`
Temperature float32 `json:"temperature,omitempty"`
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
PresencePenalty float32 `json:"presence_penalty,omitempty"`
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
Mirostat int `json:"mirostat,omitempty"`
MirostatTau float32 `json:"mirostat_tau,omitempty"`
MirostatEta float32 `json:"mirostat_eta,omitempty"`
PenalizeNewline bool `json:"penalize_newline,omitempty"`
Stop []string `json:"stop,omitempty"`
}
var ErrInvalidOpts = fmt.Errorf("invalid options")
func (opts *Options) FromMap(m map[string]interface{}) error {
@@ -293,7 +296,7 @@ func DefaultOptions() Options {
return Options{
// options set on request to runner
NumPredict: -1,
NumKeep: -1,
NumKeep: 0,
Temperature: 0.8,
TopK: 40,
TopP: 0.9,
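
To illustrate the new `Format` field end to end, here is a minimal sketch (not from this changeset) that posts a request shaped like `api.GenerateRequest` to `/api/generate` with `"format": "json"` and prints the streamed `response` fragments until `done` is true; the model name and prompt are placeholder values.

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Keys mirror the api.GenerateRequest fields above; "format": "json" enables JSON mode.
	body, err := json.Marshal(map[string]any{
		"model":  "llama2",
		"prompt": "What color is the sky at different times of the day? Respond using JSON",
		"format": "json",
	})
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The endpoint streams one JSON object per line until "done" is true.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var chunk struct {
			Response string `json:"response"`
			Done     bool   `json:"done"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &chunk); err != nil {
			panic(err)
		}
		fmt.Print(chunk.Response)
		if chunk.Done {
			fmt.Println()
			break
		}
	}
}
```

The same request can be made with the curl examples in the API documentation diff below.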

View File

@@ -1,7 +1,6 @@
package cmd
import (
"bufio"
"context"
"crypto/ed25519"
"crypto/rand"
@@ -11,6 +10,7 @@ import (
"io"
"log"
"net"
"net/http"
"os"
"os/exec"
"os/signal"
@@ -20,7 +20,6 @@ import (
"syscall"
"time"
"github.com/dustin/go-humanize"
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
"golang.org/x/crypto/ssh"
@@ -98,19 +97,16 @@ func RunHandler(cmd *cobra.Command, args []string) error {
return err
}
models, err := client.List(context.Background())
if err != nil {
return err
}
canonicalModelPath := server.ParseModelPath(args[0])
for _, model := range models.Models {
if model.Name == canonicalModelPath.GetShortTagname() {
return RunGenerate(cmd, args)
name := args[0]
// check if the model exists on the server
_, err = client.Show(context.Background(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, args); err != nil {
return err
}
}
if err := PullHandler(cmd, args); err != nil {
case err != nil:
return err
}
@@ -176,7 +172,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
for _, m := range models.Models {
if len(args) == 0 || strings.HasPrefix(m.Name, args[0]) {
data = append(data, []string{m.Name, m.Digest[:12], humanize.Bytes(uint64(m.Size)), format.HumanTime(m.ModifiedAt, "Never")})
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), format.HumanTime(m.ModifiedAt, "Never")})
}
}
@@ -352,34 +348,49 @@ func pull(model string, insecure bool) error {
}
func RunGenerate(cmd *cobra.Command, args []string) error {
if len(args) > 1 {
// join all args into a single prompt
wordWrap := false
if term.IsTerminal(int(os.Stdout.Fd())) {
wordWrap = true
}
format, err := cmd.Flags().GetString("format")
if err != nil {
return err
}
nowrap, err := cmd.Flags().GetBool("nowordwrap")
prompts := args[1:]
// prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) {
in, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
if nowrap {
wordWrap = false
}
return generate(cmd, args[0], strings.Join(args[1:], " "), wordWrap)
prompts = append([]string{string(in)}, prompts...)
}
if readline.IsTerminal(int(os.Stdin.Fd())) {
return generateInteractive(cmd, args[0])
// output is being piped
if !term.IsTerminal(int(os.Stdout.Fd())) {
return generate(cmd, args[0], strings.Join(prompts, " "), false, format)
}
return generateBatch(cmd, args[0])
wordWrap := os.Getenv("TERM") == "xterm-256color"
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
if nowrap {
wordWrap = false
}
// prompts are provided via stdin or args so don't enter interactive mode
if len(prompts) > 0 {
return generate(cmd, args[0], strings.Join(prompts, " "), wordWrap, format)
}
return generateInteractive(cmd, args[0], wordWrap, format)
}
type generateContextKey string
func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
func generate(cmd *cobra.Command, model, prompt string, wordWrap bool, format string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
@@ -395,7 +406,7 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
generateContext = []int{}
}
termWidth, _, err := term.GetSize(int(0))
termWidth, _, err := term.GetSize(int(os.Stdout.Fd()))
if err != nil {
wordWrap = false
}
@@ -416,7 +427,7 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
var currentLineLength int
var wordBuffer string
request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext, Format: format}
fn := func(response api.GenerateResponse) error {
if !spinner.IsFinished() {
spinner.Finish()
@@ -487,9 +498,9 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
return nil
}
func generateInteractive(cmd *cobra.Command, model string) error {
func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format string) error {
// load the model
if err := generate(cmd, model, "", false); err != nil {
if err := generate(cmd, model, "", false, ""); err != nil {
return err
}
@@ -510,6 +521,8 @@ func generateInteractive(cmd *cobra.Command, model string) error {
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
fmt.Fprintln(os.Stderr, "")
@@ -537,21 +550,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
return err
}
var wordWrap bool
termType := os.Getenv("TERM")
if termType == "xterm-256color" {
wordWrap = true
}
// override wrapping if the user turned it off
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
if nowrap {
wordWrap = false
}
fmt.Print(readline.StartBracketedPaste)
defer fmt.Printf(readline.EndBracketedPaste)
@@ -615,6 +613,16 @@ func generateInteractive(cmd *cobra.Command, model string) error {
case "quiet":
cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
} else {
format = args[2]
fmt.Printf("Set format to '%s' mode.\n", args[2])
}
case "noformat":
format = ""
fmt.Println("Disabled format.")
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
}
@@ -688,26 +696,13 @@ func generateInteractive(cmd *cobra.Command, model string) error {
}
if len(line) > 0 && line[0] != '/' {
if err := generate(cmd, model, line, wordWrap); err != nil {
if err := generate(cmd, model, line, wordWrap, format); err != nil {
return err
}
}
}
}
func generateBatch(cmd *cobra.Command, model string) error {
scanner := bufio.NewScanner(os.Stdin)
for scanner.Scan() {
prompt := scanner.Text()
fmt.Printf(">>> %s\n", prompt)
if err := generate(cmd, model, prompt, false); err != nil {
return err
}
}
return nil
}
func RunServer(cmd *cobra.Command, _ []string) error {
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
if err != nil {
@@ -731,21 +726,6 @@ func RunServer(cmd *cobra.Command, _ []string) error {
origins = strings.Split(o, ",")
}
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
if err := server.PruneLayers(); err != nil {
return err
}
manifestsPath, err := server.GetManifestPath()
if err != nil {
return err
}
if err := server.PruneDirectory(manifestsPath); err != nil {
return err
}
}
return server.Serve(ln, origins)
}
@@ -900,6 +880,7 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("verbose", false, "Show timings for response")
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
runCmd.Flags().String("format", "", "Response format (e.g. json)")
serveCmd := &cobra.Command{
Use: "serve",

View File

@@ -41,28 +41,36 @@ Generate a response for a given prompt with a provided model. This is a streamin
Advanced parameters (optional):
- `format`: the format to return a response in. Currently the only accepted value is `json`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system prompt (overrides what is defined in the `Modelfile`)
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
- `stream`: if `false` the response will be be returned as a single response object, rather than a stream of objects
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `raw`: if `true` no formatting will be applied to the prompt and no context will be returned. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.
### Request
### JSON mode
Enable JSON mode by setting the `format` parameter to `json` and instructing the model, in the `prompt`, to respond in JSON. This will structure the response as valid JSON. See the JSON mode [example](#request-json-mode) below.
### Examples
#### Request
```shell
curl -X POST http://localhost:11434/api/generate -d '{
"model": "llama2:7b",
"model": "llama2",
"prompt": "Why is the sky blue?"
}'
```
### Response
#### Response
A stream of JSON objects:
A stream of JSON objects is returned:
```json
{
"model": "llama2:7b",
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"response": "The",
"done": false
@@ -86,7 +94,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
```json
{
"model": "llama2:7b",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "",
"context": [1, 2, 3],
@@ -102,6 +110,182 @@ To calculate how fast the response is generated in tokens per second (token/s),
}
```
#### Request (No streaming)
```shell
curl -X POST http://localhost:11434/api/generate -d '{
"model": "llama2:7b",
"prompt": "Why is the sky blue?",
"stream": false
}'
```
#### Response
If `stream` is set to `false`, the response will be a single JSON object:
```json
{
"model": "llama2:7b",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"context": [1, 2, 3],
"done": true,
"total_duration": 5589157167,
"load_duration": 3013701500,
"sample_count": 114,
"sample_duration": 81442000,
"prompt_eval_count": 46,
"prompt_eval_duration": 1160282000,
"eval_count": 13,
"eval_duration": 1325948000
}
```
#### Request (Raw mode)
In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting and context.
```shell
curl -X POST http://localhost:11434/api/generate -d '{
"model": "mistral",
"prompt": "[INST] why is the sky blue? [/INST]",
"raw": true,
"stream": false
}'
```
#### Response
```json
{
"model": "mistral",
"created_at": "2023-11-03T15:36:02.583064Z",
"response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
"done": true,
"total_duration": 14648695333,
"load_duration": 3302671417,
"prompt_eval_count": 14,
"prompt_eval_duration": 286243000,
"eval_count": 129,
"eval_duration": 10931424000
}
```
#### Request (JSON mode)
```shell
curl -X POST http://localhost:11434/api/generate -d '{
"model": "llama2",
"prompt": "What color is the sky at different times of the day? Respond using JSON",
"format": "json",
"stream": false
}'
```
#### Response
```json
{
"model": "llama2",
"created_at": "2023-11-09T21:07:55.186497Z",
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
"done": true,
"total_duration": 4661289125,
"load_duration": 1714434500,
"prompt_eval_count": 36,
"prompt_eval_duration": 264132000,
"eval_count": 75,
"eval_duration": 2112149000
}
```
The value of `response` will be a string containing JSON similar to:
```json
{
"morning": {
"color": "blue"
},
"noon": {
"color": "blue-gray"
},
"afternoon": {
"color": "warm gray"
},
"evening": {
"color": "orange"
}
}
```
#### Request (With options)
If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override.
```shell
curl -X POST http://localhost:11434/api/generate -d '{
"model": "llama2:7b",
"prompt": "Why is the sky blue?",
"stream": false,
"options": {
"num_keep": 5,
"seed": 42,
"num_predict": 100,
"top_k": 20,
"top_p": 0.9,
"tfs_z": 0.5,
"typical_p": 0.7,
"repeat_last_n": 33,
"temperature": 0.8,
"repeat_penalty": 1.2,
"presence_penalty": 1.5,
"frequency_penalty": 1.0,
"mirostat": 1,
"mirostat_tau": 0.8,
"mirostat_eta": 0.6,
"penalize_newline": true,
"stop": ["\n", "user:"],
"numa": false,
"num_ctx": 4,
"num_batch": 2,
"num_gqa": 1,
"num_gpu": 1,
"main_gpu": 0,
"low_vram": false,
"f16_kv": true,
"logits_all": false,
"vocab_only": false,
"use_mmap": true,
"use_mlock": false,
"embedding_only": false,
"rope_frequency_base": 1.1,
"rope_frequency_scale": 0.8,
"num_thread": 8
}
}'
```
#### Response
```json
{
"model": "llama2:7b",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"context": [1, 2, 3],
"done": true,
"total_duration": 5589157167,
"load_duration": 3013701500,
"sample_count": 114,
"sample_duration": 81442000,
"prompt_eval_count": 46,
"prompt_eval_duration": 1160282000,
"eval_count": 13,
"eval_duration": 1325948000
}
```
## Create a Model
```shell
@@ -114,9 +298,11 @@ Create a model from a [`Modelfile`](./modelfile.md)
- `name`: name of the model to create
- `path`: path to the Modelfile
- `stream`: (optional) if `false` the response will be be returned as a single response object, rather than a stream of objects
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
### Request
### Examples
#### Request
```shell
curl -X POST http://localhost:11434/api/create -d '{
@@ -125,7 +311,7 @@ curl -X POST http://localhost:11434/api/create -d '{
}'
```
### Response
#### Response
A stream of JSON objects. When finished, `status` is `success`.
@@ -143,13 +329,17 @@ GET /api/tags
List models that are available locally.
### Request
### Examples
#### Request
```shell
curl http://localhost:11434/api/tags
```
### Response
#### Response
A single JSON object will be returned.
```json
{
@@ -180,7 +370,9 @@ Show details about a model including modelfile, template, parameters, license, a
- `name`: name of the model to show
### Request
### Examples
#### Request
```shell
curl http://localhost:11434/api/show -d '{
@@ -188,14 +380,14 @@ curl http://localhost:11434/api/show -d '{
}'
```
### Response
#### Response
```json
{
"license": "<contents of license block>",
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] <<SYS>>{{ .System }}<</SYS>>\n\n{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
"parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
"template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
"template": "[INST] <<SYS>>{{ .System }}<</SYS>>\n\n{{ .Prompt }} [/INST] "
}
```
@@ -207,7 +399,9 @@ POST /api/copy
Copy a model. Creates a model with another name from an existing model.
### Request
### Examples
#### Request
```shell
curl http://localhost:11434/api/copy -d '{
@@ -216,6 +410,10 @@ curl http://localhost:11434/api/copy -d '{
}'
```
#### Response
The only response is a 200 OK if successful.
## Delete a Model
```shell
@@ -226,9 +424,11 @@ Delete a model and its data.
### Parameters
- `model`: model name to delete
- `name`: model name to delete
### Request
### Examples
#### Request
```shell
curl -X DELETE http://localhost:11434/api/delete -d '{
@@ -236,6 +436,10 @@ curl -X DELETE http://localhost:11434/api/delete -d '{
}'
```
#### Response
If successful, the only response is a 200 OK.
## Pull a Model
```shell
@@ -248,9 +452,11 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
- `name`: name of the model to pull
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
- `stream`: (optional) if `false` the response will be be returned as a single response object, rather than a stream of objects
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
### Request
### Examples
#### Request
```shell
curl -X POST http://localhost:11434/api/pull -d '{
@@ -258,13 +464,51 @@ curl -X POST http://localhost:11434/api/pull -d '{
}'
```
### Response
#### Response
If `stream` is not specified, or set to `true`, a stream of JSON objects is returned:
The first object is the manifest:
```json
{
"status": "pulling manifest"
}
```
Then there is a series of downloading responses. Until a download is completed, the `completed` key may not be included. The number of files to be downloaded depends on the number of layers specified in the manifest.
```json
{
"status": "downloading digestname",
"digest": "digestname",
"total": 2142590208
"total": 2142590208,
"completed": 241970
}
```
After all the files are downloaded, the final responses are:
```json
{
"status": "verifying sha256 digest"
}
{
"status": "writing manifest"
}
{
"status": "removing any unused layers"
}
{
"status": "success"
}
```
If `stream` is set to `false`, then the response is a single JSON object:
```json
{
"status": "success"
}
```
@@ -280,9 +524,11 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
- `stream`: (optional) if `false` the response will be be returned as a single response object, rather than a stream of objects
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
### Request
### Examples
#### Request
```shell
curl -X POST http://localhost:11434/api/push -d '{
@@ -290,9 +536,9 @@ curl -X POST http://localhost:11434/api/push -d '{
}'
```
### Response
#### Response
Streaming response that starts with:
If `stream` is not specified, or set to `true`, a stream of JSON objects is returned:
```json
{ "status": "retrieving manifest" }
@@ -325,6 +571,12 @@ Finally, when the upload is complete:
{"status":"success"}
```
If `stream` is set to `false`, then the response is a single JSON object:
```json
{ "status": "success" }
```
## Generate Embeddings
```shell
@@ -342,7 +594,9 @@ Advanced parameters:
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
### Request
### Examples
#### Request
```shell
curl -X POST http://localhost:11434/api/embeddings -d '{
@@ -351,7 +605,7 @@ curl -X POST http://localhost:11434/api/embeddings -d '{
}'
```
### Response
#### Response
```json
{

View File

@@ -74,6 +74,25 @@ systemctl restart ollama
- macOS: Raw model data is stored under `~/.ollama/models`.
- Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`
Below the models directory you will find a structure similar to the following:
```shell
.
├── blobs
└── manifests
└── registry.ollama.ai
├── f0rodo
├── library
├── mattw
└── saikatkumardey
```
There is a `manifests/registry.ollama.ai/namespace` path. In the example above, the user has downloaded models from the official `library` namespace as well as the `f0rodo`, `mattw`, and `saikatkumardey` namespaces. Within each of those directories you will find a directory for each downloaded model, and inside it a file named for each tag. Each tag file is the manifest for the model.
The manifest lists all of the layers used in the model. You will see a `media type` for each layer, along with a digest. That digest corresponds with a file in the `models/blobs` directory.
### How can I change where Ollama stores models?
To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.

View File

@@ -185,7 +185,7 @@ python convert.py <path to model directory>
python convert-falcon-hf-to-gguf.py <path to model directory>
# GPTNeoXForCausalLM
python convert-falcon-hf-to-gguf.py <path to model directory>
python convert-gptneox-hf-to-gguf.py <path to model directory>
# GPTBigCodeForCausalLM
python convert-starcoder-hf-to-gguf.py <path to model directory>

View File

@@ -112,8 +112,8 @@ PARAMETER <parameter> <parametervalue>
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
| stop | Sets the stop sequences to use. | string | stop "AI assistant:" |
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 |
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
@@ -129,14 +129,11 @@ PARAMETER <parameter> <parametervalue>
| --------------- | ------------------------------------------------------------------------------------------------------------ |
| `{{ .System }}` | The system prompt used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
```modelfile
TEMPLATE """
{{- if .First }}
### System:
{{ .System }}
{{- end }}
### User:
{{ .Prompt }}

View File

@@ -4,5 +4,6 @@ Here is a list of ways you can use Ollama with other tools to build interesting
- [Using LangChain with Ollama in JavaScript](./tutorials/langchainjs.md)
- [Using LangChain with Ollama in Python](./tutorials/langchainpy.md)
- [Running Ollama on NVIDIA Jetson Devices](./tutorials/nvidia-jetson.md)
Also be sure to check out the [examples](../examples) directory for more ways to use Ollama.
Also be sure to check out the [examples](../examples) directory for more ways to use Ollama.

View File

@@ -23,13 +23,17 @@ const answer = await ollama.call(`why is the sky blue?`);
console.log(answer);
```
That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's build that part of the app.
That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
```bash
npm install cheerio
```
```javascript
import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";
const loader = new CheerioWebBaseLoader("https://en.wikipedia.org/wiki/2023_Hawaii_wildfires");
const data = loader.load();
const data = await loader.load();
```
That will load the document. Although this page is smaller than the Odyssey, it is certainly bigger than the context size for most LLMs. So we are going to need to split into smaller pieces, and then select just the pieces relevant to our question. This is a great use for a vector datastore. In this example, we will use the **MemoryVectorStore** that is part of **LangChain**. But there is one more thing we need to get the content into the datastore. We have to run an embeddings process that converts the tokens in the text into a series of vectors. And for that, we are going to use **Tensorflow**. There is a lot of stuff going on in this one. First, install the **Tensorflow** components that we need.

View File

@@ -0,0 +1,38 @@
# Running Ollama on NVIDIA Jetson Devices
With some minor configuration, Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/). The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack).
NVIDIA Jetson devices are Linux-based embedded AI computers that are purpose-built for AI applications.
Jetsons have an integrated GPU that is wired directly to the memory controller of the machine. For this reason, the `nvidia-smi` command is unrecognized, and Ollama proceeds to operate in "CPU only"
mode. This can be verified by using a monitoring tool like jtop.
In order to address this, we simply pass the path to the Jetson's pre-installed CUDA libraries into `ollama serve` (while in a tmux session). We then hardcode the num_gpu parameters into a cloned
version of our target model.
Prerequisites:
- curl
- tmux
Here are the steps:
- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.ai/install.sh | sh`
- Stop the Ollama service: `sudo systemctl stop ollama`
- Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson
'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`
- Pull the model you want to use (e.g. mistral): `ollama pull mistral`
- Create a new Modelfile specifically for enabling GPU support on the Jetson: `touch ModelfileMistralJetson`
- In the ModelfileMistralJetson file, specify the FROM model and the num_gpu PARAMETER as shown below:
```
FROM mistral
PARAMETER num_gpu 999
```
- Create a new model from your Modelfile: `ollama create mistral-jetson -f ./ModelfileMistralJetson`
- Run the new model: `ollama run mistral-jetson`
If you run a monitoring tool like jtop you should now see that Ollama is using the Jetson's integrated GPU.
And that's it!

View File

@@ -0,0 +1,10 @@
# Bash Shell examples
When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
`ollama run llama2 < sourcequestions.txt`
This concept is used in the following example.
## Compare Models
`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models you choose that you have already pulled from the Ollama library or have created locally.

View File

@@ -0,0 +1,64 @@
#! /usr/bin/env bash
# Compare multiple models by running them with the same questions
NUMBEROFCHOICES=4
SELECTIONS=()
declare -a SUMS=()
# Get the list of models
CHOICES=$(ollama list | awk '{print $1}')
# Select which models to run as a comparison
echo "Select $NUMBEROFCHOICES models to compare:"
select ITEM in $CHOICES; do
if [[ -n $ITEM ]]; then
echo "You have selected $ITEM"
SELECTIONS+=("$ITEM")
((COUNT++))
if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then
break
fi
else
echo "Invalid selection"
fi
done
# Loop through each of the selected models
for ITEM in "${SELECTIONS[@]}"; do
echo "--------------------------------------------------------------"
echo "Loading the model $ITEM into memory"
ollama run "$ITEM" ""
echo "--------------------------------------------------------------"
echo "Running the questions through the model $ITEM"
COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr)
# eval duration is sometimes listed in seconds and sometimes in milliseconds.
# Add up the values for each model
SUM=$(echo "$COMMAND_OUTPUT" | awk '
/eval duration:/ {
value = $3
if (index(value, "ms") > 0) {
gsub("ms", "", value)
value /= 1000
} else {
gsub("s", "", value)
}
sum += value
}
END { print sum }')
SUMS+=("All questions for $ITEM completed in $SUM seconds")
done
echo ""
echo "--------------------------------------------------------------"
echo -e "Sums of eval durations for each run:"
for val in "${SUMS[@]}"; do
echo "$val"
done
echo "--------------------------------------------------------------"
echo "Comparison complete. Now you can decide"
echo "which model is best."
echo "--------------------------------------------------------------"

View File

@@ -0,0 +1,7 @@
Why is the sky blue
What is a black hole
Explain the big bang theory like I am 5?
What is the quickest way to win a game of Monopoly with 3 others?
Why does a vacuum bottle keep my coffee hot and my milkshake cold?
What is the difference between a meteor, a meteorite, and a meteoroid?
Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust.

View File

@@ -0,0 +1,36 @@
# Deploy Ollama to Kubernetes
## Prerequisites
- Ollama: https://ollama.ai/download
- Kubernetes cluster. This example will use Google Kubernetes Engine.
## Steps
1. Create the Ollama namespace, daemon set, and service
```bash
kubectl apply -f cpu.yaml
```
1. Port forward the Ollama service to connect and use it locally
```bash
kubectl -n ollama port-forward service/ollama 11434:80
```
1. Pull and run a model, for example `orca-mini:3b`
```bash
ollama run orca-mini:3b
```
## (Optional) Hardware Acceleration
Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin). Follow the link for more details.
Once configured, create a GPU enabled Ollama deployment.
```bash
kubectl apply -f gpu.yaml
```

View File

@@ -0,0 +1,42 @@
---
apiVersion: v1
kind: Namespace
metadata:
name: ollama
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: ollama
namespace: ollama
spec:
selector:
matchLabels:
name: ollama
template:
metadata:
labels:
name: ollama
spec:
containers:
- name: ollama
image: ollama/ollama:latest
ports:
- name: http
containerPort: 11434
protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
name: ollama
namespace: ollama
spec:
type: ClusterIP
selector:
name: ollama
ports:
- port: 80
name: http
targetPort: http
protocol: TCP

View File

@@ -0,0 +1,56 @@
---
apiVersion: v1
kind: Namespace
metadata:
name: ollama
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: ollama
namespace: ollama
spec:
strategy:
type: Recreate
selector:
matchLabels:
name: ollama
template:
metadata:
labels:
name: ollama
spec:
containers:
- name: ollama
image: ollama/ollama:latest
env:
- name: PATH
value: /usr/local/nvidia/bin:/usr/local/nvidia/lib64:/usr/bin:/usr/sbin:/bin:/sbin
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
ports:
- name: http
containerPort: 11434
protocol: TCP
resources:
limits:
nvidia.com/gpu: 1
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
---
apiVersion: v1
kind: Service
metadata:
name: ollama
namespace: ollama
spec:
type: ClusterIP
selector:
name: ollama
ports:
- port: 80
name: http
targetPort: http
protocol: TCP

View File

@@ -6,7 +6,6 @@ PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY', 'db')
# Define the Chroma settings
CHROMA_SETTINGS = Settings(
chroma_db_impl='duckdb+parquet',
persist_directory=PERSIST_DIRECTORY,
anonymized_telemetry=False
)

View File

@@ -150,7 +150,7 @@ def main():
print("Creating new vectorstore")
texts = process_documents()
print(f"Creating embeddings. May take some minutes...")
db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
db.persist()
db = None

View File

@@ -4,6 +4,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.llms import Ollama
import chromadb
import os
import argparse
import time
@@ -22,7 +23,9 @@ def main():
# Parse the command line arguments
args = parse_arguments()
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
# activate/deactivate the streaming StdOut callback for LLMs
callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]

File diff suppressed because it is too large

View File

@@ -3,10 +3,8 @@
FROM orca
TEMPLATE """
{{- if .First }}
### System:
{{ .System }}
{{- end }}
### User:
I hate it when my phone dies
### Response:

View File

@@ -3,10 +3,8 @@
This is a simple sentiment analyzer using the Orca model. When you pull Orca from the registry, it has a Template already defined that looks like this:
```Modelfile
{{- if .First }}
### System:
{{ .System }}
{{- end }}
### User:
{{ .Prompt }}

View File

@@ -17,7 +17,7 @@ def generate(prompt, context):
for line in r.iter_lines():
body = json.loads(line)
response_part = body.get('response', '')
# the response streams one token at a time, print that as we recieve it
# the response streams one token at a time, print that as we receive it
print(response_part, end='', flush=True)
if 'error' in body:
@@ -35,4 +35,4 @@ def main():
print()
if __name__ == "__main__":
main()
main()

View File

@@ -12,11 +12,11 @@ const (
func HumanBytes(b int64) string {
switch {
case b > GigaByte:
return fmt.Sprintf("%d GB", b/GigaByte)
return fmt.Sprintf("%.1f GB", float64(b)/GigaByte)
case b > MegaByte:
return fmt.Sprintf("%d MB", b/MegaByte)
return fmt.Sprintf("%.1f MB", float64(b)/MegaByte)
case b > KiloByte:
return fmt.Sprintf("%d KB", b/KiloByte)
return fmt.Sprintf("%.1f KB", float64(b)/KiloByte)
default:
return fmt.Sprintf("%d B", b)
}
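The switch from `%d` with integer division to `%.1f` with `float64` division keeps sub-unit precision: a 1.5 GB value previously rendered as "1 GB" and now renders as "1.5 GB". A minimal sketch of the before/after behaviour; the constant below only mirrors the one in the `format` package for the sake of a self-contained example (whether the base is 1000 or 1024 does not change the point):

```go
package main

import "fmt"

// Stand-in for format.GigaByte so the example compiles on its own.
const GigaByte = 1000 * 1000 * 1000

func main() {
	var b int64 = 1500 * 1000 * 1000 // roughly 1.5 GB

	// old behaviour: integer division truncates
	fmt.Printf("%d GB\n", b/GigaByte) // 1 GB

	// new behaviour: one decimal of precision
	fmt.Printf("%.1f GB\n", float64(b)/GigaByte) // 1.5 GB
}
```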

25
format/format.go Normal file
View File

@@ -0,0 +1,25 @@
package format
import (
"fmt"
"math"
)
const (
Thousand = 1000
Million = Thousand * 1000
Billion = Million * 1000
)
func HumanNumber(b uint64) string {
switch {
case b > Billion:
return fmt.Sprintf("%.0fB", math.Round(float64(b)/Billion))
case b > Million:
return fmt.Sprintf("%.0fM", math.Round(float64(b)/Million))
case b > Thousand:
return fmt.Sprintf("%.0fK", math.Round(float64(b)/Thousand))
default:
return fmt.Sprintf("%d", b)
}
}
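`HumanNumber` is what turns a raw parameter count into the familiar "7B"/"13B" style label. A short usage sketch; the numbers are illustrative only:

```go
package main

import (
	"fmt"

	"github.com/jmorganca/ollama/format"
)

func main() {
	fmt.Println(format.HumanNumber(7_241_732_096))  // "7B"
	fmt.Println(format.HumanNumber(13_015_864_320)) // "13B"
	fmt.Println(format.HumanNumber(125_000_000))    // "125M"
	fmt.Println(format.HumanNumber(512))            // "512"
}
```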

2
go.mod
View File

@@ -3,7 +3,6 @@ module github.com/jmorganca/ollama
go 1.20
require (
github.com/dustin/go-humanize v1.0.1
github.com/emirpasic/gods v1.18.1
github.com/gin-gonic/gin v1.9.1
github.com/mattn/go-runewidth v0.0.14
@@ -11,7 +10,6 @@ require (
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
golang.org/x/sync v0.3.0
gonum.org/v1/gonum v0.14.0
)
require github.com/rivo/uniseg v0.2.0 // indirect

4
go.sum
View File

@@ -9,8 +9,6 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
@@ -140,8 +138,6 @@ golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=

View File

@@ -5,6 +5,8 @@ import (
"encoding/binary"
"fmt"
"io"
"github.com/jmorganca/ollama/format"
)
type containerGGUF struct {
@@ -21,6 +23,8 @@ type containerGGUF struct {
NumTensor uint64
NumKV uint64
}
parameters uint64
}
func (c *containerGGUF) Name() string {
@@ -75,6 +79,14 @@ func newGGUFModel(container *containerGGUF) *ggufModel {
}
}
func (llm *ggufModel) NumTensor() uint64 {
if llm.Version == 1 {
return uint64(llm.V1.NumTensor)
}
return llm.V2.NumTensor
}
func (llm *ggufModel) NumKV() uint64 {
if llm.Version == 1 {
return uint64(llm.V1.NumKV)
@@ -93,6 +105,10 @@ func (llm *ggufModel) ModelFamily() string {
}
func (llm *ggufModel) ModelType() string {
if llm.parameters > 0 {
return format.HumanNumber(llm.parameters)
}
switch llm.ModelFamily() {
case "llama":
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
@@ -127,13 +143,9 @@ func (llm *ggufModel) FileType() string {
}
func (llm *ggufModel) Decode(r io.Reader) error {
read := llm.readString
if llm.Version == 1 {
read = llm.readStringV1
}
// decode key-values
for i := 0; uint64(i) < llm.NumKV(); i++ {
k, err := read(r)
k, err := llm.readString(r)
if err != nil {
return err
}
@@ -165,24 +177,14 @@ func (llm *ggufModel) Decode(r io.Reader) error {
case ggufTypeBool:
v = llm.readBool(r)
case ggufTypeString:
fn := llm.readString
if llm.Version == 1 {
fn = llm.readStringV1
}
s, err := fn(r)
s, err := llm.readString(r)
if err != nil {
return err
}
v = s
case ggufTypeArray:
fn := llm.readArray
if llm.Version == 1 {
fn = llm.readArrayV1
}
a, err := fn(r)
a, err := llm.readArray(r)
if err != nil {
return err
}
@@ -195,6 +197,25 @@ func (llm *ggufModel) Decode(r io.Reader) error {
llm.kv[k] = v
}
// decode tensors
for i := 0; uint64(i) < llm.NumTensor(); i++ {
if _, err := llm.readString(r); err != nil {
return err
}
dimensions := llm.readU32(r)
var elements uint64 = 1
for i := 0; uint32(i) < dimensions; i++ {
elements *= llm.readU64(r)
}
llm.readU32(r) // type
llm.readU64(r) // offset
llm.parameters += elements
}
return nil
}
@@ -290,6 +311,10 @@ func (llm ggufModel) readStringV1(r io.Reader) (string, error) {
}
func (llm ggufModel) readString(r io.Reader) (string, error) {
if llm.Version == 1 {
return llm.readStringV1(r)
}
var nameLength uint64
binary.Read(r, llm.bo, &nameLength)
@@ -339,6 +364,10 @@ func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
}
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
if llm.Version == 1 {
return llm.readArrayV1(r)
}
atype := llm.readU32(r)
n := llm.readU64(r)
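The new tensor loop in `Decode` is where the parameter count comes from: each tensor contributes the product of its dimensions, the per-tensor products are summed into `llm.parameters`, and `ModelType` then renders that sum with `format.HumanNumber`. A minimal sketch of the same arithmetic using made-up tensor shapes, not taken from any real model:

```go
package main

import "fmt"

func main() {
	// Hypothetical tensor shapes (each row is one tensor's dimensions).
	tensors := [][]uint64{
		{4096, 32000}, // token embeddings
		{4096, 4096},  // an attention projection
		{11008, 4096}, // a feed-forward weight
	}

	var parameters uint64
	for _, dims := range tensors {
		elements := uint64(1)
		for _, d := range dims {
			elements *= d
		}
		parameters += elements
	}

	fmt.Println(parameters) // 192937984, which HumanNumber would render as "193M"
}
```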

View File

@@ -27,6 +27,34 @@ import (
"github.com/jmorganca/ollama/format"
)
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
//go:embed llama.cpp/*/build/*/bin/*
var llamaCppEmbed embed.FS
@@ -196,7 +224,10 @@ type llama struct {
Running
}
var errNoGPU = errors.New("nvidia-smi command failed")
var (
errNvidiaSMI = errors.New("nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
)
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
@@ -205,7 +236,7 @@ func CheckVRAM() (int64, error) {
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoGPU
return 0, errNvidiaSMI
}
var freeMiB int64
@@ -226,8 +257,8 @@ func CheckVRAM() (int64, error) {
freeBytes := freeMiB * 1024 * 1024
if freeBytes < 2*format.GigaByte {
log.Printf("less than 2 GB VRAM available, falling back to CPU only")
freeMiB = 0
log.Printf("less than 2 GB VRAM available")
return 0, errAvailableVRAM
}
return freeBytes, nil
@@ -240,7 +271,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if runtime.GOOS == "linux" {
freeBytes, err := CheckVRAM()
if err != nil {
if err.Error() != "nvidia-smi command failed" {
if !errors.Is(err, errNvidiaSMI) {
log.Print(err.Error())
}
// nvidia driver not installed or no nvidia GPU found
@@ -306,13 +337,19 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
params := []string{
"--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
"--embedding",
}
if opts.RopeFrequencyBase > 0 {
params = append(params, "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase))
}
if opts.RopeFrequencyScale > 0 {
params = append(params, "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale))
}
if opts.NumGQA > 0 {
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
}
@@ -360,7 +397,15 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
runner.Path,
append(params, "--port", strconv.Itoa(port))...,
)
cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
var libraryPaths []string
if libraryPath, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
libraryPaths = append(libraryPaths, libraryPath)
}
libraryPaths = append(libraryPaths, filepath.Dir(runner.Path))
cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", strings.Join(libraryPaths, ":")))
cmd.Stdout = os.Stderr
statusWriter := NewStatusWriter()
cmd.Stderr = statusWriter
@@ -480,7 +525,7 @@ type prediction struct {
const maxBufferSize = 512 * format.KiloByte
func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, format string, fn func(api.GenerateResponse)) error {
prevConvo, err := llm.Decode(ctx, prevContext)
if err != nil {
return err
@@ -515,6 +560,10 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
"stop": llm.Stop,
}
if format == "json" {
request["grammar"] = jsonGrammar
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
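With the grammar wired into the completion request, any client that sets `"format": "json"` on `/api/generate` gets output constrained to valid JSON by llama.cpp's grammar-based sampling. A minimal client sketch against a local server; `llama2` and the prompt are placeholder values:

```go
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	payload := `{"model": "llama2", "prompt": "List three colors as a JSON array.", "format": "json"}`

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewBufferString(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Responses stream as one JSON object per line, each carrying a "response" fragment.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		fmt.Println(scanner.Text())
	}
}
```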

View File

@@ -14,7 +14,7 @@ import (
)
type LLM interface {
Predict(context.Context, []int, string, func(api.GenerateResponse)) error
Predict(context.Context, []int, string, string, func(api.GenerateResponse)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
@@ -85,7 +85,10 @@ func New(workDir, model string, adapters []string, opts api.Options) (LLM, error
switch ggml.Name() {
case "gguf":
opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
// TODO: gguf will load these options automatically from the model binary
opts.NumGQA = 0
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
case "ggml", "ggmf", "ggjt", "ggla":
return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)

View File

@@ -291,7 +291,7 @@ func OptionShowDescriptionAtLineEnd() Option {
}
}
var defaultTheme = Theme{Saucer: "█", SaucerPadding: " ", BarStart: "|", BarEnd: "|"}
var defaultTheme = Theme{Saucer: "█", SaucerPadding: " ", BarStart: "", BarEnd: ""}
// NewOptions constructs a new instance of ProgressBar, with any options you specify
func NewOptions(max int, options ...Option) *ProgressBar {

View File

@@ -2,6 +2,7 @@ package readline
import (
"fmt"
"os"
"github.com/emirpasic/gods/lists/arraylist"
"golang.org/x/term"
@@ -17,7 +18,8 @@ type Buffer struct {
}
func NewBuffer(prompt *Prompt) (*Buffer, error) {
width, height, err := term.GetSize(0)
fd := int(os.Stdout.Fd())
width, height, err := term.GetSize(fd)
if err != nil {
fmt.Println("Error getting size:", err)
return nil, err
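Querying the stdout descriptor instead of fd 0 matters for the `ollama run model < file` case in this change set: stdin is then a pipe, but stdout can still be the terminal whose width and height the prompt rendering needs. A small sketch of the same call outside the `readline` package:

```go
package main

import (
	"fmt"
	"os"

	"golang.org/x/term"
)

func main() {
	// Query the terminal attached to stdout, which survives `cmd < file` redirection of stdin.
	width, height, err := term.GetSize(int(os.Stdout.Fd()))
	if err != nil {
		fmt.Println("stdout is not a terminal:", err)
		return
	}
	fmt.Printf("terminal is %d columns x %d rows\n", width, height)
}
```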

View File

@@ -51,11 +51,12 @@ func (i *Instance) Readline() (string, error) {
}
fmt.Print(prompt)
termios, err := SetRawMode(syscall.Stdin)
fd := int(syscall.Stdin)
termios, err := SetRawMode(fd)
if err != nil {
return "", err
}
defer UnsetRawMode(syscall.Stdin, termios)
defer UnsetRawMode(fd, termios)
buf, _ := NewBuffer(i.Prompt)

View File

@@ -1,5 +1,4 @@
//go:build aix || darwin || dragonfly || freebsd || (linux && !appengine) || netbsd || openbsd || os400 || solaris
// +build aix darwin dragonfly freebsd linux,!appengine netbsd openbsd os400 solaris
package readline

View File

@@ -1,4 +1,5 @@
//go:build darwin || freebsd || netbsd || openbsd
package readline
import (

View File

@@ -1,4 +1,5 @@
//go:build linux || solaris
package readline
import (

62
readline/term_windows.go Normal file
View File

@@ -0,0 +1,62 @@
package readline
import (
"syscall"
"unsafe"
)
const (
enableLineInput = 2
enableWindowInput = 8
enableMouseInput = 16
enableInsertMode = 32
enableQuickEditMode = 64
enableExtendedFlags = 128
enableProcessedOutput = 1
enableWrapAtEolOutput = 2
enableAutoPosition = 256 // Cursor position is not affected by writing data to the console.
enableEchoInput = 4 // Characters are written to the console as they're read.
enableProcessedInput = 1 // Enables input processing (like recognizing Ctrl+C).
)
var kernel32 = syscall.NewLazyDLL("kernel32.dll")
var (
procGetConsoleMode = kernel32.NewProc("GetConsoleMode")
procSetConsoleMode = kernel32.NewProc("SetConsoleMode")
)
type State struct {
mode uint32
}
// IsTerminal checks if the given file descriptor is associated with a terminal
func IsTerminal(fd int) bool {
var st uint32
r, _, e := syscall.SyscallN(procGetConsoleMode.Addr(), uintptr(fd), uintptr(unsafe.Pointer(&st)), 0)
// if the call succeeds and doesn't produce an error, it's a terminal
return r != 0 && e == 0
}
func SetRawMode(fd int) (*State, error) {
var st uint32
// retrieve the current mode of the terminal
_, _, e := syscall.SyscallN(procGetConsoleMode.Addr(), uintptr(fd), uintptr(unsafe.Pointer(&st)), 0)
if e != 0 {
return nil, error(e)
}
// modify the mode to set it to raw
raw := st &^ (enableEchoInput | enableProcessedInput | enableLineInput | enableProcessedOutput)
// apply the new mode to the terminal
_, _, e = syscall.SyscallN(procSetConsoleMode.Addr(), uintptr(fd), uintptr(raw), 0)
if e != 0 {
return nil, error(e)
}
// return the original state so that it can be restored later
return &State{st}, nil
}
func UnsetRawMode(fd int, state *State) error {
_, _, err := syscall.SyscallN(procSetConsoleMode.Addr(), uintptr(fd), uintptr(state.mode), 0)
return err
}
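A sketch of how these Windows helpers would typically be used: save the console mode, put the console into raw mode for the duration of key reading, and restore it on exit. The import path assumes the package lives at `github.com/jmorganca/ollama/readline`, as elsewhere in this change set:

```go
//go:build windows

package main

import (
	"fmt"
	"os"

	"github.com/jmorganca/ollama/readline"
)

func main() {
	fd := int(os.Stdin.Fd())
	if !readline.IsTerminal(fd) {
		fmt.Println("stdin is not a console")
		return
	}

	// Disable echo, line buffering and Ctrl+C processing while reading keys.
	state, err := readline.SetRawMode(fd)
	if err != nil {
		panic(err)
	}
	defer readline.UnsetRawMode(fd, state)

	// Read a single byte in raw mode.
	b := make([]byte, 1)
	os.Stdin.Read(b)
	fmt.Printf("\r\nread byte: %q\r\n", b[0])
}
```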

View File

@@ -63,7 +63,10 @@ status "Installing ollama to $BINDIR..."
$SUDO install -o0 -g0 -m755 -d $BINDIR
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
install_success() { status 'Install complete. Run "ollama" from the command line.'; }
install_success() {
status 'The Ollama API is now available at 0.0.0.0:11434.'
status 'Install complete. Run "ollama" from the command line.'
}
trap install_success EXIT
# Everything from this point onwards is optional.
@@ -130,6 +133,7 @@ if check_gpu nvidia-smi; then
fi
if ! check_gpu lspci && ! check_gpu lshw; then
install_success
warning "No NVIDIA GPU detected. Ollama will run in CPU-only mode."
exit 0
fi
@@ -176,7 +180,7 @@ install_cuda_driver_apt() {
case $1 in
debian)
status 'Enabling contrib sources...'
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | sudo tee /etc/apt/sources.list.d/contrib.list > /dev/null
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | $SUDO tee /etc/apt/sources.list.d/contrib.list > /dev/null
;;
esac

View File

@@ -91,7 +91,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
}
s := SignatureData{
Method: "GET",
Method: http.MethodGet,
Path: redirectURL.String(),
Data: nil,
}
@@ -103,7 +103,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
headers := make(http.Header)
headers.Set("Authorization", sig)
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, nil)
resp, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil)
if err != nil {
log.Printf("couldn't get token: %q", err)
return "", err

View File

@@ -89,17 +89,12 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *R
}
if len(b.Parts) == 0 {
resp, err := makeRequest(ctx, "HEAD", requestURL, nil, nil, opts)
resp, err := makeRequestWithRetry(ctx, http.MethodHead, requestURL, nil, nil, opts)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode >= http.StatusBadRequest {
body, _ := io.ReadAll(resp.Body)
return fmt.Errorf("registry responded with code %d: %v", resp.StatusCode, string(body))
}
b.Total, _ = strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)
var size = b.Total / numDownloadParts
@@ -134,7 +129,6 @@ func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *Regis
func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *RegistryOptions) error {
defer blobDownloadManager.Delete(b.Digest)
ctx, b.CancelFunc = context.WithCancel(ctx)
file, err := os.OpenFile(b.Name+"-partial", os.O_CREATE|os.O_RDWR, 0644)
@@ -155,9 +149,10 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
i := i
g.Go(func() error {
var err error
for try := 0; try < maxRetries; try++ {
w := io.NewOffsetWriter(file, part.StartsAt())
err := b.downloadChunk(inner, requestURL, w, part, opts)
err = b.downloadChunk(inner, requestURL, w, part, opts)
switch {
case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
// return immediately if the context is canceled or the device is out of space
@@ -166,11 +161,14 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
log.Printf("%s part %d attempt %d failed: %v, retrying", b.Digest[7:19], i, try, err)
continue
default:
if try > 0 {
log.Printf("%s part %d completed after %d retries", b.Digest[7:19], i, try)
}
return nil
}
}
return errors.New("max retries exceeded")
return fmt.Errorf("%w: %w", errMaxRetriesExceeded, err)
})
}
@@ -200,14 +198,14 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *RegistryOptions) error {
headers := make(http.Header)
headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
resp, err := makeRequest(ctx, "GET", requestURL, headers, nil, opts)
resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts)
if err != nil {
return err
}
defer resp.Body.Close()
n, err := io.Copy(w, io.TeeReader(resp.Body, b))
if err != nil && !errors.Is(err, context.Canceled) {
if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
// rollback progress
b.Completed.Add(-n)
return err
@@ -218,7 +216,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
return err
}
// return nil or context.Canceled
// return nil or context.Canceled or UnexpectedEOF (resumable)
return err
}
@@ -308,6 +306,8 @@ type downloadOpts struct {
const maxRetries = 3
var errMaxRetriesExceeded = errors.New("max retries exceeded")
// downloadBlob downloads a blob from the registry and stores it in the blobs directory
func downloadBlob(ctx context.Context, opts downloadOpts) error {
fp, err := GetBlobsPath(opts.digest)
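The retry loop now treats `io.ErrUnexpectedEOF` as resumable: the chunk writer has already counted the bytes that made it to disk, so the next attempt only needs to re-request the remaining byte range instead of rolling progress back. A standalone sketch of that pattern, assuming a server that honours `Range` requests; the URL and byte range are placeholders:

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
)

// downloadRange appends to w, resuming after io.ErrUnexpectedEOF by
// re-requesting only the bytes that have not been written yet.
func downloadRange(url string, w io.Writer, start, stop int64) error {
	const maxRetries = 3
	offset := start
	for try := 0; try < maxRetries; try++ {
		req, err := http.NewRequest(http.MethodGet, url, nil)
		if err != nil {
			return err
		}
		req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", offset, stop))

		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			return err
		}
		n, err := io.Copy(w, resp.Body)
		resp.Body.Close()
		offset += n // bytes already written count as progress

		switch {
		case err == nil:
			return nil
		case errors.Is(err, io.ErrUnexpectedEOF):
			continue // connection dropped mid-chunk: resume from offset
		default:
			return err
		}
	}
	return errors.New("max retries exceeded")
}

func main() {
	err := downloadRange("http://localhost:11434/", os.Stdout, 0, 1023)
	fmt.Fprintln(os.Stderr, "done:", err)
}
```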

View File

@@ -60,18 +60,12 @@ func (m *Model) Prompt(request api.GenerateRequest) (string, error) {
}
var vars struct {
First bool
System string
Prompt string
// deprecated: versions <= 0.0.7 used this to omit the system prompt
Context []int
}
vars.First = len(request.Context) == 0
vars.System = m.System
vars.Prompt = request.Prompt
vars.Context = request.Context
if request.System != "" {
vars.System = request.System
@@ -401,7 +395,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
if err != nil {
return err
}
newLayer.From = mp.GetNamespaceRepository()
newLayer.From = mp.GetShortTagname()
layers = append(layers, newLayer)
}
}
@@ -981,46 +975,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
layers = append(layers, &manifest.Config)
for _, layer := range layers {
exists, err := checkBlobExistence(ctx, mp, layer.Digest, regOpts)
if err != nil {
return err
}
if exists {
fn(api.ProgressResponse{
Status: "using existing layer",
Digest: layer.Digest,
Total: layer.Size,
Completed: layer.Size,
})
log.Printf("Layer %s already exists", layer.Digest)
continue
}
fn(api.ProgressResponse{
Status: "starting upload",
Digest: layer.Digest,
Total: layer.Size,
})
location, chunkSize, err := startUpload(ctx, mp, layer, regOpts)
if err != nil {
log.Printf("couldn't start upload: %v", err)
return err
}
if strings.HasPrefix(filepath.Base(location.Path), "sha256:") {
layer.Digest = filepath.Base(location.Path)
fn(api.ProgressResponse{
Status: "using existing layer",
Digest: layer.Digest,
Total: layer.Size,
Completed: layer.Size,
})
continue
}
if err := uploadBlob(ctx, location, layer, chunkSize, regOpts, fn); err != nil {
if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
log.Printf("error uploading blob: %v", err)
return err
}
@@ -1037,7 +992,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
headers := make(http.Header)
headers.Set("Content-Type", "application/vnd.docker.distribution.manifest.v2+json")
resp, err := makeRequestWithRetry(ctx, "PUT", requestURL, headers, bytes.NewReader(manifestJSON), regOpts)
resp, err := makeRequestWithRetry(ctx, http.MethodPut, requestURL, headers, bytes.NewReader(manifestJSON), regOpts)
if err != nil {
return err
}
@@ -1159,22 +1114,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *RegistryOptio
headers := make(http.Header)
headers.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
resp, err := makeRequest(ctx, "GET", requestURL, headers, nil, regOpts)
resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, regOpts)
if err != nil {
log.Printf("couldn't get manifest: %v", err)
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode >= http.StatusBadRequest {
if resp.StatusCode == http.StatusNotFound {
return nil, fmt.Errorf("model not found")
}
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("on pull registry responded with code %d: %s", resp.StatusCode, body)
}
var m *ManifestV2
if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
return nil, err
@@ -1218,24 +1163,7 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
}
// Function to check if a blob already exists in the Docker registry
func checkBlobExistence(ctx context.Context, mp ModelPath, digest string, regOpts *RegistryOptions) (bool, error) {
requestURL := mp.BaseURL()
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs", digest)
resp, err := makeRequest(ctx, "HEAD", requestURL, nil, nil, regOpts)
if err != nil {
log.Printf("couldn't check for blob: %v", err)
return false, err
}
defer resp.Body.Close()
// Check for success: If the blob exists, the Docker registry will respond with a 200 OK
return resp.StatusCode < http.StatusBadRequest, nil
}
func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *RegistryOptions) (*http.Response, error) {
var status string
for try := 0; try < maxRetries; try++ {
resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
if err != nil {
@@ -1243,8 +1171,6 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
return nil, err
}
status = resp.Status
switch {
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
@@ -1256,21 +1182,25 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
regOpts.Token = token
if body != nil {
if _, err := body.Seek(0, io.SeekStart); err != nil {
return nil, err
}
body.Seek(0, io.SeekStart)
}
continue
case resp.StatusCode == http.StatusNotFound:
return nil, os.ErrNotExist
case resp.StatusCode >= http.StatusBadRequest:
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
}
return nil, fmt.Errorf("%d: %s", resp.StatusCode, body)
default:
return resp, nil
}
}
return nil, fmt.Errorf("max retry exceeded: %v", status)
return nil, errMaxRetriesExceeded
}
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {

View File

@@ -158,9 +158,17 @@ func GenerateHandler(c *gin.Context) {
return
}
if req.Model == "" {
// validate the request
switch {
case req.Model == "":
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
return
case len(req.Format) > 0 && req.Format != "json":
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be json"})
return
case req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
return
}
model, err := GetModel(req.Model)
@@ -189,10 +197,13 @@ func GenerateHandler(c *gin.Context) {
checkpointLoaded := time.Now()
prompt, err := model.Prompt(req)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
prompt := req.Prompt
if !req.Raw {
prompt, err = model.Prompt(req)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
}
ch := make(chan any)
@@ -215,10 +226,15 @@ func GenerateHandler(c *gin.Context) {
r.LoadDuration = checkpointLoaded.Sub(checkpointStart)
}
if req.Raw {
// in raw mode the client must manage history on their own
r.Context = nil
}
ch <- r
}
if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, req.Format, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -365,7 +381,9 @@ func PushModelHandler(c *gin.Context) {
Insecure: req.Insecure,
}
ctx := context.Background()
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
if err := PushModel(ctx, req.Name, regOpts, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
@@ -614,6 +632,22 @@ var defaultAllowOrigins = []string{
}
func Serve(ln net.Listener, allowOrigins []string) error {
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
return err
}
manifestsPath, err := GetManifestPath()
if err != nil {
return err
}
if err := PruneDirectory(manifestsPath); err != nil {
return err
}
}
config := cors.DefaultConfig()
config.AllowWildcard = true
@@ -679,7 +713,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Printf("Warning: GPU support may not enabled, check you have installed install GPU drivers: %v", err)
log.Printf("Warning: GPU support may not be enabled, check you have installed GPU drivers: %v", err)
}
}
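The new `raw` flag lets a client bypass the model template entirely: the prompt goes to the runner verbatim, the server rejects `template`, `system`, or `context` alongside it, and no `context` is returned, so conversation state is the caller's responsibility. A minimal request sketch; the model name and prompt markers are placeholders chosen only for illustration:

```go
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// In raw mode the caller supplies the fully formatted prompt,
	// including whatever chat markers the model expects.
	payload := `{"model": "llama2", "raw": true, "prompt": "### User:\nWhy is the sky blue?\n### Response:\n"}`

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewBufferString(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		fmt.Println(scanner.Text())
	}
}
```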

View File

@@ -2,218 +2,369 @@ package server
import (
"context"
"crypto/md5"
"errors"
"fmt"
"hash"
"io"
"log"
"net/http"
"net/url"
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
"golang.org/x/sync/errgroup"
)
var blobUploadManager sync.Map
type blobUpload struct {
*Layer
Total int64
Completed atomic.Int64
Parts []blobUploadPart
nextURL chan *url.URL
context.CancelFunc
done bool
err error
references atomic.Int32
}
const (
redirectChunkSize int64 = 1024 * 1024 * 1024
regularChunkSize int64 = 95 * 1024 * 1024
numUploadParts = 64
minUploadPartSize int64 = 95 * 1000 * 1000
maxUploadPartSize int64 = 1000 * 1000 * 1000
)
func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, int64, error) {
requestURL := mp.BaseURL()
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/")
if layer.From != "" {
func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *RegistryOptions) error {
p, err := GetBlobsPath(b.Digest)
if err != nil {
return err
}
if b.From != "" {
values := requestURL.Query()
values.Add("mount", layer.Digest)
values.Add("from", layer.From)
values.Add("mount", b.Digest)
values.Add("from", b.From)
requestURL.RawQuery = values.Encode()
}
resp, err := makeRequestWithRetry(ctx, "POST", requestURL, nil, nil, regOpts)
resp, err := makeRequestWithRetry(ctx, http.MethodPost, requestURL, nil, nil, opts)
if err != nil {
log.Printf("couldn't start upload: %v", err)
return nil, 0, err
return err
}
defer resp.Body.Close()
location := resp.Header.Get("Docker-Upload-Location")
chunkSize := redirectChunkSize
if location == "" {
location = resp.Header.Get("Location")
chunkSize = regularChunkSize
}
locationURL, err := url.Parse(location)
fi, err := os.Stat(p)
if err != nil {
return nil, 0, err
return err
}
return locationURL, chunkSize, nil
b.Total = fi.Size()
var size = b.Total / numUploadParts
switch {
case size < minUploadPartSize:
size = minUploadPartSize
case size > maxUploadPartSize:
size = maxUploadPartSize
}
var offset int64
for offset < fi.Size() {
if offset+size > fi.Size() {
size = fi.Size() - offset
}
// set part.N to the current number of parts
b.Parts = append(b.Parts, blobUploadPart{blobUpload: b, N: len(b.Parts), Offset: offset, Size: size})
offset += size
}
log.Printf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size))
requestURL, err = url.Parse(location)
if err != nil {
return err
}
b.nextURL = make(chan *url.URL, 1)
b.nextURL <- requestURL
return nil
}
func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSize int64, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
// TODO allow resumability
// TODO allow canceling uploads via DELETE
// Run uploads blob parts to the upstream. If the upstream supports redirection, parts will be uploaded
// in parallel as defined by Prepare. Otherwise, parts will be uploaded serially. Run sets b.err on error.
func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
defer blobUploadManager.Delete(b.Digest)
ctx, b.CancelFunc = context.WithCancel(ctx)
fp, err := GetBlobsPath(layer.Digest)
p, err := GetBlobsPath(b.Digest)
if err != nil {
return err
b.err = err
return
}
f, err := os.Open(fp)
f, err := os.Open(p)
if err != nil {
return err
b.err = err
return
}
defer f.Close()
pw := ProgressWriter{
status: fmt.Sprintf("uploading %s", layer.Digest),
digest: layer.Digest,
total: layer.Size,
fn: fn,
}
g, inner := errgroup.WithContext(ctx)
g.SetLimit(numUploadParts)
for i := range b.Parts {
part := &b.Parts[i]
select {
case <-inner.Done():
case requestURL := <-b.nextURL:
g.Go(func() error {
var err error
for try := 0; try < maxRetries; try++ {
part.ReadSeeker = io.NewSectionReader(f, part.Offset, part.Size)
err = b.uploadChunk(inner, http.MethodPatch, requestURL, part, opts)
switch {
case errors.Is(err, context.Canceled):
return err
case errors.Is(err, errMaxRetriesExceeded):
return err
case err != nil:
log.Printf("%s part %d attempt %d failed: %v, retrying", b.Digest[7:19], part.N, try, err)
continue
}
for offset := int64(0); offset < layer.Size; {
chunk := layer.Size - offset
if chunk > chunkSize {
chunk = chunkSize
}
return nil
}
resp, err := uploadBlobChunk(ctx, http.MethodPatch, requestURL, f, offset, chunk, regOpts, &pw)
if err != nil {
fn(api.ProgressResponse{
Status: fmt.Sprintf("error uploading chunk: %v", err),
Digest: layer.Digest,
Total: layer.Size,
Completed: offset,
return fmt.Errorf("%w: %w", errMaxRetriesExceeded, err)
})
return err
}
offset += chunk
location := resp.Header.Get("Docker-Upload-Location")
if location == "" {
location = resp.Header.Get("Location")
}
requestURL, err = url.Parse(location)
if err != nil {
return err
}
}
if err := g.Wait(); err != nil {
b.err = err
return
}
requestURL := <-b.nextURL
var sb strings.Builder
for _, part := range b.Parts {
sb.Write(part.Sum(nil))
}
md5sum := md5.Sum([]byte(sb.String()))
values := requestURL.Query()
values.Add("digest", layer.Digest)
values.Add("digest", b.Digest)
values.Add("etag", fmt.Sprintf("%x-%d", md5sum, len(b.Parts)))
requestURL.RawQuery = values.Encode()
headers := make(http.Header)
headers.Set("Content-Type", "application/octet-stream")
headers.Set("Content-Length", "0")
// finish the upload
resp, err := makeRequest(ctx, "PUT", requestURL, headers, nil, regOpts)
resp, err := makeRequestWithRetry(ctx, http.MethodPut, requestURL, headers, nil, opts)
if err != nil {
b.err = err
return
}
defer resp.Body.Close()
b.done = true
}
func (b *blobUpload) uploadChunk(ctx context.Context, method string, requestURL *url.URL, part *blobUploadPart, opts *RegistryOptions) error {
part.Reset()
headers := make(http.Header)
headers.Set("Content-Type", "application/octet-stream")
headers.Set("Content-Length", fmt.Sprintf("%d", part.Size))
headers.Set("X-Redirect-Uploads", "1")
if method == http.MethodPatch {
headers.Set("Content-Range", fmt.Sprintf("%d-%d", part.Offset, part.Offset+part.Size-1))
}
resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(part.ReadSeeker, io.MultiWriter(part, part.Hash)), opts)
if err != nil {
log.Printf("couldn't finish upload: %v", err)
return err
}
defer resp.Body.Close()
if resp.StatusCode >= http.StatusBadRequest {
body, _ := io.ReadAll(resp.Body)
return fmt.Errorf("on finish upload registry responded with code %d: %v", resp.StatusCode, string(body))
}
return nil
}
func uploadBlobChunk(ctx context.Context, method string, requestURL *url.URL, r io.ReaderAt, offset, limit int64, opts *RegistryOptions, pw *ProgressWriter) (*http.Response, error) {
sectionReader := io.NewSectionReader(r, offset, limit)
headers := make(http.Header)
headers.Set("Content-Type", "application/octet-stream")
headers.Set("Content-Length", strconv.Itoa(int(limit)))
headers.Set("X-Redirect-Uploads", "1")
if method == http.MethodPatch {
headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
location := resp.Header.Get("Docker-Upload-Location")
if location == "" {
location = resp.Header.Get("Location")
}
for try := 0; try < maxRetries; try++ {
resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sectionReader, pw), opts)
if err != nil && !errors.Is(err, io.EOF) {
return nil, err
nextURL, err := url.Parse(location)
if err != nil {
return err
}
switch {
case resp.StatusCode == http.StatusTemporaryRedirect:
b.nextURL <- nextURL
redirectURL, err := resp.Location()
if err != nil {
return err
}
defer resp.Body.Close()
switch {
case resp.StatusCode == http.StatusTemporaryRedirect:
location, err := resp.Location()
if err != nil {
return nil, err
}
pw.completed = offset
if _, err := uploadBlobChunk(ctx, http.MethodPut, location, r, offset, limit, nil, pw); err != nil {
// retry
log.Printf("retrying redirected upload: %v", err)
for try := 0; try < maxRetries; try++ {
err = b.uploadChunk(ctx, http.MethodPut, redirectURL, part, nil)
switch {
case errors.Is(err, context.Canceled):
return err
case errors.Is(err, errMaxRetriesExceeded):
return err
case err != nil:
log.Printf("%s part %d attempt %d failed: %v, retrying", b.Digest[7:19], part.N, try, err)
continue
}
return resp, nil
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir)
if err != nil {
return nil, err
}
opts.Token = token
pw.completed = offset
sectionReader = io.NewSectionReader(r, offset, limit)
continue
case resp.StatusCode >= http.StatusBadRequest:
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
return nil
}
return resp, nil
return fmt.Errorf("%w: %w", errMaxRetriesExceeded, err)
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir)
if err != nil {
return err
}
opts.Token = token
fallthrough
case resp.StatusCode >= http.StatusBadRequest:
body, err := io.ReadAll(resp.Body)
if err != nil {
return err
}
return fmt.Errorf("http status %d %s: %s", resp.StatusCode, resp.Status, body)
}
return nil, fmt.Errorf("max retries exceeded")
if method == http.MethodPatch {
b.nextURL <- nextURL
}
return nil
}
type ProgressWriter struct {
status string
digest string
bucket int64
completed int64
total int64
fn func(api.ProgressResponse)
mu sync.Mutex
func (b *blobUpload) acquire() {
b.references.Add(1)
}
func (pw *ProgressWriter) Write(b []byte) (int, error) {
pw.mu.Lock()
defer pw.mu.Unlock()
func (b *blobUpload) release() {
if b.references.Add(-1) == 0 {
b.CancelFunc()
}
}
n := len(b)
pw.bucket += int64(n)
func (b *blobUpload) Wait(ctx context.Context, fn func(api.ProgressResponse)) error {
b.acquire()
defer b.release()
// throttle status updates to not spam the client
if pw.bucket >= 1024*1024 || pw.completed+pw.bucket >= pw.total {
pw.completed += pw.bucket
pw.fn(api.ProgressResponse{
Status: pw.status,
Digest: pw.digest,
Total: pw.total,
Completed: pw.completed,
ticker := time.NewTicker(60 * time.Millisecond)
for {
select {
case <-ticker.C:
case <-ctx.Done():
return ctx.Err()
}
fn(api.ProgressResponse{
Status: fmt.Sprintf("uploading %s", b.Digest),
Digest: b.Digest,
Total: b.Total,
Completed: b.Completed.Load(),
})
pw.bucket = 0
if b.done || b.err != nil {
return b.err
}
}
}
type blobUploadPart struct {
// N is the part number
N int
Offset int64
Size int64
hash.Hash
written int64
io.ReadSeeker
*blobUpload
}
func (p *blobUploadPart) Write(b []byte) (n int, err error) {
n = len(b)
p.written += int64(n)
p.Completed.Add(int64(n))
return n, nil
}
func (p *blobUploadPart) Reset() {
p.Seek(0, io.SeekStart)
p.Completed.Add(-int64(p.written))
p.written = 0
p.Hash = md5.New()
}
func uploadBlob(ctx context.Context, mp ModelPath, layer *Layer, opts *RegistryOptions, fn func(api.ProgressResponse)) error {
requestURL := mp.BaseURL()
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs", layer.Digest)
resp, err := makeRequestWithRetry(ctx, http.MethodHead, requestURL, nil, nil, opts)
switch {
case errors.Is(err, os.ErrNotExist):
case err != nil:
return err
default:
defer resp.Body.Close()
fn(api.ProgressResponse{
Status: fmt.Sprintf("uploading %s", layer.Digest),
Digest: layer.Digest,
Total: layer.Size,
Completed: layer.Size,
})
return nil
}
data, ok := blobUploadManager.LoadOrStore(layer.Digest, &blobUpload{Layer: layer})
upload := data.(*blobUpload)
if !ok {
requestURL := mp.BaseURL()
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/")
if err := upload.Prepare(ctx, requestURL, opts); err != nil {
blobUploadManager.Delete(layer.Digest)
return err
}
go upload.Run(context.Background(), opts)
}
return upload.Wait(ctx, fn)
}
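`Prepare` splits the blob into at most `numUploadParts` pieces, clamping each part between `minUploadPartSize` (95 MB) and `maxUploadPartSize` (1000 MB); the final `PUT` then sends an S3-style etag built from the md5 of the concatenated per-part md5 sums. A sketch of just the part-sizing arithmetic using the same constants:

```go
package main

import "fmt"

const (
	numUploadParts          = 64
	minUploadPartSize int64 = 95 * 1000 * 1000
	maxUploadPartSize int64 = 1000 * 1000 * 1000
)

// partSizes returns the length of each upload part for a blob of the given size.
func partSizes(total int64) []int64 {
	size := total / numUploadParts
	switch {
	case size < minUploadPartSize:
		size = minUploadPartSize
	case size > maxUploadPartSize:
		size = maxUploadPartSize
	}

	var parts []int64
	for offset := int64(0); offset < total; offset += size {
		n := size
		if offset+n > total {
			n = total - offset
		}
		parts = append(parts, n)
	}
	return parts
}

func main() {
	// A ~4 GB blob becomes 43 parts: 42 full 95 MB parts plus a 10 MB remainder.
	parts := partSizes(4_000_000_000)
	fmt.Println(len(parts), parts[0], parts[len(parts)-1]) // 43 95000000 10000000
}
```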