Compare commits

..

267 Commits

Author SHA1 Message Date
Jeffrey Morgan
5534f2cc6a llm: consider head_dim in llama arch (#5817) 2024-07-20 21:48:12 -04:00
Daniel Hiltgen
d321297d8a Merge pull request #5815 from dhiltgen/win_rocm_gfx_features
Adjust windows ROCm discovery
2024-07-20 16:02:55 -07:00
Daniel Hiltgen
06e5d74e34 Merge pull request #5506 from dhiltgen/sched_tests
Refine scheduler unit tests for reliability
2024-07-20 15:48:39 -07:00
Daniel Hiltgen
5d707e6fd5 Merge pull request #5583 from dhiltgen/integration_improvements
Fix context exhaustion integration test for small gpus
2024-07-20 15:48:21 -07:00
Daniel Hiltgen
283948c83b Adjust windows ROCm discovery
The v5 hip library returns unsupported GPUs which wont enumerate at
inference time in the runner so this makes sure we align discovery.  The
gfx906 cards are no longer supported so we shouldn't compile with that
GPU type as it wont enumerate at runtime.
2024-07-20 15:17:50 -07:00
Jeffrey Morgan
1475eab95f add patch for tekken (#5807) 2024-07-20 13:41:21 -04:00
Jeffrey Morgan
20090f3172 preserve last assistant message (#5802) 2024-07-19 20:19:26 -07:00
Jeffrey Morgan
69a2d4ccff Fix generate test flakyness (#5804) 2024-07-19 19:11:25 -07:00
Josh
e8b954c646 server: validate template (#5734)
add template validation to modelfile
2024-07-19 15:24:29 -07:00
royjhan
c57317cbf0 OpenAI: Function Based Testing (#5752)
* distinguish error forwarding

* more coverage

* rm comment
2024-07-19 11:37:12 -07:00
royjhan
51b2fd299c adjust openai chat msg processing (#5729) 2024-07-19 11:19:20 -07:00
Michael Yang
d0634b1596 Merge pull request #5780 from ollama/mxyng/tools
fix parsing tool calls: break on unexpected eofs
2024-07-18 12:14:10 -07:00
Michael Yang
43606d6d6a fix parsing tool calls 2024-07-18 12:08:11 -07:00
Jeffrey Morgan
70b1010fa5 server: check for empty tools array too (#5779) 2024-07-18 11:44:57 -07:00
Jeffrey Morgan
84e5721f3a always provide content even if empty (#5778) 2024-07-18 11:28:19 -07:00
Jeffrey Morgan
319fb1ce03 server: only parse tool calls if tools are provided (#5771)
* server: only parse tool calls if tools are provided

* still set `resp.Message.Content`
2024-07-18 08:50:23 -07:00
Michael Yang
b255445557 marshal json automatically for some template values (#5758) 2024-07-17 15:35:11 -07:00
Michael Yang
b23424bb3c Merge pull request #5753 from ollama/mxyng/parse-tool-call
parse tool call as individual objects
2024-07-17 11:47:53 -07:00
Michael Yang
5fd6988126 parse tool call as individual objects 2024-07-17 11:19:04 -07:00
Michael Yang
5b82960df8 stub response (#5750) 2024-07-17 10:39:22 -07:00
Michael Yang
cc9a252d8c Merge pull request #5732 from ollama/mxyng/cleanup
remove ToolCall from GenerateResponse
2024-07-17 10:26:54 -07:00
Pákozdi György
d281a6e603 add sidellama link (#5702) 2024-07-17 10:24:44 -07:00
royjhan
154f6f45d4 OpenAI: Support Tools (#5614)
* reopen pr

* tools

* remove tc from stream for now

* ID and Function

* openai expects arguments to be a string (#5739)

* mutually exclusive content and tool calls

* clean up

---------

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-07-16 20:52:59 -07:00
royjhan
0d41623b52 OpenAI: Add Suffix to v1/completions (#5611)
* add suffix

* remove todo

* remove TODO

* add to test

* rm outdated prompt tokens info md

* fix test

* fix test
2024-07-16 20:50:14 -07:00
Michael Yang
c279f96371 remove ToolCall from GenerateResponse 2024-07-16 15:22:49 -07:00
Michael Yang
499e87c9ba Merge pull request #5730 from ollama/mxyng/cleanup
remove unneeded tool calls
2024-07-16 14:42:13 -07:00
Michael Yang
cd0853f2d5 Merge pull request #5207 from ollama/mxyng/suffix
add insert support to generate endpoint
2024-07-16 14:37:32 -07:00
Michael Yang
d290e87513 add suffix support to generate endpoint
this change is triggered by the presence of "suffix", particularly
useful for code completion tasks
2024-07-16 14:31:35 -07:00
Thorsten Sommer
97c20ede33 README: Added AI Studio to the list of UIs (#5721)
* Added AI Studio to the list of UIs
2024-07-16 14:24:27 -07:00
Michael Yang
5a83f79afd remove unneeded tool calls 2024-07-16 13:48:45 -07:00
royjhan
987dbab0b0 OpenAI: /v1/embeddings compatibility (#5285)
* OpenAI v1 models

* Empty List Testing

* Add back envconfig

* v1/models docs

* Remove Docs

* OpenAI batch embed compatibility

* merge conflicts

* integrate with api/embed

* ep

* merge conflicts

* request tests

* rm resp test

* merge conflict

* merge conflict

* test fixes

* test fn renaming

* input validation for empty string

---------

Co-authored-by: jmorganca <jmorganca@gmail.com>
2024-07-16 13:36:08 -07:00
Michael Yang
a8388beb94 Merge pull request #5726 from ollama/mxyng/tools-templates
fix unmarshal type errors
2024-07-16 12:12:10 -07:00
Michael Yang
5afbb60fc4 fix unmarshal type errors 2024-07-16 11:39:34 -07:00
Jeffrey Morgan
4cb5d7decc server: omit model system prompt if empty (#5717) 2024-07-16 11:09:00 -07:00
Michael Yang
8eac50dd4f Merge pull request #5684 from ollama/mxyng/tests
add chat and generate tests with mock runner
2024-07-16 09:44:45 -07:00
Michael Yang
4a565cbf94 add chat and generate tests with mock runner 2024-07-16 09:39:31 -07:00
Michael Yang
64039df6d7 Merge pull request #5284 from ollama/mxyng/tools
tools
2024-07-15 18:03:37 -07:00
Jeffrey Morgan
7ac6d462ec server: return empty slice on empty /api/embed request (#5713)
* server: return empty slice on empty `/api/embed` request

* fix tests
2024-07-15 17:39:44 -07:00
Michael Yang
ef5136a745 tools test 2024-07-15 17:18:21 -07:00
Daniel Hiltgen
8288ec8824 Merge pull request #5710 from dhiltgen/rocm_bump
Bump linux ROCm to 6.1.2
2024-07-15 15:32:18 -07:00
Michael Yang
d02bbebb11 tools 2024-07-15 15:26:16 -07:00
Daniel Hiltgen
224337b32f Bump linux ROCm to 6.1.2 2024-07-15 15:10:22 -07:00
Jeffrey Morgan
9e35d9bbee server: lowercase roles for compatibility with clients (#5695) 2024-07-15 13:55:57 -07:00
royjhan
b9f5e16c80 Introduce /api/embed endpoint supporting batch embedding (#5127)
* Initial Batch Embedding

* Revert "Initial Batch Embedding"

This reverts commit c22d54895a.

* Initial Draft

* mock up notes

* api/embed draft

* add server function

* check normalization

* clean up

* normalization

* playing around with truncate stuff

* Truncation

* Truncation

* move normalization to go

* Integration Test Template

* Truncation Integration Tests

* Clean up

* use float32

* move normalize

* move normalize test

* refactoring

* integration float32

* input handling and handler testing

* Refactoring of legacy and new

* clear comments

* merge conflicts

* touches

* embedding type 64

* merge conflicts

* fix hanging on single string

* refactoring

* test values

* set context length

* clean up

* testing clean up

* testing clean up

* remove function closure

* Revert "remove function closure"

This reverts commit 55d48c6ed1.

* remove function closure

* remove redundant error check

* clean up

* more clean up

* clean up
2024-07-15 12:14:24 -07:00
royjhan
e9f7f36029 Support image input for OpenAI chat compatibility (#5208)
* OpenAI v1 models

* Refactor Writers

* Add Test

Co-Authored-By: Attila Kerekes

* Credit Co-Author

Co-Authored-By: Attila Kerekes <439392+keriati@users.noreply.github.com>

* Empty List Testing

* Use Namespace for Ownedby

* Update Test

* Add back envconfig

* v1/models docs

* Use ModelName Parser

* Test Names

* Remove Docs

* Clean Up

* Test name

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Add Middleware for Chat and List

* Testing Cleanup

* Test with Fatal

* Add functionality to chat test

* Support image input for OpenAI chat

* Decoding

* Fix message processing logic

* openai vision test

* type errors

* clean up

* redundant check

* merge conflicts

* merge conflicts

* merge conflicts

* flattening and smaller image

* add test

* support python and js SDKs and mandate prefixing

* clean up

---------

Co-authored-by: Attila Kerekes <439392+keriati@users.noreply.github.com>
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-07-13 22:07:45 -07:00
Patrick Devine
057d31861e remove template (#5655) 2024-07-13 20:56:24 -07:00
jmorganca
f7ee012300 server: prepend system message in chat handler 2024-07-13 15:08:00 -07:00
Jeffrey Morgan
1ed0aa8fea server: fix context, load_duration and total_duration fields (#5676)
* server: fix `contet`, `load_duration` and `total_duration` fields

* Update server/routes.go
2024-07-13 09:25:31 -07:00
Jeffrey Morgan
ef98803d63 llm: looser checks for minimum memory (#5677) 2024-07-13 09:20:05 -07:00
Jarek
02fea420e5 Add Kerlig AI, an app for macOS (#5675) 2024-07-13 08:33:46 -07:00
Michael Yang
22c5451fc2 fix system prompt (#5662)
* fix system prompt

* execute template when hitting previous roles

* fix tests

---------

Co-authored-by: jmorganca <jmorganca@gmail.com>
2024-07-12 21:04:44 -07:00
Patrick Devine
23ebbaa46e Revert "remove template from tests"
This reverts commit 9ac0a7a50b.
2024-07-12 15:47:17 -07:00
Patrick Devine
9ac0a7a50b remove template from tests 2024-07-12 15:41:31 -07:00
Michael Yang
e5c65a85df Merge pull request #5653 from ollama/mxyng/collect-system
template: preprocess message and collect system
2024-07-12 12:32:34 -07:00
Jeffrey Morgan
33627331a3 app: also clean up tempdir runners on install (#5646) 2024-07-12 12:29:23 -07:00
Michael Yang
36c87c433b template: preprocess message and collect system 2024-07-12 12:26:43 -07:00
Jeffrey Morgan
179737feb7 Clean up old files when installing on Windows (#5645)
* app: always clean up install dir; force close applications

* remove wildcard

* revert `CloseApplications`

* whitespace

* update `LOCALAPPDATA` var
2024-07-11 22:53:46 -07:00
Michael Yang
47353f5ee4 Merge pull request #5639 from ollama/mxyng/unaggregated-system 2024-07-11 17:48:50 -07:00
Josh
10e768826c fix: quant err message (#5616) 2024-07-11 17:24:29 -07:00
Michael Yang
5056bb9c01 rename aggregate to contents 2024-07-11 17:00:26 -07:00
Jeffrey Morgan
c4cf8ad559 llm: avoid loading model if system memory is too small (#5637)
* llm: avoid loading model if system memory is too small

* update log

* Instrument swap free space

On linux and windows, expose how much swap space is available
so we can take that into consideration when scheduling models

* use `systemSwapFreeMemory` in check

---------

Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
2024-07-11 16:42:57 -07:00
Michael Yang
57ec6901eb revert embedded templates to use prompt/response
This reverts commit 19753c18c0.

for compat. messages will be added at a later date
2024-07-11 14:49:35 -07:00
Michael Yang
e64f9ebb44 do no automatically aggregate system messages 2024-07-11 14:49:35 -07:00
Jeffrey Morgan
791650ddef sched: only error when over-allocating system memory (#5626) 2024-07-11 00:53:12 -07:00
Jeffrey Morgan
efbf41ed81 llm: dont link cuda with compat libs (#5621) 2024-07-10 20:01:52 -07:00
Michael Yang
cf15589851 Merge pull request #5620 from ollama/mxyng/templates
update embedded templates
2024-07-10 17:16:24 -07:00
Michael Yang
19753c18c0 update embedded templates 2024-07-10 17:03:08 -07:00
Michael Yang
41be28096a add system prompt to first legacy template 2024-07-10 17:03:08 -07:00
Michael Yang
37a570f962 Merge pull request #5612 from ollama/mxyng/mem
chatglm graph
2024-07-10 14:18:33 -07:00
Michael Yang
5a739ff4cb chatglm graph 2024-07-10 13:43:47 -07:00
Jeffrey Morgan
4e262eb2a8 remove GGML_CUDA_FORCE_MMQ=on from build (#5588) 2024-07-10 13:17:13 -07:00
Daniel Hiltgen
4cfcbc328f Merge pull request #5124 from dhiltgen/amd_windows
Wire up windows AMD driver reporting
2024-07-10 12:50:23 -07:00
Daniel Hiltgen
79292ff3e0 Merge pull request #5555 from dhiltgen/msvc_deps
Bundle missing CRT libraries
2024-07-10 12:50:02 -07:00
Daniel Hiltgen
8ea500441d Merge pull request #5580 from dhiltgen/cuda_overhead
Detect CUDA OS overhead
2024-07-10 12:47:31 -07:00
Daniel Hiltgen
b50c818623 Merge pull request #5607 from dhiltgen/win_rocm_v6
Bump ROCm on windows to 6.1.2
2024-07-10 12:47:10 -07:00
Daniel Hiltgen
b99e750b62 Merge pull request #5605 from dhiltgen/merge_glitch
Remove duplicate merge glitch
2024-07-10 11:47:08 -07:00
Daniel Hiltgen
1f50356e8e Bump ROCm on windows to 6.1.2
This also adjusts our algorithm to favor our bundled ROCm.
I've confirmed VRAM reporting still doesn't work properly so we
can't yet enable concurrency by default.
2024-07-10 11:01:22 -07:00
Daniel Hiltgen
22c81f62ec Remove duplicate merge glitch 2024-07-10 09:01:33 -07:00
Daniel Hiltgen
73e2c8f68f Fix context exhaustion integration test for small gpus
On the smaller GPUs, the initial model load of llama2 took over 30s (the
default timeout for the DoGenerate helper)
2024-07-09 16:24:14 -07:00
Daniel Hiltgen
f4408219e9 Refine scheduler unit tests for reliability
This breaks up some of the test scenarios to create a
more reliable set of tests, as well as adding a little more
coverage.
2024-07-09 16:00:08 -07:00
Daniel Hiltgen
2d1e3c3229 Merge pull request #5503 from dhiltgen/dual_rocm
Workaround broken ROCm p2p copy
2024-07-09 15:44:16 -07:00
royjhan
4918fae535 OpenAI v1/completions: allow stop token list (#5551)
* stop token parsing fix

* add stop test
2024-07-09 14:01:26 -07:00
royjhan
0aff67877e separate request tests (#5578) 2024-07-09 13:48:31 -07:00
Daniel Hiltgen
f6f759fc5f Detect CUDA OS Overhead
This adds logic to detect skew between the driver and
management library which can be attributed to OS overhead
and records that so we can adjust subsequent management
library free VRAM updates and avoid OOM scenarios.
2024-07-09 12:21:50 -07:00
Daniel Hiltgen
9544a57ee4 Merge pull request #5579 from dhiltgen/win_static_deps
Statically link c++ and thread lib on windows
2024-07-09 12:21:13 -07:00
Daniel Hiltgen
b51e3b63ac Statically link c++ and thread lib
This makes sure we statically link the c++ and thread library on windows
to avoid unnecessary runtime dependencies on non-standard DLLs
2024-07-09 11:34:30 -07:00
Michael Yang
6bbbc50f10 Merge pull request #5440 from ollama/mxyng/messages-templates
update named templates
2024-07-09 09:36:32 -07:00
Michael Yang
9bbddc37a7 Merge pull request #5126 from ollama/mxyng/messages
update message processing
2024-07-09 09:20:44 -07:00
Jeffrey Morgan
e4ff73297d server: fix model reloads when setting OLLAMA_NUM_PARALLEL (#5560)
* server: fix unneeded model reloads when setting `OLLAMA_NUM_PARALLEL`

* remove whitespace change

* undo some changes
2024-07-08 22:32:15 -07:00
Daniel Hiltgen
b44320db13 Bundle missing CRT libraries
Some users are experienging runner startup errors due
to not having these msvc redist libraries on their host
2024-07-08 18:24:21 -07:00
Daniel Hiltgen
0bacb30007 Workaround broken ROCm p2p copy
Enable the build flag for llama.cpp to use CPU copy for multi-GPU scenarios.
2024-07-08 09:40:52 -07:00
Jeffrey Morgan
53da2c6965 llm: remove ambiguous comment when putting upper limit on predictions to avoid infinite generation (#5535) 2024-07-07 14:32:05 -04:00
Jeffrey Morgan
d8def1ff94 llm: allow gemma 2 to context shift (#5534) 2024-07-07 13:41:51 -04:00
Jeffrey Morgan
571dc61955 Update llama.cpp submodule to a8db2a9c (#5530) 2024-07-07 13:03:09 -04:00
Jeffrey Morgan
0e09c380fc llm: print caching notices in debug only (#5533) 2024-07-07 12:38:04 -04:00
Jeffrey Morgan
0ee87615c7 sched: don't error if paging to disk on Windows and macOS (#5523) 2024-07-06 22:01:52 -04:00
Jeffrey Morgan
f8241bfba3 gpu: report system free memory instead of 0 (#5521) 2024-07-06 19:35:04 -04:00
Jeffrey Morgan
4607c70641 llm: add -DBUILD_SHARED_LIBS=off to common cpu cmake flags (#5520) 2024-07-06 18:58:16 -04:00
jmorganca
c12f1c5b99 release: move mingw library cleanup to correct job 2024-07-06 16:12:29 -04:00
jmorganca
a08f20d910 release: remove unwanted mingw dll.a files 2024-07-06 15:21:15 -04:00
jmorganca
6cea036027 Revert "llm: only statically link libstdc++"
This reverts commit 5796bfc401.
2024-07-06 15:10:48 -04:00
jmorganca
5796bfc401 llm: only statically link libstdc++ 2024-07-06 14:06:20 -04:00
jmorganca
f1a379aa56 llm: statically link pthread and stdc++ dependencies in windows build 2024-07-06 12:54:02 -04:00
jmorganca
9ae146993e llm: add GGML_STATIC flag to windows static lib 2024-07-06 03:27:05 -04:00
Jeffrey Morgan
e0348d3fe8 llm: add COMMON_DARWIN_DEFS to arm static build (#5513) 2024-07-05 22:42:42 -04:00
Jeffrey Morgan
2cc854f8cb llm: fix missing dylibs by restoring old build behavior on Linux and macOS (#5511)
* Revert "fix cmake build (#5505)"

This reverts commit 4fd5f3526a.

* llm: fix missing dylibs by restoring old build behavior

* crlf -> lf
2024-07-05 21:48:31 -04:00
Jeffrey Morgan
5304b765b2 llm: put back old include dir (#5507)
* llm: put back old include dir

* llm: update link paths for old submodule commits
2024-07-05 19:34:21 -04:00
Michael Yang
fb6cbc02fb update named templates 2024-07-05 16:29:32 -07:00
Jeffrey Morgan
4fd5f3526a fix cmake build (#5505) 2024-07-05 19:07:01 -04:00
Daniel Hiltgen
842f85f758 Merge pull request #5502 from dhiltgen/ci_fixes
Always go build in CI generate steps
2024-07-05 15:39:11 -07:00
Daniel Hiltgen
9d30f9f8b3 Always go build in CI generate steps
With the recent cgo changes, bugs can sneak through
if we don't make sure to `go build` all the permutations
2024-07-05 15:31:52 -07:00
Blake Mizerany
631cfd9e62 types/model: remove knowledge of digest (#5500)
This was leading to ambiguity and confusion in ollama.com, and is not
used anywhere in ollama at the moment. Once manifests are addressable by
digest, we can add this back in, and in a way that is more tailored to
the concept of addressing a manifest by digest.
2024-07-05 13:42:30 -07:00
Michael Yang
326363b3a7 no funcs 2024-07-05 13:17:25 -07:00
Michael Yang
ac7a842e55 fix model reloading
ensure runtime model changes (template, system prompt, messages,
options) are captured on model updates without needing to reload the
server
2024-07-05 13:17:25 -07:00
Michael Yang
2c3fe1fd97 comments 2024-07-05 13:17:24 -07:00
Michael Yang
269ed6e6a2 update message processing 2024-07-05 13:16:58 -07:00
Jeffrey Morgan
78fb33dd07 fix typo in cgo directives in llm.go (#5501) 2024-07-05 15:18:36 -04:00
Jeffrey Morgan
8f8e736b13 update llama.cpp submodule to d7fd29f (#5475) 2024-07-05 13:25:58 -04:00
Jeffrey Morgan
d89454de80 Use slot with cached prompt instead of least recently used (#5492)
* Use common prefix to select slot

* actually report `longest`
2024-07-05 12:32:47 -04:00
Daniel Hiltgen
af28b94533 Merge pull request #5469 from dhiltgen/prevent_system_oom
Prevent loading models larger than total memory
2024-07-05 08:22:20 -07:00
Jeffrey Morgan
e9188e971a Fix assert on small embedding inputs (#5491)
* Fix assert on small embedding inputs

* Update llm/patches/09-pooling.diff
2024-07-05 11:20:57 -04:00
Daniel Hiltgen
78eddfc068 Merge pull request #4412 from dhiltgen/win_docs
Document older win10 terminal problems
2024-07-05 08:18:22 -07:00
Daniel Hiltgen
02c24d3d01 Merge pull request #5466 from dhiltgen/fix_clip_unicode
Fix clip model loading with unicode paths
2024-07-05 08:16:58 -07:00
Daniel Hiltgen
52abc8acb7 Document older win10 terminal problems
We haven't found a workaround, so for now recommend updating.
2024-07-03 17:32:14 -07:00
Jeffrey Morgan
4d71c559b2 fix error detection by limiting model loading error parsing (#5472) 2024-07-03 20:04:30 -04:00
Anatoli Babenia
0d16eb310e fix: use envconfig.ModelsDir directly (#4821)
* Co-authored-by: Anatoli Babenia <anatoli@rainforce.org>

Co-authored-by: Maas Lalani <maas@lalani.dev>
2024-07-03 15:36:11 -07:00
Daniel Hiltgen
8072e205ff Merge pull request #5447 from dhiltgen/fix_keepalive
Only set default keep_alive on initial model load
2024-07-03 15:34:38 -07:00
Daniel Hiltgen
955f2a4e03 Only set default keep_alive on initial model load
This change fixes the handling of keep_alive so that if client
request omits the setting, we only set this on initial load.  Once
the model is loaded, if new requests leave this unset, we'll keep
whatever keep_alive was there.
2024-07-03 15:29:56 -07:00
Daniel Hiltgen
3c75113e37 Prevent loading models larger than total memory
Users may not realize the siny new model they're trying to load
fits on their disk, but can't load into system+GPU memory.  Today
we crash, but with this fix, we'll give them a better error message
before even trying to load it.
2024-07-03 14:47:42 -07:00
Daniel Hiltgen
ccd7785859 Merge pull request #5243 from dhiltgen/modelfile_use_mmap
Fix use_mmap for modefiles
2024-07-03 13:59:42 -07:00
royjhan
3b5a4a77f3 Return Correct Prompt Eval Count Regardless of Cache Prompt (#5371)
* openai compatibility

* Revert "openai compatibility"

This reverts commit d3f98a811e.

* remove erroneous subtraction of prompt cache
2024-07-03 13:46:23 -07:00
Daniel Hiltgen
daed0634a9 Merge pull request #5467 from dhiltgen/bogus_cpu_mac_error
Fix corner cases on tmp cleaner on mac
2024-07-03 13:39:36 -07:00
Daniel Hiltgen
0d4dd707bc Merge pull request #5465 from dhiltgen/better_cuda_logging
Better nvidia GPU discovery logging
2024-07-03 13:12:22 -07:00
Daniel Hiltgen
0e982bc1f4 Fix corner cases on tmp cleaner on mac
When ollama is running a long time, tmp cleaners can remove the
runners.  This tightens up a few corner cases on arm macs where
we failed with "server cpu not listed in available servers map[]"
2024-07-03 13:10:14 -07:00
Daniel Hiltgen
6298f49816 Fix clip model loading with unicode paths
On windows, if the model dir contained unicode characters
clip models would fail to load.  This fixes the file name
handling in clip.cpp to support utf16 on windows.
2024-07-03 12:46:36 -07:00
Daniel Hiltgen
ef757da2c9 Better nvidia GPU discovery logging
Refine the way we log GPU discovery to improve the non-debug
output, and report more actionable log messages when possible
to help users troubleshoot on their own.
2024-07-03 10:50:40 -07:00
Michael Yang
e5352297d9 Merge pull request #5448 from ollama/mxyng/fix-generate
use model template by default
2024-07-02 16:48:06 -07:00
Michael Yang
65a5040e09 fix generate template 2024-07-02 16:42:17 -07:00
royjhan
d626b99b54 OpenAI: v1/completions compatibility (#5209)
* OpenAI v1 models

* Refactor Writers

* Add Test

Co-Authored-By: Attila Kerekes

* Credit Co-Author

Co-Authored-By: Attila Kerekes <439392+keriati@users.noreply.github.com>

* Empty List Testing

* Use Namespace for Ownedby

* Update Test

* Add back envconfig

* v1/models docs

* Use ModelName Parser

* Test Names

* Remove Docs

* Clean Up

* Test name

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Add Middleware for Chat and List

* Completions Endpoint

* Testing Cleanup

* Test with Fatal

* Add functionality to chat test

* Rename function

* float types

* type cleanup

* cleaning

* more cleaning

* Extra test cases

* merge conflicts

* merge conflicts

* merge conflicts

* merge conflicts

* cleaning

* cleaning

---------

Co-authored-by: Attila Kerekes <439392+keriati@users.noreply.github.com>
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-07-02 16:01:45 -07:00
Michael Yang
dddb58a38b Merge pull request #5051 from ollama/mxyng/capabilities
add model capabilities
2024-07-02 14:26:07 -07:00
Michael Yang
400056e154 Merge pull request #5420 from ollama/mxyng/insecure-path
err on insecure path
2024-07-02 14:03:23 -07:00
Daniel Hiltgen
d2f19024d0 Merge pull request #5442 from dhiltgen/concurrency_docs
Add windows radeon concurrency note
2024-07-02 12:47:47 -07:00
Daniel Hiltgen
69c04eecc4 Add windows radeon concurreny note 2024-07-02 12:46:14 -07:00
royjhan
996bb1b85e OpenAI: /v1/models and /v1/models/{model} compatibility (#5007)
* OpenAI v1 models

* Refactor Writers

* Add Test

Co-Authored-By: Attila Kerekes

* Credit Co-Author

Co-Authored-By: Attila Kerekes <439392+keriati@users.noreply.github.com>

* Empty List Testing

* Use Namespace for Ownedby

* Update Test

* Add back envconfig

* v1/models docs

* Use ModelName Parser

* Test Names

* Remove Docs

* Clean Up

* Test name

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Add Middleware for Chat and List

* Testing Cleanup

* Test with Fatal

* Add functionality to chat test

* OpenAI: /v1/models/{model} compatibility (#5028)

* Retrieve Model

* OpenAI Delete Model

* Retrieve Middleware

* Remove Delete from Branch

* Update Test

* Middleware Test File

* Function name

* Cleanup

* Test Update

* Test Update

---------

Co-authored-by: Attila Kerekes <439392+keriati@users.noreply.github.com>
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-07-02 11:50:56 -07:00
Daniel Hiltgen
422dcc3856 Merge pull request #5439 from dhiltgen/fix_centos_7_build
Switch ARM64 container image base to rocky 8
2024-07-02 11:01:15 -07:00
Daniel Hiltgen
020bd60ab2 Switch amd container image base to rocky 8
The centos 7 arm mirrors have disappeared due to the EOL 2 days
ago, and the vault sed workaround which works for x86 doesn't work for arm.
2024-07-02 10:34:47 -07:00
Daniel Hiltgen
8e277b72bb Merge pull request #5438 from dhiltgen/fix_centos_7_build
Centos 7 EOL broke mirrors
2024-07-02 09:28:00 -07:00
Daniel Hiltgen
4f67b39d26 Centos 7 EOL broke mirrors
As of July 1st 2024: Could not resolve host: mirrorlist.centos.org
This is expected due to EOL dates.
2024-07-02 09:22:17 -07:00
Josh
2425281317 Merge pull request #5336 from ollama/jyan/from-errors
fix: trim spaces for FROM argument, don't trim inside of quotes
2024-07-01 16:32:46 -07:00
Josh
0403e9860e Merge pull request #5421 from ollama/jyan/ver
fix: add unsupported architecture message for linux/windows
2024-07-01 16:32:14 -07:00
Josh Yan
33a65e3ba3 error 2024-07-01 16:04:13 -07:00
Michael Yang
88bcd79bb9 err on insecure path 2024-07-01 15:55:59 -07:00
Josh Yan
7e571f95f0 trimspace test case 2024-07-01 11:07:48 -07:00
Michael Yang
da8e2a0447 use kvs to detect embedding models 2024-07-01 10:47:43 -07:00
Michael Yang
a30915bde1 add capabilities 2024-07-01 10:47:43 -07:00
Michael Yang
58e3fff311 rename templates to template 2024-07-01 10:40:54 -07:00
Michael Yang
3f0b309ad4 remove ManifestV2 2024-07-01 10:40:54 -07:00
Daniel Hiltgen
e70610ef06 Merge pull request #5410 from dhiltgen/ctx_cleanup
Fix case for NumCtx
2024-07-01 09:54:20 -07:00
Daniel Hiltgen
dfded7e075 Merge pull request #5364 from dhiltgen/concurrency_docs
Document concurrent behavior and settings
2024-07-01 09:49:48 -07:00
Daniel Hiltgen
173b550438 Remove default auto from help message
This may confuse users thinking "auto" is an acceptable string - it must be numeric
2024-07-01 09:48:05 -07:00
Daniel Hiltgen
cff3f44f4a Fix case for NumCtx 2024-07-01 09:43:59 -07:00
Josh Yan
26e4e66faf updated parsefile test 2024-07-01 09:43:49 -07:00
Daniel Hiltgen
97c9e11768 Switch use_mmap to a pointer type
This uses nil as undefined for a cleaner implementation.
2024-07-01 08:44:59 -07:00
Daniel Hiltgen
3518aaef33 Merge pull request #4218 from dhiltgen/auto_parallel
Enable concurrency by default
2024-07-01 08:32:29 -07:00
RAPID ARCHITECT
1963c00201 Update README.md (#5214)
* Update README.md

Added Mesop example to web & desktop

* Update README.md

---------

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-06-30 22:00:57 -04:00
Eduard
27402cb7a2 Update gpu.md (#5382)
Runs fine on a NVIDIA GeForce GTX 1050 Ti
2024-06-30 21:48:51 -04:00
Jeffrey Morgan
c1218199cf Update api.md 2024-06-29 16:22:49 -07:00
Jeffrey Morgan
717f7229eb Do not shift context for sliding window models (#5368)
* Do not shift context for sliding window models

* truncate prompt > 2/3 tokens

* only target gemma2
2024-06-28 19:39:31 -07:00
Daniel Hiltgen
aae56abb7c Document concurrent behavior and settings 2024-06-28 13:15:57 -07:00
royjhan
5f034f5b63 Include Show Info in Interactive (#5342) 2024-06-28 13:15:52 -07:00
royjhan
b910fa9010 Ollama Show: Check for Projector Type (#5307)
* Check exists projtype

* Maintain Ordering
2024-06-28 11:30:16 -07:00
royjhan
6d4219083c Update docs (#5312) 2024-06-28 09:58:14 -07:00
Michael Yang
1ed4f521c4 Merge pull request #5340 from ollama/mxyng/mem
gemma2 graph
2024-06-27 14:26:49 -07:00
Michael Yang
de2163dafd gemma2 graph 2024-06-27 13:34:52 -07:00
Josh Yan
9bd00041fa trim all params 2024-06-27 11:18:38 -07:00
Josh Yan
4e986a823c unquote, trimp space 2024-06-27 10:59:15 -07:00
Michael
2cc7d05012 update readme for gemma 2 (#5333)
* update readme for gemma 2
2024-06-27 12:45:16 -04:00
Michael Yang
123a722a6f zip: prevent extracting files into parent dirs (#5314) 2024-06-26 21:38:21 -07:00
Jeffrey Morgan
4d311eb731 llm: architecture patch (#5316) 2024-06-26 21:38:12 -07:00
Blake Mizerany
cb42e607c5 llm: speed up gguf decoding by a lot (#5246)
Previously, some costly things were causing the loading of GGUF files
and their metadata and tensor information to be VERY slow:

  * Too many allocations when decoding strings
  * Hitting disk for each read of each key and value, resulting in a
    not-okay amount of syscalls/disk I/O.

The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro
m3.

This commit also prevents collecting large arrays of values when
decoding GGUFs (if desired). When such keys are encountered, their
values are null, and are encoded as such in JSON.

Also, this fixes a broken test that was not encoding valid GGUF.
2024-06-24 21:47:52 -07:00
Blake Mizerany
2aa91a937b cmd: defer stating model info until necessary (#5248)
This commit changes the 'ollama run' command to defer fetching model
information until it really needs it. That is, when in interactive mode.

It also removes one such case where the model information is fetch in
duplicate, just before calling generateInteractive and then again, first
thing, in generateInteractive.

This positively impacts the performance of the command:

    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.168 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.220 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.217 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 4% cpu 0.652 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 5% cpu 0.498 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 3% cpu 0.479 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
2024-06-24 20:14:03 -07:00
Daniel Hiltgen
ccef9431c8 Merge pull request #5205 from dhiltgen/modelfile_use_mmap
Fix use_mmap parsing for modelfiles
2024-06-21 16:30:36 -07:00
Daniel Hiltgen
642cee1342 Sort the ps output
Provide consistent ordering for the ps command - longest duration listed first
2024-06-21 15:59:41 -07:00
royjhan
9a9e7d83c4 Docs (#5149) 2024-06-21 15:52:09 -07:00
Daniel Hiltgen
9929751cc8 Disable concurrency for AMD + Windows
Until ROCm v6.2 ships, we wont be able to get accurate free memory
reporting on windows, which makes automatic concurrency too risky.
Users can still opt-in but will need to pay attention to model sizes otherwise they may thrash/page VRAM or cause OOM crashes.
All other platforms and GPUs have accurate VRAM reporting wired
up now, so we can turn on concurrency by default.
2024-06-21 15:45:05 -07:00
Daniel Hiltgen
17b7186cd7 Enable concurrency by default
This adjusts our default settings to enable multiple models and parallel
requests to a single model.  Users can still override these by the same
env var settings as before.  Parallel has a direct impact on
num_ctx, which in turn can have a significant impact on small VRAM GPUs
so this change also refines the algorithm so that when parallel is not
explicitly set by the user, we try to find a reasonable default that fits
the model on their GPU(s).  As before, multiple models will only load
concurrently if they fully fit in VRAM.
2024-06-21 15:45:05 -07:00
Michael Yang
189a43caa2 Merge pull request #5206 from ollama/mxyng/quantize
fix: quantization with template
2024-06-21 13:44:34 -07:00
Michael Yang
e835ef1836 fix: quantization with template 2024-06-21 13:39:25 -07:00
Daniel Hiltgen
7e7749224c Fix use_mmap parsing for modelfiles
Add the new tristate parsing logic for the code path for modelfiles,
as well as a unit test.
2024-06-21 12:27:19 -07:00
Daniel Hiltgen
c7c2f3bc22 Merge pull request #5194 from dhiltgen/linux_mmap_auto
Refine mmap default logic on linux
2024-06-20 11:44:08 -07:00
Daniel Hiltgen
54a79d6a8a Merge pull request #5125 from dhiltgen/fedora39
Bump latest fedora cuda repo to 39
2024-06-20 11:27:24 -07:00
Daniel Hiltgen
5bf5aeec01 Refine mmap default logic on linux
If we try to use mmap when the model is larger than the system free space, loading is slower than the no-mmap approach.
2024-06-20 11:07:04 -07:00
Michael Yang
e01e535cbb Merge pull request #5192 from ollama/mxyng/kv
handle asymmetric embedding KVs
2024-06-20 10:46:24 -07:00
Josh
0195d6a2f8 Merge pull request #5188 from ollama/jyan/tmpdir2
fix: skip os.removeAll() if PID does not exist
2024-06-20 10:40:59 -07:00
Michael Yang
8e0641a9bf handle asymmetric embedding KVs 2024-06-20 09:57:27 -07:00
Josh Yan
662568d453 err!=nil check 2024-06-20 09:30:59 -07:00
Josh Yan
4ebb66c662 reformat error check 2024-06-20 09:23:43 -07:00
Josh Yan
23e899f32d skip os.removeAll() if PID does not exist 2024-06-20 08:51:35 -07:00
royjhan
fedf71635e Extend api/show and ollama show to return more model info (#4881)
* API Show Extended

* Initial Draft of Information

Co-Authored-By: Patrick Devine <pdevine@sonic.net>

* Clean Up

* Descriptive arg error messages and other fixes

* Second Draft of Show with Projectors Included

* Remove Chat Template

* Touches

* Prevent wrapping from files

* Verbose functionality

* Docs

* Address Feedback

* Lint

* Resolve Conflicts

* Function Name

* Tests for api/show model info

* Show Test File

* Add Projector Test

* Clean routes

* Projector Check

* Move Show Test

* Touches

* Doc update

---------

Co-authored-by: Patrick Devine <pdevine@sonic.net>
2024-06-19 14:19:02 -07:00
Daniel Hiltgen
97c59be653 Merge pull request #5074 from dhiltgen/app_log_rotation
Implement log rotation for tray app
2024-06-19 13:02:24 -07:00
Daniel Hiltgen
9d8a4988e8 Implement log rotation for tray app 2024-06-19 12:53:34 -07:00
Michael Yang
1ae0750a21 Merge pull request #5147 from ollama/mxyng/cleanup
remove confusing log message
2024-06-19 12:50:31 -07:00
Michael Yang
9d91e5e587 remove confusing log message 2024-06-19 11:14:11 -07:00
Daniel Hiltgen
96624aa412 Merge pull request #5072 from dhiltgen/windows_path
Move libraries out of users path
2024-06-19 09:13:39 -07:00
Daniel Hiltgen
10f33b8537 Merge pull request #5146 from dhiltgen/backout
Put back temporary intel GPU env var
2024-06-19 09:12:45 -07:00
Daniel Hiltgen
4a633cc295 Merge pull request #5145 from dhiltgen/bad_loads
Fix bad symbol load detection
2024-06-19 09:12:33 -07:00
Daniel Hiltgen
d34d88e417 Revert "Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)""
This reverts commit 755b4e4fc2.
2024-06-19 08:57:41 -07:00
Daniel Hiltgen
52ce350b7a Fix bad symbol load detection
pointer deref's weren't correct on a few libraries, which explains
some crashes on older systems or miswired symlinks for discovery libraries.
2024-06-19 08:39:07 -07:00
Daniel Hiltgen
2abebb2cbe Merge pull request #5128 from zhewang1-intc/fix_levelzero_empty_symbol_detect
Fix levelzero empty symbol detect
2024-06-19 08:33:16 -07:00
Blake Mizerany
380e06e5be types/model: remove Digest
The Digest type in its current form is awkward to work with and presents
challenges with regard to how it serializes via String using the '-'
prefix.

We currently only use this in ollama.com, so we'll move our specific
needs around digest parsing and validation there.
2024-06-18 20:28:11 -07:00
Wang,Zhe
badf975e45 get real func ptr. 2024-06-19 09:00:51 +08:00
Wang,Zhe
755b4e4fc2 Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)"
This reverts commit 163cd3e77c.
2024-06-19 08:59:58 +08:00
Daniel Hiltgen
1a1c99e334 Bump latest fedora cuda repo to 39 2024-06-18 17:13:54 -07:00
Michael Yang
21adf8b6d2 Merge pull request #5121 from ollama/mxyng/deepseekv2
deepseek v2 graph
2024-06-18 16:30:58 -07:00
Daniel Hiltgen
784bf88b0d Wire up windows AMD driver reporting
This seems to be ROCm version, not actually driver version, but
it may be useful for toggling logic for VRAM reporting in the future
2024-06-18 16:22:47 -07:00
Michael Yang
e873841cbb deepseek v2 graph 2024-06-18 15:35:12 -07:00
Daniel Hiltgen
26d0bf9236 Merge pull request #5117 from dhiltgen/fix_prediction
Handle models with divergent layer sizes
2024-06-18 11:36:51 -07:00
Daniel Hiltgen
359b15a597 Handle models with divergent layer sizes
The recent refactoring of the memory prediction assumed all layers
are the same size, but for some models (like deepseek-coder-v2) this
is not the case, so our predictions were significantly off.
2024-06-18 11:05:34 -07:00
Daniel Hiltgen
b55958a587 Merge pull request #5106 from dhiltgen/clean_logs
Tighten up memory prediction logging
2024-06-18 09:24:38 -07:00
Daniel Hiltgen
7784ca33ce Tighten up memory prediction logging
Prior to this change, we logged the memory prediction multiple times
as the scheduler iterates to find a suitable configuration, which can be
confusing since only the last log before the server starts is actually valid.
This now logs once just before starting the server on the final configuration.
It also reports what library instead of always saying "offloading to gpu" when
using CPU.
2024-06-18 09:15:35 -07:00
Daniel Hiltgen
c9c8c98bf6 Merge pull request #5105 from dhiltgen/cuda_mmap
Adjust mmap logic for cuda windows for faster model load
2024-06-17 17:07:30 -07:00
Daniel Hiltgen
171796791f Adjust mmap logic for cuda windows for faster model load
On Windows, recent llama.cpp changes make mmap slower in most
cases, so default to off.  This also implements a tri-state for
use_mmap so we can detect the difference between a user provided
value of true/false, or unspecified.
2024-06-17 16:54:30 -07:00
Jeffrey Morgan
176d0f7075 Update import.md 2024-06-17 19:44:14 -04:00
Daniel Hiltgen
8ed51cac37 Merge pull request #5103 from dhiltgen/faster_win_build
Revert powershell jobs, but keep nvcc and cmake parallelism
2024-06-17 14:23:18 -07:00
Daniel Hiltgen
c9e6f0542d Merge pull request #5069 from dhiltgen/ci_release
Implement custom github release action
2024-06-17 13:59:37 -07:00
Daniel Hiltgen
b0930626c5 Add back lower level parallel flags
nvcc supports parallelism (threads) and cmake + make can use -j,
while msbuild requires /p:CL_MPcount=8
2024-06-17 13:44:46 -07:00
Daniel Hiltgen
e890be4814 Revert "More parallelism on windows generate"
This reverts commit 0577af98f4.
2024-06-17 13:32:46 -07:00
Daniel Hiltgen
b2799f111b Move libraries out of users path
We update the PATH on windows to get the CLI mapped, but this has
an unintended side effect of causing other apps that may use our bundled
DLLs to get terminated when we upgrade.
2024-06-17 13:12:18 -07:00
Jeffrey Morgan
152fc202f5 llm: update llama.cpp commit to 7c26775 (#4896)
* llm: update llama.cpp submodule to `7c26775`

* disable `LLAMA_BLAS` for now

* `-DLLAMA_OPENMP=off`
2024-06-17 15:56:16 -04:00
Lei Jitang
4ad0d4d6d3 Fix a build warning (#5096)
Signed-off-by: Lei Jitang <leijitang@outlook.com>
2024-06-17 14:47:48 -04:00
Jeffrey Morgan
163cd3e77c gpu: add env var for detecting Intel oneapi gpus (#5076)
* gpu: add env var for detecting intel oneapi gpus

* fix build error
2024-06-16 20:09:05 -04:00
Daniel Hiltgen
4c2c8f93dd Merge pull request #5080 from dhiltgen/debug_intel_crash
Add some more debugging logs for intel discovery
2024-06-16 14:42:41 -07:00
Daniel Hiltgen
fd1e6e0590 Add some more debugging logs for intel discovery
Also removes an unused overall count variable
2024-06-16 07:42:52 -07:00
royjhan
89c79bec8c Add ModifiedAt Field to /api/show (#5033)
* Add Mod Time to Show

* Error Handling
2024-06-15 20:53:56 -07:00
Jeffrey Morgan
c7b77004e3 docs: add missing powershell package to windows development instructions (#5075)
* docs: add missing instruction for powershell build

The powershell script for building Ollama on Windows now requires the `ThreadJob` module. Add this to the instructions and dependency list.

* Update development.md
2024-06-15 23:08:09 -04:00
Daniel Hiltgen
07d143f412 Merge pull request #5058 from coolljt0725/fix_build_warning
gpu: Fix build warning
2024-06-15 11:52:36 -07:00
Daniel Hiltgen
a12283e2ff Implement custom github release action
This implements the release logic we want via gh cli
to support updating releases with rc tags in place and retain
release notes and other community reactions.
2024-06-15 11:36:56 -07:00
Daniel Hiltgen
4b0050cf0e Merge pull request #5037 from dhiltgen/faster_win_build
More parallelism on windows generate
2024-06-15 08:03:05 -07:00
Daniel Hiltgen
0577af98f4 More parallelism on windows generate
Make the build faster
2024-06-15 07:44:55 -07:00
Daniel Hiltgen
17ce203a26 Merge pull request #4875 from dhiltgen/rocm_gfx900_workaround
Rocm gfx900 workaround
2024-06-15 07:38:58 -07:00
Daniel Hiltgen
d76555ffb5 Merge pull request #4874 from dhiltgen/rocm_v6_bump
Rocm v6 bump
2024-06-15 07:38:32 -07:00
Daniel Hiltgen
2786dff5d3 Merge pull request #4264 from dhiltgen/show_gpu_visible_settings
Centralize GPU configuration vars
2024-06-15 07:33:52 -07:00
Lei Jitang
225f0d1219 gpu: Fix build warning
Signed-off-by: Lei Jitang <leijitang@outlook.com>
2024-06-15 14:26:23 +08:00
Daniel Hiltgen
532db58311 Merge pull request #4972 from jayson-cloude/main
fix: "Skip searching for network devices"
2024-06-14 17:04:40 -07:00
Daniel Hiltgen
6be309e1bd Centralize GPU configuration vars
This should aid in troubleshooting by capturing and reporting the GPU
settings at startup in the logs along with all the other server settings.
2024-06-14 15:59:10 -07:00
Daniel Hiltgen
da3bf23354 Workaround gfx900 SDMA bugs
Implement support for GPU env var workarounds, and leverage
this for the Vega RX 56 which needs
HSA_ENABLE_SDMA=0 set to work properly
2024-06-14 15:38:13 -07:00
Daniel Hiltgen
26ab67732b Bump ROCm linux to 6.1.1 2024-06-14 15:37:54 -07:00
Daniel Hiltgen
45cacbaf05 Merge pull request #4517 from dhiltgen/gpu_incremental
Enhanced GPU discovery and multi-gpu support with concurrency
2024-06-14 15:35:00 -07:00
Daniel Hiltgen
17df6520c8 Remove mmap related output calc logic 2024-06-14 14:55:50 -07:00
Daniel Hiltgen
6f351bf586 review comments and coverage 2024-06-14 14:55:50 -07:00
Daniel Hiltgen
ff4f0cbd1d Prevent multiple concurrent loads on the same gpus
While models are loading, the VRAM metrics are dynamic, so try
to load on a GPU that doesn't have a model actively loading, or wait
to avoid races that lead to OOMs
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
fc37c192ae Refine CPU load behavior with system memory visibility 2024-06-14 14:51:40 -07:00
Daniel Hiltgen
434dfe30c5 Reintroduce nvidia nvml library for windows
This library will give us the most reliable free VRAM reporting on windows
to enable concurrent model scheduling.
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
4e2b7e181d Refactor intel gpu discovery 2024-06-14 14:51:40 -07:00
Daniel Hiltgen
48702dd149 Harden unload for empty runners 2024-06-14 14:51:40 -07:00
Daniel Hiltgen
68dfc6236a refined test timing
adjust timing on some tests so they don't timeout on small/slow GPUs
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
5e8ff556cb Support forced spreading for multi GPU
Our default behavior today is to try to fit into a single GPU if possible.
Some users would prefer the old behavior of always spreading across
multiple GPUs even if the model can fit into one.  This exposes that
tunable behavior.
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
6fd04ca922 Improve multi-gpu handling at the limit
Still not complete, needs some refinement to our prediction to understand the
discrete GPUs available space so we can see how many layers fit in each one
since we can't split one layer across multiple GPUs we can't treat free space
as one logical block
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
206797bda4 Fix concurrency integration test to work locally
This worked remotely but wound up trying to spawn multiple servers
locally which doesn't work
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
43ed358f9a Refine GPU discovery to bootstrap once
Now that we call the GPU discovery routines many times to
update memory, this splits initial discovery from free memory
updating.
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
b32ebb4f29 Use DRM driver for VRAM info for amd
The amdgpu drivers free VRAM reporting omits some other apps, so leverage the
upstream DRM driver which keeps better tabs on things
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
fb9cdfa723 Fix server.cpp for the new cuda build macros 2024-06-14 14:51:40 -07:00
Daniel Hiltgen
efac488675 Revert "Limit GPU lib search for now (#4777)"
This reverts commit 476fb8e892.
2024-06-14 14:51:40 -07:00
Jeffrey Morgan
6b800aa7b7 openai: do not set temperature to 0 when setting seed (#5045) 2024-06-14 13:43:56 -07:00
Jeffrey Morgan
dd7c9ebeaf server: longer timeout in TestRequests (#5046) 2024-06-14 09:48:25 -07:00
Patrick Devine
4dc7fb9525 update 40xx gpu compat matrix (#5036) 2024-06-13 17:10:33 -07:00
jayson-cloude
157f09acdf fix: "Skip searching for network devices"
On an Ubuntu 24.04 computer with vmware installed, the sudo lshw command will get stuck. "Network interfaces" is always displayed
2024-06-11 16:11:35 +08:00
201 changed files with 8976 additions and 2727 deletions

View File

@@ -147,7 +147,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "downloading AMD HIP Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP"
@@ -437,6 +437,7 @@ jobs:
env:
OLLAMA_SKIP_IMAGE_BUILD: '1'
PUSH: '1'
GH_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v4
- name: Set Version
@@ -460,15 +461,20 @@ jobs:
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
cat dist/sha256sum.txt
- uses: ncipollo/release-action@v1
with:
name: ${{ env.RELEASE_VERSION }}
allowUpdates: true
artifacts: 'dist/*'
draft: true
prerelease: true
omitBodyDuringUpdate: true
generateReleaseNotes: true
omitDraftDuringUpdate: true
omitPrereleaseDuringUpdate: true
replacesArtifacts: true
- name: Create or update Release
run: |
echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
if [ -n "$OLD_TAG" ]; then
echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
else
echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
gh release create ${GITHUB_REF_NAME} \
--title ${{ env.RELEASE_VERSION }} \
--draft \
--generate-notes \
--prerelease
fi
echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
gh release upload ${GITHUB_REF_NAME} dist/* --clobber

View File

@@ -58,6 +58,7 @@ jobs:
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
@@ -79,6 +80,7 @@ jobs:
- run: go generate -x ./...
if: ${{ ! startsWith(matrix.os, 'windows-') }}
name: 'Unix Go Generate'
- run: go build .
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
@@ -124,7 +126,7 @@ jobs:
strategy:
matrix:
rocm-version:
- '6.0.2'
- '6.1.2'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
@@ -167,7 +169,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "downloading AMD HIP Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP"

View File

@@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
ARG ROCM_VERSION=6.0.2
ARG ROCM_VERSION=6.1.2
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
@@ -70,12 +70,12 @@ RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS

View File

@@ -53,8 +53,8 @@ Here are some example models that can be downloaded:
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
@@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
### Show model information
```
ollama show llama3
```
### List models on your computer
```
@@ -286,6 +292,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
### Terminal

View File

@@ -347,7 +347,16 @@ func (c *Client) Heartbeat(ctx context.Context) error {
return nil
}
// Embeddings generates embeddings from a model.
// Embed generates embeddings from a model.
func (c *Client) Embed(ctx context.Context, req *EmbedRequest) (*EmbedResponse, error) {
var resp EmbedResponse
if err := c.do(ctx, http.MethodPost, "/api/embed", req, &resp); err != nil {
return nil, err
}
return &resp, nil
}
// Embeddings generates an embedding from a model.
func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
var resp EmbeddingResponse
if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {

View File

@@ -47,6 +47,9 @@ type GenerateRequest struct {
// Prompt is the textual prompt to send to the model.
Prompt string `json:"prompt"`
// Suffix is the text that comes after the inserted text.
Suffix string `json:"suffix"`
// System overrides the model's default system message/prompt.
System string `json:"system"`
@@ -97,17 +100,80 @@ type ChatRequest struct {
// followin the request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Tools is an optional list of tools the model has access to.
Tools `json:"tools,omitempty"`
// Options lists model-specific options.
Options map[string]interface{} `json:"options"`
}
type Tools []Tool
func (t Tools) String() string {
bts, _ := json.Marshal(t)
return string(bts)
}
// Message is a single message in a chat sequence. The message contains the
// role ("system", "user", or "assistant"), the content and an optional list
// of images.
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
Images []ImageData `json:"images,omitempty"`
Role string `json:"role"`
Content string `json:"content"`
Images []ImageData `json:"images,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
}
func (m *Message) UnmarshalJSON(b []byte) error {
type Alias Message
var a Alias
if err := json.Unmarshal(b, &a); err != nil {
return err
}
*m = Message(a)
m.Role = strings.ToLower(m.Role)
return nil
}
type ToolCall struct {
Function ToolCallFunction `json:"function"`
}
type ToolCallFunction struct {
Name string `json:"name"`
Arguments ToolCallFunctionArguments `json:"arguments"`
}
type ToolCallFunctionArguments map[string]any
func (t *ToolCallFunctionArguments) String() string {
bts, _ := json.Marshal(t)
return string(bts)
}
type Tool struct {
Type string `json:"type"`
Function ToolFunction `json:"function"`
}
type ToolFunction struct {
Name string `json:"name"`
Description string `json:"description"`
Parameters struct {
Type string `json:"type"`
Required []string `json:"required"`
Properties map[string]struct {
Type string `json:"type"`
Description string `json:"description"`
Enum []string `json:"enum,omitempty"`
} `json:"properties"`
} `json:"parameters"`
}
func (t *ToolFunction) String() string {
bts, _ := json.Marshal(t)
return string(bts)
}
// ChatResponse is the response returned by [Client.Chat]. Its fields are
@@ -159,18 +225,42 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap *bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
}
// EmbedRequest is the request passed to [Client.Embed].
type EmbedRequest struct {
// Model is the model name.
Model string `json:"model"`
// Input is the input to embed.
Input any `json:"input"`
// KeepAlive controls how long the model will stay loaded in memory following
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
Truncate *bool `json:"truncate,omitempty"`
// Options lists model-specific options.
Options map[string]interface{} `json:"options"`
}
// EmbedResponse is the response from [Client.Embed].
type EmbedResponse struct {
Model string `json:"model"`
Embeddings [][]float32 `json:"embeddings"`
}
// EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -219,9 +309,12 @@ type DeleteRequest struct {
// ShowRequest is the request passed to [Client.Show].
type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
Model string `json:"model"`
System string `json:"system"`
// Template is deprecated
Template string `json:"template"`
Verbose bool `json:"verbose"`
Options map[string]interface{} `json:"options"`
@@ -231,13 +324,16 @@ type ShowRequest struct {
// ShowResponse is the response returned from [Client.Show].
type ShowResponse struct {
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
ModelInfo map[string]any `json:"model_info,omitempty"`
ProjectorInfo map[string]any `json:"projector_info,omitempty"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].
@@ -310,6 +406,13 @@ type ProcessModelResponse struct {
SizeVRAM int64 `json:"size_vram"`
}
type RetrieveModelResponse struct {
Id string `json:"id"`
Object string `json:"object"`
Created int64 `json:"created"`
OwnedBy string `json:"owned_by"`
}
type TokenResponse struct {
Token string `json:"token"`
}
@@ -448,6 +551,17 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
slice[i] = str
}
field.Set(reflect.ValueOf(slice))
case reflect.Pointer:
var b bool
if field.Type() == reflect.TypeOf(&b) {
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
field.Set(reflect.ValueOf(&val))
} else {
return fmt.Errorf("unknown type loading config params: %v %v", field.Kind(), field.Type())
}
default:
return fmt.Errorf("unknown type loading config params: %v", field.Kind())
}
@@ -490,7 +604,7 @@ func DefaultOptions() Options {
LowVRAM: false,
F16KV: true,
UseMLock: false,
UseMMap: true,
UseMMap: nil,
UseNUMA: false,
},
}
@@ -587,6 +701,17 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
case reflect.Slice:
// TODO: only string slices are supported right now
out[key] = vals
case reflect.Pointer:
var b bool
if field.Type() == reflect.TypeOf(&b) {
boolVal, err := strconv.ParseBool(vals[0])
if err != nil {
return nil, fmt.Errorf("invalid bool value %s", vals)
}
out[key] = &boolVal
} else {
return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
}
default:
return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
}

View File

@@ -2,6 +2,7 @@ package api
import (
"encoding/json"
"fmt"
"math"
"testing"
"time"
@@ -105,3 +106,128 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
})
}
}
func TestUseMmapParsingFromJSON(t *testing.T) {
tr := true
fa := false
tests := []struct {
name string
req string
exp *bool
}{
{
name: "Undefined",
req: `{ }`,
exp: nil,
},
{
name: "True",
req: `{ "use_mmap": true }`,
exp: &tr,
},
{
name: "False",
req: `{ "use_mmap": false }`,
exp: &fa,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var oMap map[string]interface{}
err := json.Unmarshal([]byte(test.req), &oMap)
require.NoError(t, err)
opts := DefaultOptions()
err = opts.FromMap(oMap)
require.NoError(t, err)
assert.Equal(t, test.exp, opts.UseMMap)
})
}
}
func TestUseMmapFormatParams(t *testing.T) {
tr := true
fa := false
tests := []struct {
name string
req map[string][]string
exp *bool
err error
}{
{
name: "True",
req: map[string][]string{
"use_mmap": {"true"},
},
exp: &tr,
err: nil,
},
{
name: "False",
req: map[string][]string{
"use_mmap": {"false"},
},
exp: &fa,
err: nil,
},
{
name: "Numeric True",
req: map[string][]string{
"use_mmap": {"1"},
},
exp: &tr,
err: nil,
},
{
name: "Numeric False",
req: map[string][]string{
"use_mmap": {"0"},
},
exp: &fa,
err: nil,
},
{
name: "invalid string",
req: map[string][]string{
"use_mmap": {"foo"},
},
exp: nil,
err: fmt.Errorf("invalid bool value [foo]"),
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
resp, err := FormatParams(test.req)
require.Equal(t, test.err, err)
respVal, ok := resp["use_mmap"]
if test.exp != nil {
assert.True(t, ok, "resp: %v", resp)
assert.Equal(t, *test.exp, *respVal.(*bool))
}
})
}
}
func TestMessage_UnmarshalJSON(t *testing.T) {
tests := []struct {
input string
expected string
}{
{`{"role": "USER", "content": "Hello!"}`, "user"},
{`{"role": "System", "content": "Initialization complete."}`, "system"},
{`{"role": "assistant", "content": "How can I help you?"}`, "assistant"},
{`{"role": "TOOl", "content": "Access granted."}`, "tool"},
}
for _, test := range tests {
var msg Message
if err := json.Unmarshal([]byte(test.input), &msg); err != nil {
t.Errorf("Unexpected error: %v", err)
}
if msg.Role != test.expected {
t.Errorf("role not lowercased: got %v, expected %v", msg.Role, test.expected)
}
}
}

View File

@@ -5,6 +5,8 @@ import (
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
)
@@ -24,6 +26,7 @@ func InitLogging() {
logFile = os.Stderr
// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
} else {
rotateLogs(AppLogFile)
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -46,3 +49,32 @@ func InitLogging() {
slog.Info("ollama app started")
}
func rotateLogs(logFile string) {
if _, err := os.Stat(logFile); os.IsNotExist(err) {
return
}
index := strings.LastIndex(logFile, ".")
pre := logFile[:index]
post := "." + logFile[index+1:]
for i := LogRotationCount; i > 0; i-- {
older := pre + "-" + strconv.Itoa(i) + post
newer := pre + "-" + strconv.Itoa(i-1) + post
if i == 1 {
newer = pre + post
}
if _, err := os.Stat(newer); err == nil {
if _, err := os.Stat(older); err == nil {
err := os.Remove(older)
if err != nil {
slog.Warn("Failed to remove older log", "older", older, "error", err)
continue
}
}
err := os.Rename(newer, older)
if err != nil {
slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
}
}
}
}

View File

@@ -0,0 +1,44 @@
package lifecycle
import (
"os"
"path/filepath"
"strconv"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestRotateLogs(t *testing.T) {
logDir := t.TempDir()
logFile := filepath.Join(logDir, "testlog.log")
// No log exists
rotateLogs(logFile)
require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
assert.FileExists(t, logFile)
// First rotation
rotateLogs(logFile)
assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
assert.NoFileExists(t, logFile)
// Should be a no-op without a new log
rotateLogs(logFile)
assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
assert.NoFileExists(t, logFile)
for i := 2; i <= LogRotationCount+1; i++ {
require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
assert.FileExists(t, logFile)
rotateLogs(logFile)
assert.NoFileExists(t, logFile)
for j := 1; j < i; j++ {
assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log"))
}
assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log"))
}
}

View File

@@ -16,11 +16,12 @@ var (
AppDir = "/opt/Ollama"
AppDataDir = "/opt/Ollama"
// TODO - should there be a distinct log dir?
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
LogRotationCount = 5
)
func init() {

View File

@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
}
// TODO - rotation
rotateLogs(ServerLogFile)
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)

View File

@@ -88,10 +88,15 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\cuda")
Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\oneapi")
Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
#endif
@@ -122,6 +127,10 @@ Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\models"
Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\history"
; NOTE: if the user has a custom OLLAMA_MODELS it will be preserved
[InstallDelete]
Type: filesandordirs; Name: "{%TEMP}\ollama*"
Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"
[Messages]
WizardReady=Ollama Windows Preview
ReadyLabel1=%nLet's get you up and running with your own large language models.

View File

@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
}
defer tempfile.Close()
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
detectContentType := func(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
files = append(files, tks...)
}
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
for _, file := range files {
f, err := os.Open(file)
if err != nil {
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
}
func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
name := args[0]
// check if the model exists on the server
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
}
format, err := cmd.Flags().GetString("format")
@@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
// Fill out the rest of the options based on information about the
// model.
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
return generateInteractive(cmd, opts)
name := args[0]
info, err := func() (*api.ShowResponse, error) {
showReq := &api.ShowRequest{Name: name}
info, err := client.Show(cmd.Context(), showReq)
var se api.StatusError
if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
if err := PullHandler(cmd, []string{name}); err != nil {
return nil, err
}
return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
}
return info, err
}()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
opts.ParentModel = info.Details.ParentModel
opts.Messages = append(opts.Messages, info.Messages...)
if interactive {
return generateInteractive(cmd, opts)
}
return generate(cmd, opts)
}
func errFromUnknownKey(unknownKeyErr error) error {
@@ -579,10 +580,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
if len(args) != 1 {
return errors.New("missing model name")
}
license, errLicense := cmd.Flags().GetBool("license")
modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -625,8 +622,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
if flagsSet > 1 {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
} else if flagsSet == 0 {
return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
}
req := api.ShowRequest{Name: args[0]}
@@ -635,22 +630,141 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
if flagsSet == 1 {
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
}
return nil
}
showInfo(resp)
return nil
}
func showInfo(resp *api.ShowResponse) {
arch := resp.ModelInfo["general.architecture"].(string)
modelData := [][]string{
{"arch", arch},
{"parameters", resp.Details.ParameterSize},
{"quantization", resp.Details.QuantizationLevel},
{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
}
mainTableData := [][]string{
{"Model"},
{renderSubTable(modelData, false)},
}
if resp.ProjectorInfo != nil {
projectorData := [][]string{
{"arch", "clip"},
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
}
if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
}
projectorData = append(projectorData,
[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
)
mainTableData = append(mainTableData,
[]string{"Projector"},
[]string{renderSubTable(projectorData, false)},
)
}
if resp.Parameters != "" {
mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
}
if resp.System != "" {
mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
}
if resp.License != "" {
mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
}
table := tablewriter.NewWriter(os.Stdout)
table.SetAutoWrapText(false)
table.SetBorder(false)
table.SetAlignment(tablewriter.ALIGN_LEFT)
for _, v := range mainTableData {
table.Append(v)
}
table.Render()
}
func renderSubTable(data [][]string, file bool) string {
var buf bytes.Buffer
table := tablewriter.NewWriter(&buf)
table.SetAutoWrapText(!file)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
table.SetTablePadding("\t")
table.SetAlignment(tablewriter.ALIGN_LEFT)
for _, v := range data {
table.Append(v)
}
table.Render()
renderedTable := buf.String()
lines := strings.Split(renderedTable, "\n")
for i, line := range lines {
lines[i] = "\t" + line
}
return strings.Join(lines, "\n")
}
func twoLines(s string) [][]string {
lines := strings.Split(s, "\n")
res := [][]string{}
count := 0
for _, line := range lines {
line = strings.TrimSpace(line)
if line != "" {
count++
res = append(res, []string{line})
if count == 2 {
return res
}
}
}
return res
}
func formatParams(s string) string {
lines := strings.Split(s, "\n")
table := [][]string{}
for _, line := range lines {
table = append(table, strings.Fields(line))
}
return renderSubTable(table, false)
}
func CopyHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -729,7 +843,6 @@ type runOptions struct {
WordWrap bool
Format string
System string
Template string
Images []api.ImageData
Options map[string]interface{}
MultiModal bool
@@ -923,7 +1036,6 @@ func generate(cmd *cobra.Command, opts runOptions) error {
Images: opts.Images,
Format: opts.Format,
System: opts.System,
Template: opts.Template,
Options: opts.Options,
KeepAlive: opts.KeepAlive,
}

View File

@@ -27,69 +27,43 @@ const (
MultilineNone MultilineState = iota
MultilinePrompt
MultilineSystem
MultilineTemplate
)
func loadModel(cmd *cobra.Command, opts *runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
showReq := api.ShowRequest{Name: opts.Model}
showResp, err := client.Show(cmd.Context(), &showReq)
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
}
chatReq := &api.ChatRequest{
Model: opts.Model,
Messages: []api.Message{},
Model: opts.Model,
KeepAlive: opts.KeepAlive,
}
if opts.KeepAlive != nil {
chatReq.KeepAlive = opts.KeepAlive
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
if len(opts.Messages) > 0 {
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
return nil
})
if err != nil {
return err
}
return nil
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = make([]api.Message, 0)
err := loadModel(cmd, &opts)
if err != nil {
return err
@@ -119,7 +93,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
fmt.Fprintln(os.Stderr, " /set history Enable history")
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
@@ -229,10 +202,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
fmt.Println("Set system message.")
sb.Reset()
case MultilineTemplate:
opts.Template = sb.String()
fmt.Println("Set prompt template.")
sb.Reset()
}
multiline = MultilineNone
@@ -351,17 +320,13 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
}
fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", "))
opts.Options[args[2]] = fp[args[2]]
case "system", "template":
case "system":
if len(args) < 3 {
usageSet()
continue
}
if args[1] == "system" {
multiline = MultilineSystem
} else if args[1] == "template" {
multiline = MultilineTemplate
}
multiline = MultilineSystem
line := strings.Join(args[2:], " ")
line, ok := strings.CutPrefix(line, `"""`)
@@ -381,23 +346,17 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
continue
}
if args[1] == "system" {
opts.System = sb.String() // for display in modelfile
newMessage := api.Message{Role: "system", Content: sb.String()}
// Check if the slice is not empty and the last message is from 'system'
if len(opts.Messages) > 0 && opts.Messages[len(opts.Messages)-1].Role == "system" {
// Replace the last message
opts.Messages[len(opts.Messages)-1] = newMessage
} else {
opts.Messages = append(opts.Messages, newMessage)
}
fmt.Println("Set system message.")
sb.Reset()
} else if args[1] == "template" {
opts.Template = sb.String()
fmt.Println("Set prompt template.")
sb.Reset()
opts.System = sb.String() // for display in modelfile
newMessage := api.Message{Role: "system", Content: sb.String()}
// Check if the slice is not empty and the last message is from 'system'
if len(opts.Messages) > 0 && opts.Messages[len(opts.Messages)-1].Role == "system" {
// Replace the last message
opts.Messages[len(opts.Messages)-1] = newMessage
} else {
opts.Messages = append(opts.Messages, newMessage)
}
fmt.Println("Set system message.")
sb.Reset()
sb.Reset()
continue
@@ -418,7 +377,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
req := &api.ShowRequest{
Name: opts.Model,
System: opts.System,
Template: opts.Template,
Options: opts.Options,
}
resp, err := client.Show(cmd.Context(), req)
@@ -429,15 +387,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
switch args[1] {
case "info":
fmt.Println("Model details:")
if len(resp.Details.Families) > 0 {
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
} else if resp.Details.Family != "" {
fmt.Printf("Family %s\n", resp.Details.Family)
}
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
fmt.Println("")
showInfo(resp)
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.")
@@ -470,12 +420,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Println("No system message was specified for this model.")
}
case "template":
switch {
case opts.Template != "":
fmt.Println(opts.Template + "\n")
case resp.Template != "":
if resp.Template != "" {
fmt.Println(resp.Template)
default:
} else {
fmt.Println("No prompt template was specified for this model.")
}
default:
@@ -569,10 +516,6 @@ func buildModelfile(opts runOptions) string {
fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
}
if opts.Template != "" {
fmt.Fprintf(&mf, "TEMPLATE \"\"\"%s\"\"\"\n", opts.Template)
}
keys := make([]string, 0)
for k := range opts.Options {
keys = append(keys, k)

View File

@@ -59,7 +59,6 @@ func TestModelfileBuilder(t *testing.T) {
opts := runOptions{
Model: "hork",
System: "You are part horse and part shark, but all hork. Do horklike things",
Template: "This is a template.",
Messages: []api.Message{
{Role: "user", Content: "Hey there hork!"},
{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
@@ -75,7 +74,6 @@ func TestModelfileBuilder(t *testing.T) {
mf := buildModelfile(opts)
expectedModelfile := `FROM {{.Model}}
SYSTEM """{{.System}}"""
TEMPLATE """{{.Template}}"""
PARAMETER penalize_newline false
PARAMETER seed 42
PARAMETER stop [hi there]
@@ -97,7 +95,6 @@ MESSAGE assistant """Yes it is true, I am half horse, half shark."""
mf = buildModelfile(opts)
expectedModelfile = `FROM {{.ParentModel}}
SYSTEM """{{.System}}"""
TEMPLATE """{{.Template}}"""
PARAMETER penalize_newline false
PARAMETER seed 42
PARAMETER stop [hi there]

View File

@@ -26,7 +26,7 @@ All durations are returned in nanoseconds.
### Streaming responses
Certain endpoints stream responses as JSON objects and can optional return non-streamed responses.
Certain endpoints stream responses as JSON objects. Streaming can be disabled by providing `{"stream": false}` for these endpoints.
## Generate a completion
@@ -777,11 +777,12 @@ A single JSON object will be returned.
POST /api/show
```
Show information about a model including details, modelfile, template, parameters, license, and system prompt.
Show information about a model including details, modelfile, template, parameters, license, system prompt.
### Parameters
- `name`: name of the model to show
- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
### Examples
@@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{
```json
{
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
"parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
"template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
"parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
"template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
"details": {
"parent_model": "",
"format": "gguf",
"family": "llama",
"families": ["llama", "clip"],
"parameter_size": "7B",
"families": [
"llama"
],
"parameter_size": "8.0B",
"quantization_level": "Q4_0"
},
"model_info": {
"general.architecture": "llama",
"general.file_type": 2,
"general.parameter_count": 8030261248,
"general.quantization_version": 2,
"llama.attention.head_count": 32,
"llama.attention.head_count_kv": 8,
"llama.attention.layer_norm_rms_epsilon": 0.00001,
"llama.block_count": 32,
"llama.context_length": 8192,
"llama.embedding_length": 4096,
"llama.feed_forward_length": 14336,
"llama.rope.dimension_count": 128,
"llama.rope.freq_base": 500000,
"llama.vocab_size": 128256,
"tokenizer.ggml.bos_token_id": 128000,
"tokenizer.ggml.eos_token_id": 128009,
"tokenizer.ggml.merges": [], // populates if `verbose=true`
"tokenizer.ggml.model": "gpt2",
"tokenizer.ggml.pre": "llama-bpe",
"tokenizer.ggml.token_type": [], // populates if `verbose=true`
"tokenizer.ggml.tokens": [] // populates if `verbose=true`
}
}
```

View File

@@ -104,7 +104,7 @@ like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
go build .
```
@@ -114,15 +114,18 @@ If you have Docker available, you can build linux binaries with `./scripts/build
### Windows
Note: The windows build for Ollama is still under development.
Note: The Windows build for Ollama is still under development.
Install required tools:
First, install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- Go version 1.22 or higher
- MinGW (pick one variant) with GCC.
- [MinGW-w64](https://www.mingw-w64.org/)
- [MSYS2](https://www.msys2.org/)
- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
Then, build the `ollama` binary:
```powershell
$env:CGO_ENABLED="1"

View File

@@ -257,3 +257,19 @@ If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` AP
## How do I manage the maximum number of requests the Ollama server can queue?
If too many requests are sent to the server, it will respond with a 503 error indicating the server is overloaded. You can adjust how many requests may be queue by setting `OLLAMA_MAX_QUEUE`.
## How does Ollama handle concurrent requests?
Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing.
If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded. As prior models become idle, one or more will be unloaded to make room for the new model. Queued requests will be processed in order. When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.
Parallel request processing for a given model results in increasing the context size by the number of parallel requests. For example, a 2K context with 4 parallel requests will result in an 8K context and additional memory allocation.
The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 * the number of GPUs or 3 for CPU inference.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.

View File

@@ -8,7 +8,7 @@ Check your compute compatibility to see if your card is supported:
| Compute Capability | Family | Cards |
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
| 9.0 | NVIDIA | `H100` |
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080` `RTX 4070 Ti` `RTX 4060 Ti` |
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
@@ -18,7 +18,7 @@ Check your compute compatibility to see if your card is supported:
| | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` |
| 7.0 | NVIDIA | `TITAN V` `V100` `Quadro GV100` |
| 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` |
| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050` |
| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050 Ti` `GTX 1050` |
| | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` |
| | Tesla | `P40` `P4` |
| 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` |
@@ -46,13 +46,24 @@ sudo modprobe nvidia_uvm`
## AMD Radeon
Ollama supports the following AMD GPUs:
### Linux Support
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
### Overrides
### Windows Support
With ROCm v6.1, the following GPUs are supported on Windows.
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
### Overrides on Linux
Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
some cases you can force the system to try to use a similar LLVM target that is
close. For example The Radeon RX 5400 is `gfx1034` (also known as 10.3.4)
@@ -63,7 +74,7 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
server. If you have an unsupported AMD GPU you can experiment using the list of
supported types below.
At this time, the known supported GPU types are the following LLVM Targets.
At this time, the known supported GPU types on linux are the following LLVM Targets.
This table shows some example GPUs that map to these LLVM targets:
| **LLVM Target** | **An Example GPU** |
|-----------------|---------------------|

View File

@@ -47,19 +47,13 @@ success
### Supported Quantizations
<details>
<summary>Legacy Quantization</summary>
- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
</details>
<details>
<summary>K-means Quantization</summary>`
#### K-means Quantizations
- `Q3_K_S`
- `Q3_K_M`
@@ -70,11 +64,6 @@ success
- `Q5_K_M`
- `Q6_K`
</details>
> [!NOTE]
> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf).
## Template Detection
> [!NOTE]

View File

@@ -65,6 +65,7 @@ curl http://localhost:11434/v1/chat/completions \
}
]
}'
```
## Endpoints
@@ -102,12 +103,6 @@ curl http://localhost:11434/v1/chat/completions \
- [ ] `user`
- [ ] `n`
#### Notes
- Setting `seed` will always set `temperature` to `0`
- `finish_reason` will always be `stop`
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
## Models
Before using a model, pull it locally `ollama pull`:

View File

@@ -22,7 +22,7 @@ docker logs <container-name>
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` to view logs
- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
@@ -70,14 +70,18 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/
## Container fails to run on NVIDIA GPU
## NVIDIA GPU Discovery
Make sure you've set up the container runtime first as described in [docker.md](./docker.md)
When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available. Sometimes this discovery can fail to find your GPUs. In general, running the latest driver will yield the best results.
Sometimes the container runtime can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem
### Linux NVIDIA Troubleshooting
- Is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU.
- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
Sometimes the Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem
- If you are using a container, is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU.
- Is the uvm driver loaded? `sudo nvidia-modprobe -u`
- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
- Try rebooting
- Make sure you're running the latest nvidia drivers
@@ -85,3 +89,8 @@ Sometimes the container runtime can have difficulties initializing the GPU. When
If none of those resolve the problem, gather additional information and file an issue:
- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
## Windows Terminal Errors
Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect` To resolve this problem, please update to Win 10 22H1 or newer.

View File

@@ -19,7 +19,7 @@ Logs will often be helpful in diagnosing the problem (see
## System Requirements
* Windows 10 or newer, Home or Pro
* Windows 10 22H2 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
@@ -39,8 +39,8 @@ server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *app.log* contains most resent logs from the GUI application
- *server.log* contains the most recent server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration

View File

@@ -4,12 +4,14 @@ import (
"errors"
"fmt"
"log/slog"
"math"
"net"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
)
type OllamaHost struct {
@@ -34,17 +36,17 @@ var (
// Set via OLLAMA_HOST in the environment
Host *OllamaHost
// Set via OLLAMA_KEEP_ALIVE in the environment
KeepAlive string
KeepAlive time.Duration
// Set via OLLAMA_LLM_LIBRARY in the environment
LLMLibrary string
// Set via OLLAMA_MAX_LOADED_MODELS in the environment
MaxRunners int
// Set via OLLAMA_MAX_QUEUE in the environment
MaxQueuedRequests int
// Set via OLLAMA_MODELS in the environment
ModelsDir string
// Set via OLLAMA_MAX_VRAM in the environment
MaxVRAM uint64
// Set via OLLAMA_MODELS in the environment
ModelsDir string
// Set via OLLAMA_NOHISTORY in the environment
NoHistory bool
// Set via OLLAMA_NOPRUNE in the environment
@@ -53,8 +55,23 @@ var (
NumParallel int
// Set via OLLAMA_RUNNERS_DIR in the environment
RunnersDir string
// Set via OLLAMA_SCHED_SPREAD in the environment
SchedSpread bool
// Set via OLLAMA_TMPDIR in the environment
TmpDir string
// Set via OLLAMA_INTEL_GPU in the environment
IntelGpu bool
// Set via CUDA_VISIBLE_DEVICES in the environment
CudaVisibleDevices string
// Set via HIP_VISIBLE_DEVICES in the environment
HipVisibleDevices string
// Set via ROCR_VISIBLE_DEVICES in the environment
RocrVisibleDevices string
// Set via GPU_DEVICE_ORDINAL in the environment
GpuDeviceOrdinal string
// Set via HSA_OVERRIDE_GFX_VERSION in the environment
HsaOverrideGfxVersion string
)
type EnvVar struct {
@@ -64,23 +81,33 @@ type EnvVar struct {
}
func AsMap() map[string]EnvVar {
return map[string]EnvVar{
ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models (default 1)"},
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
}
if runtime.GOOS != "darwin" {
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
}
return ret
}
func Values() map[string]string {
@@ -104,9 +131,10 @@ func clean(key string) string {
func init() {
// default values
NumParallel = 1
MaxRunners = 1
NumParallel = 0 // Autoselect
MaxRunners = 0 // Autoselect
MaxQueuedRequests = 512
KeepAlive = 5 * time.Minute
LoadConfig()
}
@@ -180,8 +208,8 @@ func LoadConfig() {
if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
val, err := strconv.Atoi(onp)
if err != nil || val <= 0 {
slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
if err != nil {
slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
} else {
NumParallel = val
}
@@ -191,6 +219,15 @@ func LoadConfig() {
NoHistory = true
}
if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
s, err := strconv.ParseBool(spread)
if err == nil {
SchedSpread = s
} else {
SchedSpread = true
}
}
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
NoPrune = true
}
@@ -217,7 +254,7 @@ func LoadConfig() {
if maxRunners != "" {
m, err := strconv.Atoi(maxRunners)
if err != nil {
slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
} else {
MaxRunners = m
}
@@ -226,13 +263,16 @@ func LoadConfig() {
if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
p, err := strconv.Atoi(onp)
if err != nil || p <= 0 {
slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_QUEUE", onp, "error", err)
} else {
MaxQueuedRequests = p
}
}
KeepAlive = clean("OLLAMA_KEEP_ALIVE")
ka := clean("OLLAMA_KEEP_ALIVE")
if ka != "" {
loadKeepAlive(ka)
}
var err error
ModelsDir, err = getModelsDir()
@@ -244,6 +284,16 @@ func LoadConfig() {
if err != nil {
slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
}
if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
IntelGpu = set
}
CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
}
func getModelsDir() (string, error) {
@@ -300,3 +350,24 @@ func getOllamaHost() (*OllamaHost, error) {
Port: port,
}, nil
}
func loadKeepAlive(ka string) {
v, err := strconv.Atoi(ka)
if err != nil {
d, err := time.ParseDuration(ka)
if err == nil {
if d < 0 {
KeepAlive = time.Duration(math.MaxInt64)
} else {
KeepAlive = d
}
}
} else {
d := time.Duration(v) * time.Second
if d < 0 {
KeepAlive = time.Duration(math.MaxInt64)
} else {
KeepAlive = d
}
}
}

View File

@@ -2,8 +2,10 @@ package envconfig
import (
"fmt"
"math"
"net"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -23,6 +25,21 @@ func TestConfig(t *testing.T) {
t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
LoadConfig()
require.True(t, FlashAttention)
t.Setenv("OLLAMA_KEEP_ALIVE", "")
LoadConfig()
require.Equal(t, 5*time.Minute, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "3")
LoadConfig()
require.Equal(t, 3*time.Second, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "1h")
LoadConfig()
require.Equal(t, 1*time.Hour, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "-1s")
LoadConfig()
require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "-1")
LoadConfig()
require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
}
func TestClientFromEnvironment(t *testing.T) {

3
go.mod
View File

@@ -18,6 +18,7 @@ require (
require (
github.com/agnivade/levenshtein v1.1.1
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/google/go-cmp v0.6.0
github.com/mattn/go-runewidth v0.0.14
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
@@ -71,7 +72,7 @@ require (
golang.org/x/net v0.25.0 // indirect
golang.org/x/sys v0.20.0
golang.org/x/term v0.20.0
golang.org/x/text v0.15.0 // indirect
golang.org/x/text v0.15.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)

View File

@@ -49,9 +49,17 @@ func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
}
func commonAMDValidateLibDir() (string, error) {
// We try to favor system paths first, so that we can wire up the subprocess to use
// the system version. Only use our bundled version if the system version doesn't work
// This gives users a more recovery options if versions have subtle problems at runtime
// Favor our bundled version
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
}
}
// Prefer explicit HIP env var
hipPath := os.Getenv("HIP_PATH")
@@ -87,14 +95,5 @@ func commonAMDValidateLibDir() (string, error) {
}
}
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
}
}
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}

View File

@@ -33,9 +33,10 @@ type HipLib struct {
}
func NewHipLib() (*HipLib, error) {
h, err := windows.LoadLibrary("amdhip64.dll")
// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs
h, err := windows.LoadLibrary("amdhip64_6.dll")
if err != nil {
return nil, fmt.Errorf("unable to load amdhip64.dll: %w", err)
return nil, fmt.Errorf("unable to load amdhip64_6.dll, please make sure to upgrade to the latest amd driver: %w", err)
}
hl := &HipLib{}
hl.dll = h
@@ -84,9 +85,8 @@ func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
}
slog.Debug("hipDriverGetVersion", "version", version)
// TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
driverMajor = version / 1000
driverMinor = (version - (driverMajor * 1000)) / 10
driverMajor = version / 10000000
driverMinor = (version - (driverMajor * 10000000)) / 100000
return driverMajor, driverMinor, nil
}

View File

@@ -13,6 +13,7 @@ import (
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -25,7 +26,16 @@ const (
// Prefix with the node dir
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
// Direct Rendering Manager sysfs location
DRMDeviceDirGlob = "/sys/class/drm/card*/device"
DRMTotalMemoryFile = "mem_info_vram_total"
DRMUsedMemoryFile = "mem_info_vram_used"
// In hex; properties file is in decimal
DRMUniqueIDFile = "unique_id"
DRMVendorFile = "vendor"
DRMDeviceFile = "device"
)
var (
@@ -35,8 +45,8 @@ var (
)
// Gather GPU information from the amdgpu driver if any supported GPUs are detected
func AMDGetGPUInfo() []GpuInfo {
resp := []GpuInfo{}
func AMDGetGPUInfo() []RocmGPUInfo {
resp := []RocmGPUInfo{}
if !AMDDetected() {
return resp
}
@@ -50,9 +60,9 @@ func AMDGetGPUInfo() []GpuInfo {
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
var visibleDevices []string
hipVD := os.Getenv("HIP_VISIBLE_DEVICES") // zero based index only
rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
gpuDO := os.Getenv("GPU_DEVICE_ORDINAL") // zero based index
hipVD := envconfig.HipVisibleDevices // zero based index only
rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
gpuDO := envconfig.GpuDeviceOrdinal // zero based index
switch {
// TODO is this priorty order right?
case hipVD != "":
@@ -65,7 +75,7 @@ func AMDGetGPUInfo() []GpuInfo {
visibleDevices = strings.Split(gpuDO, ",")
}
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
gfxOverride := envconfig.HsaOverrideGfxVersion
var supported []string
libDir := ""
@@ -90,7 +100,7 @@ func AMDGetGPUInfo() []GpuInfo {
scanner := bufio.NewScanner(fp)
isCPU := false
var major, minor, patch uint64
var vendor, device uint64
var vendor, device, uniqueID uint64
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -121,30 +131,43 @@ func AMDGetGPUInfo() []GpuInfo {
} else if strings.HasPrefix(line, "vendor_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed vendor_id", "vendor_id", line)
slog.Debug("malformed", "vendor_id", line)
continue
}
vendor, err = strconv.ParseUint(ver[1], 10, 32)
vendor, err = strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Debug("malformed vendor_id" + line)
slog.Debug("malformed", "vendor_id", line, "error", err)
}
} else if strings.HasPrefix(line, "device_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed device_id", "device_id", line)
slog.Debug("malformed", "device_id", line)
continue
}
device, err = strconv.ParseUint(ver[1], 10, 32)
device, err = strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Debug("malformed device_id" + line)
slog.Debug("malformed", "device_id", line, "error", err)
}
} else if strings.HasPrefix(line, "unique_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed", "unique_id", line)
continue
}
uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Debug("malformed", "unique_id", line, "error", err)
}
}
// TODO - any other properties we want to extract and record?
// vendor_id + device_id -> pci lookup for "Name"
// Other metrics that may help us understand relative performance between multiple GPUs
}
// Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
// into consideration, so we instead map the device over to the DRM driver sysfs nodes which
// do reliably report VRAM usage.
if isCPU {
cpuCount++
continue
@@ -156,7 +179,7 @@ func AMDGetGPUInfo() []GpuInfo {
// Shouldn't happen, but just in case...
if gpuID < 0 {
slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
return []GpuInfo{}
return nil
}
if int(major) < RocmComputeMin {
@@ -167,65 +190,68 @@ func AMDGetGPUInfo() []GpuInfo {
// Look up the memory for the current node
totalMemory := uint64(0)
usedMemory := uint64(0)
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUTotalMemoryFileGlob)
propFiles, err := filepath.Glob(propGlob)
if err != nil {
slog.Warn("error looking up total GPU memory", "glob", propGlob, "error", err)
var usedFile string
mapping := []struct {
id uint64
filename string
}{
{vendor, DRMVendorFile},
{device, DRMDeviceFile},
{uniqueID, DRMUniqueIDFile}, // Not all devices will report this
}
// 1 or more memory banks - sum the values of all of them
for _, propFile := range propFiles {
fp, err := os.Open(propFile)
if err != nil {
slog.Warn("failed to open sysfs node", "file", propFile, "erroir", err)
continue
}
defer fp.Close()
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "size_in_bytes") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Warn("malformed " + line)
continue
}
bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Warn("malformed int " + line)
continue
}
totalMemory += bankSizeInBytes
slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
// Map over to DRM location to find the total/free memory
drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
for _, devDir := range drmMatches {
matched := true
for _, m := range mapping {
if m.id == 0 {
// Null ID means it didn't populate, so we can't use it to match
continue
}
filename := filepath.Join(devDir, m.filename)
buf, err := os.ReadFile(filename)
if err != nil {
slog.Debug("failed to read sysfs node", "file", filename, "error", err)
matched = false
break
}
// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
if err != nil {
slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
matched = false
break
}
if cmp != m.id {
matched = false
break
}
}
}
if totalMemory == 0 {
slog.Warn("amdgpu reports zero total memory", "gpu", gpuID)
continue
}
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUUsedMemoryFileGlob)
usedFiles, err := filepath.Glob(usedGlob)
if err != nil {
slog.Warn("error looking up used GPU memory", "glob", usedGlob, "error", err)
continue
}
for _, usedFile := range usedFiles {
fp, err := os.Open(usedFile)
if err != nil {
slog.Warn("failed to open sysfs node", "file", usedFile, "error", err)
if !matched {
continue
}
defer fp.Close()
data, err := io.ReadAll(fp)
// Found the matching DRM directory
slog.Debug("matched", "amdgpu", match, "drm", devDir)
totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
buf, err := os.ReadFile(totalFile)
if err != nil {
slog.Warn("failed to read sysfs node", "file", usedFile, "error", err)
continue
slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
break
}
used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
if err != nil {
slog.Warn("malformed used memory", "data", string(data), "error", err)
continue
slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
break
}
usedMemory += used
usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
usedMemory, err = getFreeMemory(usedFile)
if err != nil {
slog.Debug("failed to update used memory", "error", err)
}
break
}
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
@@ -241,18 +267,21 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: (totalMemory - usedMemory),
gpuInfo := RocmGPUInfo{
GpuInfo: GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: (totalMemory - usedMemory),
},
ID: strconv.Itoa(gpuID),
Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
},
ID: fmt.Sprintf("%d", gpuID),
Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
usedFilepath: usedFile,
}
// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -276,7 +305,7 @@ func AMDGetGPUInfo() []GpuInfo {
libDir, err = AMDValidateLibDir()
if err != nil {
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
return []GpuInfo{}
return nil
}
}
gpuInfo.DependencyPath = libDir
@@ -287,7 +316,7 @@ func AMDGetGPUInfo() []GpuInfo {
supported, err = GetSupportedGFX(libDir)
if err != nil {
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
return []GpuInfo{}
return nil
}
slog.Debug("rocm supported GPUs", "types", supported)
}
@@ -304,6 +333,11 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
}
// Check for env var workarounds
if name == "1002:687f" { // Vega RX 56
gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
}
// The GPU has passed all the verification steps and is supported
resp = append(resp, gpuInfo)
}
@@ -378,3 +412,31 @@ func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
}
return driverMajor, driverMinor, nil
}
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
if len(gpus) == 0 {
return nil
}
for i := range gpus {
usedMemory, err := getFreeMemory(gpus[i].usedFilepath)
if err != nil {
return err
}
slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(gpus[i].TotalMemory-usedMemory))
gpus[i].FreeMemory = gpus[i].TotalMemory - usedMemory
}
return nil
}
func getFreeMemory(usedFile string) (uint64, error) {
buf, err := os.ReadFile(usedFile)
if err != nil {
return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
}
usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
if err != nil {
slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
}
return usedMemory, nil
}

View File

@@ -7,8 +7,10 @@ import (
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -20,12 +22,12 @@ const (
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // This is not sufficient to discern v5 vs v6
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
)
func AMDGetGPUInfo() []GpuInfo {
resp := []GpuInfo{}
func AMDGetGPUInfo() []RocmGPUInfo {
resp := []RocmGPUInfo{}
hl, err := NewHipLib()
if err != nil {
slog.Debug(err.Error())
@@ -33,12 +35,11 @@ func AMDGetGPUInfo() []GpuInfo {
}
defer hl.Release()
// TODO - this reports incorrect version information, so omitting for now
// driverMajor, driverMinor, err := hl.AMDDriverVersion()
// if err != nil {
// // For now this is benign, but we may eventually need to fail compatibility checks
// slog.Debug("error looking up amd driver version", "error", err)
// }
driverMajor, driverMinor, err := hl.AMDDriverVersion()
if err != nil {
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug("error looking up amd driver version", "error", err)
}
// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
count := hl.HipGetDeviceCount()
@@ -52,7 +53,7 @@ func AMDGetGPUInfo() []GpuInfo {
}
var supported []string
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
gfxOverride := envconfig.HsaOverrideGfxVersion
if gfxOverride == "" {
supported, err = GetSupportedGFX(libDir)
if err != nil {
@@ -91,7 +92,8 @@ func AMDGetGPUInfo() []GpuInfo {
continue
}
if gfxOverride == "" {
if !slices.Contains[[]string, string](supported, gfx) {
// Strip off Target Features when comparing
if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
@@ -113,25 +115,27 @@ func AMDGetGPUInfo() []GpuInfo {
continue
}
// TODO revisit this once ROCm v6 is available on windows.
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: freeMemory,
},
ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir,
MinimumMemory: rocmMinimumMemory,
Name: name,
Compute: gfx,
gpuInfo := RocmGPUInfo{
GpuInfo: GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: freeMemory,
},
// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
UnreliableFreeMemory: true,
// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
// DriverMajor: driverMajor,
// DriverMinor: driverMinor,
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir,
MinimumMemory: rocmMinimumMemory,
Name: name,
Compute: gfx,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
},
index: i,
}
resp = append(resp, gpuInfo)
@@ -159,3 +163,30 @@ func AMDValidateLibDir() (string, error) {
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
if len(gpus) == 0 {
return nil
}
hl, err := NewHipLib()
if err != nil {
slog.Debug(err.Error())
return nil
}
defer hl.Release()
for i := range gpus {
err := hl.HipSetDevice(gpus[i].index)
if err != nil {
return err
}
freeMemory, _, err := hl.HipMemGetInfo()
if err != nil {
slog.Warn("get mem info", "id", i, "error", err)
continue
}
slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
gpus[i].FreeMemory = freeMemory
}
return nil
}

View File

@@ -77,20 +77,27 @@ func cleanupTmpDirs() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err == nil {
pid, err := strconv.Atoi(string(raw))
if err == nil {
if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
// Another running ollama, ignore this tmpdir
continue
}
}
} else {
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
}
err = os.RemoveAll(d)
if err != nil {
slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
slog.Warn("failed to read ollama.pid", "path", d, "error", err)
// No pid, ignore this tmpdir
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("failed to parse pid", "path", d, "error", err)
continue
}
proc, err := os.FindProcess(pid)
if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("found running ollama", "pid", pid, "path", d)
// Another running ollama, ignore this tmpdir
continue
}
if err := os.Remove(d); err != nil {
slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
}
}
}

View File

@@ -1,21 +1,16 @@
package gpu
import (
"log/slog"
"golang.org/x/sys/cpu"
)
func GetCPUVariant() string {
func GetCPUCapability() CPUCapability {
if cpu.X86.HasAVX2 {
slog.Debug("CPU has AVX2")
return "avx2"
return CPUCapabilityAVX2
}
if cpu.X86.HasAVX {
slog.Debug("CPU has AVX")
return "avx"
return CPUCapabilityAVX
}
slog.Debug("CPU does not have vector extensions")
// else LCD
return ""
return CPUCapabilityNone
}

View File

@@ -24,19 +24,37 @@ import (
"github.com/ollama/ollama/format"
)
type handles struct {
type cudaHandles struct {
deviceCount int
cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
nvml *C.nvml_handle_t
}
type oneapiHandles struct {
oneapi *C.oneapi_handle_t
deviceCount int
}
const (
cudaMinimumMemory = 457 * format.MebiByte
rocmMinimumMemory = 457 * format.MebiByte
// TODO OneAPI minimum memory
)
var gpuMutex sync.Mutex
var (
gpuMutex sync.Mutex
bootstrapped bool
cpuCapability CPUCapability
cpus []CPUInfo
cudaGPUs []CudaGPUInfo
nvcudaLibPath string
cudartLibPath string
oneapiLibPath string
nvmlLibPath string
rocmGPUs []RocmGPUInfo
oneapiGPUs []OneapiGPUInfo
)
// With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0}
@@ -46,113 +64,113 @@ var RocmComputeMin = 9
// TODO find a better way to detect iGPU instead of minimum memory
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
var CudartLinuxGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
}
var CudartWindowsGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvcudaLinuxGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var NvcudaWindowsGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
var OneapiWindowsGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}
var OneapiLinuxGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
// Note: gpuMutex must already be held
func initGPUHandles() *handles {
func initCudaHandles() *cudaHandles {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
gpuHandles := &handles{}
var cudartMgmtName string
var cudartMgmtPatterns []string
var nvcudaMgmtName string
var nvcudaMgmtPatterns []string
tmpDir, _ := PayloadsDir()
switch runtime.GOOS {
case "windows":
cudartMgmtName = "cudart64_*.dll"
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "nvcuda.dll"
nvcudaMgmtPatterns = NvcudaWindowsGlobs
case "linux":
cudartMgmtName = "libcudart.so*"
if tmpDir != "" {
// TODO - add "payloads" for subprocess
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "libcuda.so*"
nvcudaMgmtPatterns = NvcudaLinuxGlobs
default:
return gpuHandles
cHandles := &cudaHandles{}
// Short Circuit if we already know which library to use
if nvmlLibPath != "" {
cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
return cHandles
}
if nvcudaLibPath != "" {
cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
return cHandles
}
if cudartLibPath != "" {
cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
return cHandles
}
slog.Debug("Detecting GPUs")
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
slog.Debug("searching for GPU discovery libraries for NVIDIA")
var cudartMgmtPatterns []string
// Aligned with driver, we can't carry as payloads
nvcudaMgmtPatterns := NvcudaGlobs
if runtime.GOOS == "windows" {
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
}
tmpDir, _ := PayloadsDir()
if tmpDir != "" {
// TODO - add "payloads" for subprocess
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
if len(NvmlGlobs) > 0 {
nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
if len(nvmlLibPaths) > 0 {
nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
if nvml != nil {
slog.Debug("nvidia-ml loaded", "library", libPath)
cHandles.nvml = nvml
nvmlLibPath = libPath
}
}
}
nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
if len(nvcudaLibPaths) > 0 {
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if nvcuda != nil {
slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
gpuHandles.nvcuda = nvcuda
gpuHandles.deviceCount = deviceCount
return gpuHandles
cHandles.nvcuda = nvcuda
cHandles.deviceCount = deviceCount
nvcudaLibPath = libPath
return cHandles
}
}
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
if len(cudartLibPaths) > 0 {
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
if cudart != nil {
slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
gpuHandles.cudart = cudart
gpuHandles.deviceCount = deviceCount
return gpuHandles
cHandles.cudart = cudart
cHandles.deviceCount = deviceCount
cudartLibPath = libPath
return cHandles
}
}
return gpuHandles
return cHandles
}
// Note: gpuMutex must already be held
func initOneAPIHandles() *oneapiHandles {
oHandles := &oneapiHandles{}
// Short Circuit if we already know which library to use
if oneapiLibPath != "" {
oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
return oHandles
}
oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
if len(oneapiLibPaths) > 0 {
oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
}
return oHandles
}
func GetCPUInfo() GpuInfoList {
gpuMutex.Lock()
if !bootstrapped {
gpuMutex.Unlock()
GetGPUInfo()
} else {
gpuMutex.Unlock()
}
return GpuInfoList{cpus[0].GpuInfo}
}
func GetGPUInfo() GpuInfoList {
@@ -160,112 +178,288 @@ func GetGPUInfo() GpuInfoList {
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
gpuHandles := initGPUHandles()
needRefresh := true
var cHandles *cudaHandles
var oHandles *oneapiHandles
defer func() {
if gpuHandles.cudart != nil {
C.cudart_release(*gpuHandles.cudart)
if cHandles != nil {
if cHandles.cudart != nil {
C.cudart_release(*cHandles.cudart)
}
if cHandles.nvcuda != nil {
C.nvcuda_release(*cHandles.nvcuda)
}
if cHandles.nvml != nil {
C.nvml_release(*cHandles.nvml)
}
}
if gpuHandles.nvcuda != nil {
C.nvcuda_release(*gpuHandles.nvcuda)
if oHandles != nil {
if oHandles.oneapi != nil {
// TODO - is this needed?
C.oneapi_release(*oHandles.oneapi)
}
}
}()
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
cpuVariant := GetCPUVariant()
if cpuVariant == "" && runtime.GOARCH == "amd64" {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
if !bootstrapped {
slog.Info("looking for compatible GPUs")
needRefresh = false
cpuCapability = GetCPUCapability()
var memInfo C.mem_info_t
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Dir(envconfig.RunnersDir)
}
var memInfo C.mem_info_t
resp := []GpuInfo{}
// NVIDIA first
for i := range gpuHandles.deviceCount {
// TODO once we support CPU compilation variants of GPU libraries refine this...
if cpuVariant == "" && runtime.GOARCH == "amd64" {
continue
mem, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
gpuInfo := GpuInfo{
Library: "cuda",
cpus = []CPUInfo{CPUInfo{
GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu",
Variant: cpuCapability,
ID: "0",
},
}}
// Fallback to CPU mode if we're lacking required vector extensions on x86
if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
bootstrapped = true
// No need to do any GPU discovery, since we can't run on them
return GpuInfoList{cpus[0].GpuInfo}
}
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
}
// Load ALL libraries
cHandles = initCudaHandles()
// NVIDIA
for i := range cHandles.deviceCount {
if cHandles.cudart != nil || cHandles.nvcuda != nil {
gpuInfo := CudaGPUInfo{
GpuInfo: GpuInfo{
Library: "cuda",
},
index: i,
}
var driverMajor int
var driverMinor int
if cHandles.cudart != nil {
C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(cHandles.nvcuda.driver_major)
driverMinor = int(cHandles.nvcuda.driver_minor)
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
}
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
continue
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
// query the management library as well so we can record any skew between the two
// which represents overhead on the GPU we must set aside on subsequent updates
if cHandles.nvml != nil {
C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
if memInfo.err != nil {
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
} else {
if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory {
gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory
slog.Info("detected OS VRAM overhead",
"id", gpuInfo.ID,
"library", gpuInfo.Library,
"compute", gpuInfo.Compute,
"driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor),
"name", gpuInfo.Name,
"overhead", format.HumanBytes2(gpuInfo.OSOverhead),
)
}
}
}
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
cudaGPUs = append(cudaGPUs, gpuInfo)
}
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
}
// Intel
if envconfig.IntelGpu {
oHandles = initOneAPIHandles()
// On windows we bundle the oneapi library one level above the runner dir
depPath = ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
}
for d := range oHandles.oneapi.num_drivers {
if oHandles.oneapi == nil {
// shouldn't happen
slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
continue
}
devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
for i := range devCount {
gpuInfo := OneapiGPUInfo{
GpuInfo: GpuInfo{
Library: "oneapi",
},
driverIndex: int(d),
gpuIndex: int(i),
}
// TODO - split bootstrapping from updating free memory
C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
// TODO - convert this to MinimumMemory based on testing...
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
memInfo.free = C.uint64_t(totalFreeMem)
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DependencyPath = depPath
oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
}
}
rocmGPUs = AMDGetGPUInfo()
bootstrapped = true
if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
slog.Info("no compatible GPUs were discovered")
}
}
// For detected GPUs, load library if not loaded
// Refresh free memory usage
if needRefresh {
mem, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
} else {
slog.Debug("updating system memory data",
slog.Group(
"before",
"total", format.HumanBytes2(cpus[0].TotalMemory),
"free", format.HumanBytes2(cpus[0].FreeMemory),
"free_swap", format.HumanBytes2(cpus[0].FreeSwap),
),
slog.Group(
"now",
"total", format.HumanBytes2(mem.TotalMemory),
"free", format.HumanBytes2(mem.FreeMemory),
"free_swap", format.HumanBytes2(mem.FreeSwap),
),
)
cpus[0].FreeMemory = mem.FreeMemory
cpus[0].FreeSwap = mem.FreeSwap
}
var memInfo C.mem_info_t
if cHandles == nil && len(cudaGPUs) > 0 {
cHandles = initCudaHandles()
}
for i, gpu := range cudaGPUs {
if cHandles.nvml != nil {
C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
} else if cHandles.cudart != nil {
C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
} else if cHandles.nvcuda != nil {
C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
memInfo.used = memInfo.total - memInfo.free
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
// shouldn't happen
slog.Warn("no valid cuda library loaded to refresh vram usage")
break
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
}
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
if memInfo.free == 0 {
slog.Warn("error looking up nvidia GPU memory")
continue
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
if cHandles.nvml != nil && gpu.OSOverhead > 0 {
// When using the management library update based on recorded overhead
memInfo.free -= C.uint64_t(gpu.OSOverhead)
}
slog.Debug("updating cuda memory data",
"gpu", gpu.ID,
"name", gpu.Name,
"overhead", format.HumanBytes2(gpu.OSOverhead),
slog.Group(
"before",
"total", format.HumanBytes2(gpu.TotalMemory),
"free", format.HumanBytes2(gpu.FreeMemory),
),
slog.Group(
"now",
"total", format.HumanBytes2(uint64(memInfo.total)),
"free", format.HumanBytes2(uint64(memInfo.free)),
"used", format.HumanBytes2(uint64(memInfo.used)),
),
)
cudaGPUs[i].FreeMemory = uint64(memInfo.free)
}
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
if oHandles == nil && len(oneapiGPUs) > 0 {
oHandles = initOneAPIHandles()
}
for i, gpu := range oneapiGPUs {
if oHandles.oneapi == nil {
// shouldn't happen
slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
continue
}
C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
// TODO - convert this to MinimumMemory based on testing...
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
memInfo.free = C.uint64_t(totalFreeMem)
oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
}
err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
if err != nil {
slog.Debug("problem refreshing ROCm free memory", "error", err)
}
}
// Then AMD
resp = append(resp, AMDGetGPUInfo()...)
resp := []GpuInfo{}
for _, gpu := range cudaGPUs {
resp = append(resp, gpu.GpuInfo)
}
for _, gpu := range rocmGPUs {
resp = append(resp, gpu.GpuInfo)
}
for _, gpu := range oneapiGPUs {
resp = append(resp, gpu.GpuInfo)
}
if len(resp) == 0 {
C.cpu_check_ram(&memInfo)
if memInfo.err != nil {
slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
return resp
}
gpuInfo := GpuInfo{
Library: "cpu",
Variant: cpuVariant,
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
resp = append(resp, gpuInfo)
resp = append(resp, cpus[0].GpuInfo)
}
return resp
}
func GetCPUMem() (memInfo, error) {
var ret memInfo
var info C.mem_info_t
C.cpu_check_ram(&info)
if info.err != nil {
defer C.free(unsafe.Pointer(info.err))
return ret, fmt.Errorf(C.GoString(info.err))
}
ret.FreeMemory = uint64(info.free)
ret.TotalMemory = uint64(info.total)
return ret, nil
}
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
@@ -353,7 +547,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
defer C.free(unsafe.Pointer(lib))
C.nvcuda_init(lib, &resp)
if resp.err != nil {
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
// Decide what log level based on the type of error message to help users understand why
msg := C.GoString(resp.err)
switch resp.cudaErr {
case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
case C.CUDA_ERROR_NO_DEVICE:
slog.Info("no nvidia devices detected", "library", libPath)
case C.CUDA_ERROR_UNKNOWN:
slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
default:
if strings.Contains(msg, "wrong ELF class") {
slog.Debug("skipping 32bit library", "library", libPath)
} else {
slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
}
}
C.free(unsafe.Pointer(resp.err))
} else {
return int(resp.num_devices), &resp.ch, libPath
@@ -362,8 +572,26 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
return 0, nil, ""
}
func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
var resp C.nvml_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range nvmlLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.nvml_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.ch, libPath
}
}
return nil, ""
}
func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
var resp C.oneapi_init_resp_t
num_devices := 0
resp.oh.verbose = getVerboseState()
for _, libPath := range oneapiLibPaths {
lib := C.CString(libPath)
@@ -373,7 +601,10 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
return int(resp.num_devices), &resp.oh, libPath
for i := range resp.oh.num_drivers {
num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
}
return num_devices, &resp.oh, libPath
}
}
return 0, nil, ""

View File

@@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{
{
Library: "cpu",
Variant: GetCPUVariant(),
Variant: GetCPUCapability(),
memInfo: mem,
},
}
@@ -42,10 +42,22 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{info}
}
func GetCPUInfo() GpuInfoList {
mem, _ := GetCPUMem()
return []GpuInfo{
{
Library: "cpu",
Variant: GetCPUCapability(),
memInfo: mem,
},
}
}
func GetCPUMem() (memInfo, error) {
return memInfo{
TotalMemory: uint64(C.getPhysicalMemory()),
FreeMemory: 0,
FreeMemory: uint64(C.getFreeMemory()),
// FreeSwap omitted as Darwin uses dynamic paging
}, nil
}

View File

@@ -47,6 +47,7 @@ typedef struct mem_info {
char gpu_name[GPU_NAME_LEN];
uint64_t total;
uint64_t free;
uint64_t used;
// Compute Capability
int major;
@@ -62,6 +63,7 @@ void cpu_check_ram(mem_info_t *resp);
#include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#include "gpu_info_nvml.h"
#include "gpu_info_oneapi.h"
#endif // __GPU_INFO_H__

View File

@@ -1,45 +0,0 @@
#include "gpu_info.h"
// Fallbacks for CPU mode
#ifdef _WIN32
#include <sysinfoapi.h>
void cpu_check_ram(mem_info_t *resp) {
resp->err = NULL;
MEMORYSTATUSEX info;
info.dwLength = sizeof(info);
if (GlobalMemoryStatusEx(&info) != 0) {
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
} else {
resp->err = LOAD_ERR();
}
return;
}
#elif __linux__
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
void cpu_check_ram(mem_info_t *resp) {
struct sysinfo info;
resp->err = NULL;
if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno));
} else {
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
}
return;
}
#elif __APPLE__
// TODO consider an Apple implementation that does something useful
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else
#error "Unsupported platform"
#endif

View File

@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
@@ -94,7 +94,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
}
void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
cudartMemory_t memInfo = {0,0,0};
cudartReturn_t ret;
@@ -166,9 +166,11 @@ void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
resp->total = memInfo.total;
resp->free = memInfo.free;
resp->used = memInfo.used;
LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
}

View File

@@ -140,7 +140,8 @@ typedef struct cudart_init_resp {
} cudart_init_resp_t;
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
// TODO - if we keep this library longer term, add cudart_get_free
void cudart_release(cudart_handle_t ch);
#endif // __GPU_INFO_CUDART_H__

View File

@@ -2,3 +2,4 @@
#include <stdint.h>
uint64_t getRecommendedMaxVRAM();
uint64_t getPhysicalMemory();
uint64_t getFreeMemory();

View File

@@ -1,4 +1,5 @@
// go:build darwin
#import <Foundation/Foundation.h>
#import <mach/mach.h>
#include "gpu_info_darwin.h"
uint64_t getRecommendedMaxVRAM() {
@@ -8,6 +9,27 @@ uint64_t getRecommendedMaxVRAM() {
return result;
}
// getPhysicalMemory returns the total physical memory in bytes
uint64_t getPhysicalMemory() {
return [[NSProcessInfo processInfo] physicalMemory];
return [NSProcessInfo processInfo].physicalMemory;
}
// getFreeMemory returns the total free memory in bytes, including inactive
// memory that can be reclaimed by the system.
uint64_t getFreeMemory() {
mach_port_t host_port = mach_host_self();
mach_msg_type_number_t host_size = sizeof(vm_statistics64_data_t) / sizeof(integer_t);
vm_size_t pagesize;
vm_statistics64_data_t vm_stat;
host_page_size(host_port, &pagesize);
if (host_statistics64(host_port, HOST_VM_INFO64, (host_info64_t)&vm_stat, &host_size) != KERN_SUCCESS) {
return 0;
}
uint64_t free_memory = (uint64_t)vm_stat.free_count * pagesize;
free_memory += (uint64_t)vm_stat.speculative_count * pagesize;
free_memory += (uint64_t)vm_stat.inactive_count * pagesize;
return free_memory;
}

View File

@@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
CUresult ret;
resp->err = NULL;
resp->num_devices = 0;
resp->cudaErr = CUDA_SUCCESS;
const int buflen = 256;
char buf[buflen + 1];
int i;
@@ -38,12 +39,13 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
nvcuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
resp->cudaErr = -1;
return;
}
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*l[i].p) {
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
@@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
msg);
free(msg);
resp->err = strdup(buf);
resp->cudaErr = -1;
return;
}
}
@@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
resp->err = strdup(buf);
resp->cudaErr = ret;
return;
}
@@ -91,12 +91,13 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
resp->cudaErr = ret;
return;
}
}
const int buflen = 256;
void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
nvcudaMemory_t memInfo = {0,0};
CUresult ret;
@@ -106,13 +107,13 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (h.handle == NULL) {
resp->err = strdup("nvcuda handle isn't initialized");
resp->err = strdup("cuda driver library handle isn't initialized");
return;
}
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device failed to initialize");
snprintf(buf, buflen, "cuda driver library device failed to initialize");
resp->err = strdup(buf);
return;
}
@@ -168,14 +169,14 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
resp->err = strdup(buf);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
@@ -193,12 +194,47 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release primary device context %d", ret);
LOG(1, "cuda driver library failed to release device context %d", ret);
}
}
void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
CUresult ret;
CUcontext ctx = NULL;
CUdevice device = -1;
*free = 0;
*total = 0;
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library device failed to initialize");
return;
}
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library failed to get device context %d", ret);
return;
}
ret = (*h.cuMemGetInfo_v2)(free, total);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library device memory info lookup failure %d", ret);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
return;
}
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library failed to release device context %d", ret);
}
}
void nvcuda_release(nvcuda_handle_t h) {
LOG(h.verbose, "releasing nvcuda library\n");
LOG(h.verbose, "releasing cuda driver library\n");
UNLOAD_LIBRARY(h.handle);
// TODO and other context release logic?
h.handle = NULL;

View File

@@ -7,9 +7,12 @@
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_OUT_OF_MEMORY = 2,
CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
CUDA_ERROR_NO_DEVICE = 100,
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
CUDA_ERROR_UNKNOWN = 999,
// Other values omitted for now...
} CUresult;
@@ -64,10 +67,12 @@ typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch;
int num_devices;
CUresult cudaErr;
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free, uint64_t *total);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__

104
gpu/gpu_info_nvml.c Normal file
View File

@@ -0,0 +1,104 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_nvml.h"
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvml_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
}
void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
if (ret != NVML_SUCCESS) {
LOG(1, "unable to get device handle %d: %d", device_id, ret);
*free = 0;
return;
}
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
*free = 0;
return;
}
*free = memInfo.free;
*total = memInfo.total;
*used = memInfo.used;
}
void nvml_release(nvml_handle_t h) {
LOG(h.verbose, "releasing nvml library\n");
nvmlReturn_t ret;
ret = (*h.nvmlShutdown)();
if (ret != NVML_SUCCESS) {
LOG(1, "error during nvmlShutdown %d", ret);
}
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
#endif // __APPLE__

48
gpu/gpu_info_nvml.h Normal file
View File

@@ -0,0 +1,48 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct nvml_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
} nvml_handle_t;
typedef struct nvml_init_resp {
char *err; // If err is non-null handle is invalid
nvml_handle_t ch;
} nvml_init_resp_t;
typedef struct nvml_compute_capability {
char *err;
int major;
int minor;
} nvml_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__

View File

@@ -4,15 +4,17 @@
#include <string.h>
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
{
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
ze_result_t ret;
resp->err = NULL;
resp->oh.devices = NULL;
resp->oh.num_devices = NULL;
resp->oh.drivers = NULL;
resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup
{
int i, d;
struct lookup {
char *s;
void **p;
} l[] = {
@@ -28,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
};
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
if (!resp->oh.handle)
{
if (!resp->oh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Intel GPUs: %s\n",
@@ -44,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
"wiring Level-Zero management library functions in %s\n",
oneapi_lib_path);
for (i = 0; l[i].s != NULL; i++)
{
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!l[i].p)
{
if (!*(l[i].p)) {
resp->oh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -63,23 +62,70 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
}
}
LOG(resp->oh.verbose, "calling zesInit\n");
ret = (*resp->oh.zesInit)(0);
if (ret != ZE_RESULT_SUCCESS)
{
LOG(resp->oh.verbose, "zesInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->oh.handle);
resp->oh.handle = NULL;
snprintf(buf, buflen, "oneapi vram init failure: %d", ret);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
(*resp->oh.zesDriverGet)(&resp->num_devices, NULL);
LOG(resp->oh.verbose, "calling zesDriverGet\n");
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
resp->oh.devices =
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
resp->oh.devices[d] =
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
ret = (*resp->oh.zesDeviceGet)(
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
}
return;
}
void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
{
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp) {
ze_result_t ret;
resp->err = NULL;
uint64_t totalMem = 0;
@@ -88,127 +134,126 @@ void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
char buf[buflen + 1];
int i, d, m;
if (h.handle == NULL)
{
if (h.handle == NULL) {
resp->err = strdup("Level-Zero handle not initialized");
return;
}
uint32_t driversCount = 0;
ret = (*h.zesDriverGet)(&driversCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get driver count: %d", ret);
resp->err = strdup(buf);
if (driver > h.num_drivers || device > h.num_devices[driver]) {
resp->err = strdup("driver of device index out of bounds");
return;
}
LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount);
zes_driver_handle_t *allDrivers =
malloc(driversCount * sizeof(zes_driver_handle_t));
(*h.zesDriverGet)(&driversCount, allDrivers);
resp->total = 0;
resp->free = 0;
for (d = 0; d < driversCount; d++)
{
uint32_t deviceCount = 0;
ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get device count: %d", ret);
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
return;
}
snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
// (this is probably wrong...)
// TODO - the driver isn't included - what if there are multiple drivers?
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
props.modelName);
LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
props.brandName);
LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
props.vendorName);
LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
props.serialNumber);
LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
props.boardNumber);
}
// TODO
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
NULL);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
for (m = 0; m < memCount; m++) {
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get memory state: %x", ret);
resp->err = strdup(buf);
free(allDrivers);
free(mems);
return;
}
LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount);
zes_device_handle_t *devices =
malloc(deviceCount * sizeof(zes_device_handle_t));
(*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices);
for (i = 0; i < deviceCount; i++)
{
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(devices[i], &props);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
return;
}
if (h.verbose)
{
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d] oneAPI device name: %s\n", i,
props.modelName);
LOG(h.verbose, "[%d] oneAPI brand: %s\n", i,
props.brandName);
LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i,
props.vendorName);
LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i,
props.serialNumber);
LOG(h.verbose, "[%d] oneAPI board number: %s\n", i,
props.boardNumber);
}
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen,
"unable to enumerate Level-Zero memory modules: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems);
for (m = 0; m < memCount; m++)
{
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get memory state: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
free(mems);
return;
}
resp->total += state.size;
resp->free += state.free;
}
free(mems);
}
free(devices);
resp->total += state.size;
resp->free += state.free;
}
free(allDrivers);
free(mems);
}
void oneapi_release(oneapi_handle_t h) {
int d;
LOG(h.verbose, "releasing oneapi library\n");
for (d = 0; d < h.num_drivers; d++) {
if (h.devices != NULL && h.devices[d] != NULL) {
free(h.devices[d]);
}
}
if (h.devices != NULL) {
free(h.devices);
h.devices = NULL;
}
if (h.num_devices != NULL) {
free(h.num_devices);
h.num_devices = NULL;
}
if (h.drivers != NULL) {
free(h.drivers);
h.drivers = NULL;
}
h.num_drivers = 0;
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
if (h.handle == NULL || h.num_devices == NULL) {
return 0;
}
if (driver > h.num_drivers) {
return 0;
}
return (int)h.num_devices[driver];
}
#endif // __APPLE__

View File

@@ -9,8 +9,7 @@
#define ZE_BIT(_i) (1 << _i)
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum ze_result_t
{
typedef enum ze_result_t {
ZE_RESULT_SUCCESS = 0,
// Other values omitted for now...
} ze_result_t;
@@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
typedef struct _zes_device_handle_t *zes_device_handle_t;
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
typedef enum _ze_structure_type_t
{
typedef enum _ze_structure_type_t {
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_structure_type_t;
typedef enum _zes_structure_type_t
{
typedef enum _zes_structure_type_t {
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
@@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_structure_type_t;
typedef enum _zes_mem_type_t
{
typedef enum _zes_mem_type_t {
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_mem_type_t;
typedef enum _zes_mem_loc_t
{
typedef enum _zes_mem_loc_t {
ZES_MEM_LOC_SYSTEM = 0,
ZES_MEM_LOC_DEVICE = 1,
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
} zes_mem_loc_t;
typedef enum _zes_mem_health_t
{
typedef enum _zes_mem_health_t {
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
} zes_mem_health_t;
typedef struct _ze_device_uuid_t
{
typedef struct _ze_device_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} ze_device_uuid_t;
typedef struct _zes_uuid_t
{
typedef struct _zes_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} zes_uuid_t;
typedef enum _ze_device_type_t
{
typedef enum _ze_device_type_t {
ZE_DEVICE_TYPE_GPU = 1,
ZE_DEVICE_TYPE_CPU = 2,
ZE_DEVICE_TYPE_FPGA = 3,
@@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_device_type_t;
typedef enum _zes_device_type_t
{
typedef enum _zes_device_type_t {
ZES_DEVICE_TYPE_GPU = 1,
ZES_DEVICE_TYPE_CPU = 2,
ZES_DEVICE_TYPE_FPGA = 3,
@@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
} zes_device_type_t;
typedef uint32_t ze_device_property_flags_t;
typedef enum _ze_device_property_flag_t
{
typedef enum _ze_device_property_flag_t {
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
} ze_device_property_flag_t;
typedef uint32_t zes_device_property_flags_t;
typedef enum _zes_device_property_flag_t
{
typedef enum _zes_device_property_flag_t {
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} zes_device_property_flag_t;
typedef struct _ze_device_properties_t
{
typedef struct _ze_device_properties_t {
ze_structure_type_t stype;
void *pNext;
ze_device_type_t type;
@@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
char name[ZE_MAX_DEVICE_NAME];
} ze_device_properties_t;
typedef struct _zes_device_properties_t
{
typedef struct _zes_device_properties_t {
zes_structure_type_t stype;
void *pNext;
ze_device_properties_t core;
@@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
char driverVersion[ZES_STRING_PROPERTY_SIZE];
} zes_device_properties_t;
typedef struct _zes_device_ext_properties_t
{
typedef struct _zes_device_ext_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_uuid_t uuid;
@@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
zes_device_property_flags_t flags;
} zes_device_ext_properties_t;
typedef struct _zes_mem_properties_t
{
typedef struct _zes_mem_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_mem_type_t type;
@@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
int32_t numChannels;
} zes_mem_properties_t;
typedef struct _zes_mem_state_t
{
typedef struct _zes_mem_state_t {
zes_structure_type_t stype;
const void *pNext;
zes_mem_health_t health;
@@ -171,10 +154,19 @@ typedef struct _zes_mem_state_t
uint64_t size;
} zes_mem_state_t;
typedef struct oneapi_handle
{
typedef struct oneapi_handle {
void *handle;
uint16_t verbose;
uint32_t num_drivers;
zes_driver_handle_t *drivers;
uint32_t *num_devices;
zes_device_handle_t **devices;
// TODO Driver major, minor information
// int driver_major;
// int driver_minor;
ze_result_t (*zesInit)(int);
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
@@ -191,21 +183,21 @@ typedef struct oneapi_handle
} oneapi_handle_t;
typedef struct oneapi_init_resp
{
typedef struct oneapi_init_resp {
char *err; // If err is non-null handle is invalid
int num_devices;
oneapi_handle_t oh;
} oneapi_init_resp_t;
typedef struct oneapi_version_resp
{
typedef struct oneapi_version_resp {
ze_result_t status;
char *str; // Contains version or error string if status != 0
} oneapi_version_resp_t;
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp);
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp);
void oneapi_release(oneapi_handle_t h);
int oneapi_get_device_count(oneapi_handle_t h, int driver);
#endif // __GPU_INFO_INTEL_H__
#endif // __APPLE__

90
gpu/gpu_linux.go Normal file
View File

@@ -0,0 +1,90 @@
package gpu
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/ollama/ollama/format"
)
var CudartGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
}
var NvmlGlobs = []string{}
var NvcudaGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var OneapiGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}
var CudartMgmtName = "libcudart.so*"
var NvcudaMgmtName = "libcuda.so*"
var NvmlMgmtName = "" // not currently wired on linux
var OneapiMgmtName = "libze_intel_gpu.so"
func GetCPUMem() (memInfo, error) {
var mem memInfo
var total, available, free, buffers, cached, freeSwap uint64
f, err := os.Open("/proc/meminfo")
if err != nil {
return mem, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
line := s.Text()
switch {
case strings.HasPrefix(line, "MemTotal:"):
_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
case strings.HasPrefix(line, "MemAvailable:"):
_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
case strings.HasPrefix(line, "MemFree:"):
_, err = fmt.Sscanf(line, "MemFree:%d", &free)
case strings.HasPrefix(line, "Buffers:"):
_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
case strings.HasPrefix(line, "Cached:"):
_, err = fmt.Sscanf(line, "Cached:%d", &cached)
case strings.HasPrefix(line, "SwapFree:"):
_, err = fmt.Sscanf(line, "SwapFree:%d", &freeSwap)
default:
continue
}
if err != nil {
return mem, err
}
}
mem.TotalMemory = total * format.KibiByte
mem.FreeSwap = freeSwap * format.KibiByte
if available > 0 {
mem.FreeMemory = available * format.KibiByte
} else {
mem.FreeMemory = (free + buffers + cached) * format.KibiByte
}
return mem, nil
}

55
gpu/gpu_windows.go Normal file
View File

@@ -0,0 +1,55 @@
package gpu
import (
"fmt"
"syscall"
"unsafe"
)
type MEMORYSTATUSEX struct {
length uint32
MemoryLoad uint32
TotalPhys uint64
AvailPhys uint64
TotalPageFile uint64
AvailPageFile uint64
TotalVirtual uint64
AvailVirtual uint64
AvailExtendedVirtual uint64
}
var (
k32 = syscall.NewLazyDLL("kernel32.dll")
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
)
var CudartGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvmlGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var NvcudaGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
var OneapiGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}
var CudartMgmtName = "cudart64_*.dll"
var NvcudaMgmtName = "nvcuda.dll"
var NvmlMgmtName = "nvml.dll"
var OneapiMgmtName = "ze_intel_gpu64.dll"
func GetCPUMem() (memInfo, error) {
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
if r1 == 0 {
return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
}
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
}

View File

@@ -10,6 +10,7 @@ import (
type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
FreeSwap uint64 `json:"free_swap,omitempty"`
}
// Beginning of an `ollama info` command
@@ -18,7 +19,7 @@ type GpuInfo struct {
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`
Variant CPUCapability `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
@@ -26,6 +27,14 @@ type GpuInfo struct {
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key,value]
EnvWorkarounds [][2]string `json:"envs,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage
// False indicates FreeMemory can generally be trusted on this GPU
UnreliableFreeMemory bool
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
@@ -38,6 +47,31 @@ type GpuInfo struct {
// TODO other performance capability info to help in scheduling decisions
}
type CPUInfo struct {
GpuInfo
}
type CudaGPUInfo struct {
GpuInfo
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo
type RocmGPUInfo struct {
GpuInfo
usedFilepath string //nolint:unused,nolintlint
index int //nolint:unused,nolintlint
}
type RocmGPUInfoList []RocmGPUInfo
type OneapiGPUInfo struct {
GpuInfo
driverIndex int //nolint:unused,nolintlint
gpuIndex int //nolint:unused,nolintlint
}
type OneapiGPUInfoList []OneapiGPUInfo
type GpuInfoList []GpuInfo
// Split up the set of gpu info's by Library and variant
@@ -47,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
for _, info := range l {
found := false
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
if info.Variant != CPUCapabilityNone {
requested += "_" + info.Variant.String()
}
for i, lib := range libs {
if lib == requested {
@@ -86,3 +120,26 @@ type ByFreeMemory []GpuInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
type CPUCapability uint32
// Override at build time when building base GPU runners
var GPURunnerCPUCapability = CPUCapabilityAVX
const (
CPUCapabilityNone CPUCapability = iota
CPUCapabilityAVX
CPUCapabilityAVX2
// TODO AVX512
)
func (c CPUCapability) String() string {
switch c {
case CPUCapabilityAVX:
return "avx"
case CPUCapabilityAVX2:
return "avx2"
default:
return "no vector extensions"
}
}

View File

@@ -19,17 +19,19 @@ func TestMultiModelConcurrency(t *testing.T) {
var (
req = [2]api.GenerateRequest{
{
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "tinydolphin",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
Model: "tinydolphin",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
@@ -38,42 +40,64 @@ func TestMultiModelConcurrency(t *testing.T) {
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims"},
[]string{"england", "english", "massachusetts", "pilgrims", "british"},
}
)
var wg sync.WaitGroup
wg.Add(len(req))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
for i := 0; i < len(req); i++ {
require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
}
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
GenerateTestHelper(ctx, t, req[i], resp[i])
DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
}(i)
}
wg.Wait()
}
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
req, resp := GenerateRequests()
reqLimit := len(req)
iterLimit := 5
vram := os.Getenv("OLLAMA_MAX_VRAM")
if vram != "" {
max, err := strconv.ParseUint(vram, 10, 64)
require.NoError(t, err)
// Don't hammer on small VRAM cards...
if max < 4*1024*1024*1024 {
reqLimit = min(reqLimit, 2)
iterLimit = 2
}
}
ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req, resp := GenerateRequests()
// Get the server running (if applicable) warm the model up with a single initial request
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
var wg sync.WaitGroup
wg.Add(len(req))
for i := 0; i < len(req); i++ {
wg.Add(reqLimit)
for i := 0; i < reqLimit; i++ {
go func(i int) {
defer wg.Done()
for j := 0; j < 5; j++ {
for j := 0; j < iterLimit; j++ {
slog.Info("Starting", "req", i, "iter", j)
// On slower GPUs it can take a while to process the 4 concurrent requests
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
}
}(i)
}
@@ -221,5 +245,23 @@ func TestMultiModelStress(t *testing.T) {
}
}(i)
}
go func() {
for {
time.Sleep(2 * time.Second)
select {
case <-ctx.Done():
return
default:
models, err := client.ListRunning(ctx)
if err != nil {
slog.Warn("failed to list running models", "error", err)
continue
}
for _, m := range models.Models {
slog.Info("loaded model snapshot", "model", m)
}
}
}
}()
wg.Wait()
}

View File

@@ -11,7 +11,8 @@ import (
)
func TestContextExhaustion(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
// Longer needed for small footprint GPUs
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
@@ -24,5 +25,10 @@ func TestContextExhaustion(t *testing.T) {
"num_ctx": 128,
},
}
GenerateTestHelper(ctx, t, req, []string{"once", "upon", "lived"})
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
}

152
integration/embed_test.go Normal file
View File

@@ -0,0 +1,152 @@
//go:build integration
package integration
import (
"context"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
}
if len(res.Embeddings) != 1 {
t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
}
if len(res.Embeddings[0]) != 384 {
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
}
if res.Embeddings[0][0] != 0.010071031 {
t.Fatalf("expected 0.010071031, got %f", res.Embeddings[0][0])
}
}
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
}
if len(res.Embeddings) != 2 {
t.Fatalf("expected 2 embeddings, got %d", len(res.Embeddings))
}
if len(res.Embeddings[0]) != 384 {
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
}
if res.Embeddings[0][0] != 0.010071031 || res.Embeddings[1][0] != -0.009802706 {
t.Fatalf("expected 0.010071031 and -0.009802706, got %f and %f", res.Embeddings[0][0], res.Embeddings[1][0])
}
}
func TestAllMiniLmEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
truncTrue, truncFalse := true, false
type testReq struct {
Name string
Request api.EmbedRequest
}
reqs := []testReq{
{
Name: "Target Truncation",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why",
},
},
{
Name: "Default Truncate",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Options: map[string]any{"num_ctx": 1},
},
},
{
Name: "Explicit Truncate",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncTrue,
Options: map[string]any{"num_ctx": 1},
},
},
}
res := make(map[string]*api.EmbedResponse)
for _, req := range reqs {
response, err := embedTestHelper(ctx, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
res[req.Name] = response
}
if res["Target Truncation"].Embeddings[0][0] != res["Default Truncate"].Embeddings[0][0] {
t.Fatal("expected default request to truncate correctly")
}
if res["Default Truncate"].Embeddings[0][0] != res["Explicit Truncate"].Embeddings[0][0] {
t.Fatal("expected default request and truncate true request to be the same")
}
// check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
Options: map[string]any{"num_ctx": 1},
})
if err == nil {
t.Fatal("expected error, got nil")
}
}
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
response, err := client.Embed(ctx, &req)
if err != nil {
return nil, err
}
return response, nil
}

View File

@@ -32,7 +32,11 @@ func TestIntegrationMultimodal(t *testing.T) {
resp := "the ollam"
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
GenerateTestHelper(ctx, t, req, []string{resp})
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
require.NoError(t, PullIfMissing(ctx, client, req.Model))
// llava models on CPU can be quite slow to start,
DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
}
const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb

View File

@@ -140,7 +140,7 @@ func PullIfMissing(ctx context.Context, client *api.Client, modelName string) er
showCtx, cancel := context.WithDeadlineCause(
ctx,
time.Now().Add(5*time.Second),
time.Now().Add(10*time.Second),
fmt.Errorf("show for existing model %s took too long", modelName),
)
defer cancel()
@@ -287,41 +287,46 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
func GenerateRequests() ([]api.GenerateRequest, [][]string) {
return []api.GenerateRequest{
{
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "why is the color of dirt brown?",
Stream: &stream,
Model: "orca-mini",
Prompt: "why is the color of dirt brown?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
Model: "orca-mini",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "what is the origin of independence day?",
Stream: &stream,
Model: "orca-mini",
Prompt: "what is the origin of independence day?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "what is the composition of air?",
Stream: &stream,
Model: "orca-mini",
Prompt: "what is the composition of air?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
@@ -331,7 +336,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
[][]string{
[]string{"sunlight"},
[]string{"soil", "organic", "earth", "black", "tan"},
[]string{"england", "english", "massachusetts", "pilgrims"},
[]string{"england", "english", "massachusetts", "pilgrims", "british"},
[]string{"fourth", "july", "declaration", "independence"},
[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
}

View File

@@ -1,14 +1,13 @@
set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -56,7 +56,6 @@ struct server_params {
std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
std::string chat_template = "";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
@@ -427,16 +426,6 @@ struct llama_server_context
return true;
}
void validate_model_chat_template(server_params & sparams) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) {
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
sparams.chat_template = "chatml";
}
}
void initialize() {
// create slots
all_slots_are_idle = true;
@@ -1393,12 +1382,50 @@ struct llama_server_context
}
}
std::string common_prefix(const std::string& str1, const std::string& str2) {
auto mismatch_pair = std::mismatch(str1.begin(), str1.end(), str2.begin());
return std::string(str1.begin(), mismatch_pair.first);
}
// Find the slot that has the greatest common prefix
server_slot *prefix_slot(const json &prompt) {
if (!prompt.is_string()) {
return nullptr;
}
std::string prompt_str = prompt.get<std::string>();
server_slot *slot = nullptr;
size_t longest = 0;
for (server_slot &s : slots) {
if (s.available() && s.prompt.is_string()) {
std::string s_prompt = s.prompt.get<std::string>();
std::string prefix = common_prefix(s_prompt, prompt_str);
if (prefix.size() > longest) {
slot = &s;
longest = prefix.size();
}
}
}
if (!slot) {
return get_slot(-1);
}
LOG_DEBUG("slot with common prefix found", {{
"slot_id", slot->id,
"characters", longest
}});
return slot;
}
void process_single_task(task_server& task)
{
switch (task.type)
{
case TASK_TYPE_COMPLETION: {
server_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
server_slot *slot = prefix_slot(task.data["prompt"]);
if (slot == nullptr)
{
// if no slot is available, we defer this task for processing later
@@ -1665,22 +1692,23 @@ struct llama_server_context
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
{
const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2;
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
const int n_shift = n_left / 2;
const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
std::vector<llama_token> new_tokens(
prompt_tokens.begin(),
prompt_tokens.begin() + slot.params.n_keep);
new_tokens.insert(
new_tokens.end(),
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
prompt_tokens.begin() + slot.params.n_keep + n_erase,
prompt_tokens.end());
LOG_VERBOSE("input truncated", {
{"n_ctx", slot.n_ctx},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
LOG_INFO("input truncated", {
{"n_ctx", slot.n_ctx},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
{"n_shift", n_shift},
{"n_erase", n_erase},
});
slot.truncated = true;
prompt_tokens = new_tokens;
@@ -1715,7 +1743,7 @@ struct llama_server_context
slot.n_past -= 1;
}
slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;
slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
if (slot.ga_n != 1)
{
@@ -2335,9 +2363,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
#ifndef GGML_USE_CUDA
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUDA
}
else if (arg == "--tensor-split" || arg == "-ts")
{
@@ -2346,7 +2374,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@@ -2367,8 +2395,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
}
}
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS
LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUDA
}
else if (arg == "--main-gpu" || arg == "-mg")
{
@@ -2377,7 +2405,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
@@ -2535,7 +2563,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
sparams.chat_template = argv[i];
}
else if (arg == "--override-kv")
{
@@ -3008,11 +3035,6 @@ int main(int argc, char **argv) {
}
const auto model_meta = llama.model_meta();
if (sparams.chat_template.empty()) { // custom chat template is not supplied
// check if the template comes with the model is supported by us
llama.validate_model_chat_template(sparams);
}
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
@@ -3166,26 +3188,33 @@ int main(int argc, char **argv) {
prompt = "";
}
json image_data;
if (body.count("image_data") != 0) {
image_data = body["image_data"];
}
else
{
image_data = "";
if (prompt.size() == 1) {
prompt = prompt[0];
}
// create and queue the task
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, true, -1);
json responses;
{
const int id_task = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(id_task);
llama.request_completion(id_task, {{"prompt", prompt}}, true, -1);
// get the result
task_result result = llama.queue_results.recv(task_id);
llama.queue_results.remove_waiting_task_id(task_id);
// get the result
task_result result = llama.queue_results.recv(id_task);
llama.queue_results.remove_waiting_task_id(id_task);
if (result.error) {
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
}
// send the result
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
responses = result.result_json.value("results", std::vector<json>{result.result_json});
json embeddings = json::array();
for (auto & elem : responses) {
embeddings.push_back(elem.at("embedding"));
}
// send the result
json embedding_res = json{{"embedding", embeddings}};
return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
}
});
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?

View File

@@ -18,16 +18,16 @@ sign() {
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
case "${GOARCH}" in
"amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off"
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,14 +75,14 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build

View File

@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
@@ -77,29 +77,29 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
init_vars
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
compress
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# Note: the following seem to yield slower results than AVX2 - ymmv
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
# -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
@@ -116,7 +116,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
@@ -129,7 +129,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
@@ -170,15 +170,15 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
#
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
# Disabling has minimal performance effect while maintaining compatibility.
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
fi
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
echo "Building custom CUDA GPU"
else
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
fi
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
@@ -216,7 +216,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
init_vars
source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
CC=icx
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=OFF"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
BUILD_DIR="../build/linux/${ARCH}/oneapi"
EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@@ -254,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""

View File

@@ -6,18 +6,9 @@ function amdGPUs {
if ($env:AMDGPU_TARGETS) {
return $env:AMDGPU_TARGETS
}
# TODO - load from some common data file for linux + windows build consistency
# Current supported rocblas list from ROCm v6.1.2 on windows
# https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus
$GPU_LIST = @(
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx940"
"gfx941"
"gfx942"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
@@ -39,7 +30,8 @@ function init_vars {
}
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off"
"-DGGML_NATIVE=off",
"-DGGML_OPENMP=off"
)
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -122,8 +114,13 @@ function build {
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($cmakeDefs -contains "-G") {
$extra=@("-j8")
} else {
$extra= @("--", "/p:CL_MPcount=8")
}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
@@ -176,9 +173,9 @@ function cleanup {
}
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
function build_static() {
@@ -198,12 +195,13 @@ function build_static() {
"-DCMAKE_C_COMPILER=gcc.exe",
"-DCMAKE_CXX_COMPILER=g++.exe",
"-DBUILD_SHARED_LIBS=off",
"-DLLAMA_NATIVE=off",
"-DLLAMA_AVX=off",
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
"-DGGML_NATIVE=off",
"-DGGML_AVX=off",
"-DGGML_AVX2=off",
"-DGGML_AVX512=off",
"-DGGML_F16C=off",
"-DGGML_FMA=off",
"-DGGML_OPENMP=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
@@ -217,7 +215,7 @@ function build_cpu($gen_arch) {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
# remaining llama.cpp builds use MSVC
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu"
$script:distDir="$script:DIST_BASE\cpu"
write-host "Building LCD CPU"
@@ -232,7 +230,7 @@ function build_cpu($gen_arch) {
function build_cpu_avx() {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
$script:distDir="$script:DIST_BASE\cpu_avx"
write-host "Building AVX CPU"
@@ -247,7 +245,7 @@ function build_cpu_avx() {
function build_cpu_avx2() {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=on", "-DGGML_AVX512=off", "-DGGML_FMA=on", "-DGGML_F16C=on") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
$script:distDir="$script:DIST_BASE\cpu_avx2"
write-host "Building AVX2 CPU"
@@ -270,7 +268,15 @@ function build_cuda() {
init_vars
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
$script:cmakeDefs += @(
"-A", "x64",
"-DGGML_CUDA=ON",
"-DGGML_AVX=on",
"-DGGML_AVX2=off",
"-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
"-DCMAKE_CUDA_FLAGS=-t8",
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
)
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
@@ -280,10 +286,12 @@ function build_cuda() {
sign
install
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
} else {
write-host "Skipping CUDA generation step"
}
@@ -302,7 +310,7 @@ function build_oneapi() {
$script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
$script:cmakeDefs += @(
"-G", "MinGW Makefiles",
"-DLLAMA_SYCL=ON",
"-DGGML_SYCL=ON",
"-DCMAKE_C_COMPILER=icx",
"-DCMAKE_CXX_COMPILER=icx",
"-DCMAKE_BUILD_TYPE=Release"
@@ -317,16 +325,18 @@ function build_oneapi() {
sign
install
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
} else {
Write-Host "Skipping oneAPI generation step"
}
@@ -346,10 +356,11 @@ function build_rocm() {
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DLLAMA_HIPBLAS=on",
"-DGGML_HIPBLAS=on",
"-DLLAMA_CUDA_NO_PEER_COPY=on",
"-DHIP_PLATFORM=amd",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DGGML_AVX=on",
"-DGGML_AVX2=off",
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
"-DAMDGPU_TARGETS=$(amdGPUs)",
"-DGPU_TARGETS=$(amdGPUs)"
@@ -375,7 +386,6 @@ function build_rocm() {
sign
install
# Assumes v5.7, may need adjustments for v6
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"

View File

@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
return llm.tensors
}
func (llm *ggla) decode(rs io.ReadSeeker) error {
func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
for {
var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return err
}
defer func() {
if errors.Is(retErr, io.EOF) {
retErr = io.ErrUnexpectedEOF
}
}()
var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
return err
}
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
return err
}

View File

@@ -6,6 +6,8 @@ import (
"fmt"
"io"
"strings"
"github.com/ollama/ollama/util/bufioutil"
)
type GGML struct {
@@ -69,6 +71,30 @@ func (kv KV) HeadCountKV() uint64 {
return 1
}
func (kv KV) EmbeddingHeadCount() uint64 {
if heads := kv.HeadCount(); heads > 0 {
return kv.EmbeddingLength() / kv.HeadCount()
}
return 0
}
func (kv KV) EmbeddingHeadCountK() uint64 {
if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
return k
}
return kv.EmbeddingHeadCount()
}
func (kv KV) EmbeddingHeadCountV() uint64 {
if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
return v
}
return kv.EmbeddingHeadCount()
}
func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV()
}
@@ -254,7 +280,18 @@ func DetectGGMLType(b []byte) string {
}
}
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
// DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err
@@ -267,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
case FILE_MAGIC_GGLA:
c = &containerGGLA{}
case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian}
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian}
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default:
return nil, 0, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if errors.Is(err, io.EOF) {
// noop
} else if err != nil {
if err != nil {
return nil, 0, err
}
@@ -297,7 +332,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
embeddingHeads := llm.KV().EmbeddingHeadCount()
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
layers := llm.Tensors().Layers()
@@ -307,7 +345,8 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
partialOffload = 4 * batch * embedding
partialOffload += max(
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)
@@ -315,21 +354,30 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max(
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
)
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
// mixtral 8x7b
ffnGateWeight1 := ffnGateWeight.Shape[1]
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
partialOffload = max(
4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
)
}
case "gemma":
fullOffload = 4 * batch * (embedding + vocab)
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
case "gemma", "gemma2":
fullOffload = max(
4*batch*(embedding+vocab),
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
)
partialOffload = max(
4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
4*embeddingHeadsK*context*8+
embedding*embeddingHeadsK*heads*9/16,
)
case "command-r":
fullOffload = max(
4*batch*(embedding+vocab),
@@ -366,6 +414,42 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(vocab+2*embedding),
fullOffload,
)
case "deepseek2":
fullOffload = max(
4*batch*(3*embedding+vocab),
4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
)
partialOffload = max(
4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
)
case "chatglm":
fullOffload = 4 * batch * (embedding + vocab)
partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
fullOffload = max(
fullOffload,
4*batch*(2+
2*embedding+
context+
context*heads+
embeddingHeadsK*heads+
qkvBias.Shape[0]),
)
partialOffload = max(
partialOffload,
4*batch*(1+
2*embedding+
embeddingHeadsK*heads+
context+
context*heads)+
4*embeddingHeadsK*context+
4*context*embeddingHeadsK+
4*qkvBias.Shape[0],
)
}
}
return

1
llm/ggml_test.go Normal file
View File

@@ -0,0 +1 @@
package llm

View File

@@ -3,11 +3,10 @@ package llm
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"strings"
"log/slog"
)
type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
NumTensor uint64
NumKV uint64
}
maxArraySize int
}
func (c *containerGGUF) canCollectArray(size int) bool {
return c.maxArraySize < 0 || size <= c.maxArraySize
}
func (c *containerGGUF) Name() string {
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
}
model := newGGUF(c)
slog.Debug(fmt.Sprintf("model = %#v", model))
if err := model.Decode(rs); err != nil {
return nil, err
}
@@ -85,6 +89,8 @@ type gguf struct {
tensors []*Tensor
parameters uint64
scratch [16 << 10]byte
}
func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
}
// decode tensors
for i := 0; uint64(i) < llm.numTensor(); i++ {
for range llm.numTensor() {
name, err := readGGUFString(llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor name: %w", err)
}
// dims is the number of dimensions in the tensor
dims, err := readGGUF[uint32](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor dimensions: %w", err)
}
shape := [4]uint64{1, 1, 1, 1}
for i := 0; uint32(i) < dims; i++ {
shape[i], err = readGGUF[uint64](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor shape: %w", err)
}
}
kind, err := readGGUF[uint32](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor kind: %w", err)
}
offset, err := readGGUF[uint64](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor offset: %w", err)
}
tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
alignment = 32
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
for _, tensor := range llm.tensors {
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return err
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return fmt.Errorf("failed to get current offset: %w", err)
}
padding := llm.padding(int64(tensor.Size()), int64(alignment))
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
return fmt.Errorf("failed to seek to init padding: %w", err)
}
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return fmt.Errorf("failed to seek to tensor: %w", err)
}
}
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
return b.String(), nil
}
func discardGGUFString(llm *gguf, r io.Reader) error {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return err
}
size := int(llm.ByteOrder.Uint64(buf))
for size > 0 {
n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
if err != nil {
return err
}
size -= n
}
return nil
}
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
if llm.Version == 1 {
return readGGUFV1String(llm, r)
}
var length uint64
if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return "", err
}
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(length)); err != nil {
length := int(llm.ByteOrder.Uint64(buf))
if length > len(llm.scratch) {
buf = make([]byte, length)
} else {
buf = llm.scratch[:length]
}
clear(buf)
_, err = io.ReadFull(r, buf)
if err != nil {
return "", err
}
return b.String(), nil
return string(buf), nil
}
func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
return err
}
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
type array struct {
size int
values []any
}
func (a *array) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
for i := 0; uint32(i) < n; i++ {
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, 0, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a = append(a, e)
if a.values != nil {
a.values[i] = e
}
}
return
return a, nil
}
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 {
return readGGUFV1Array(llm, r)
}
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
for i := 0; uint64(i) < n; i++ {
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
e, err = readGGUFString(llm, r)
if a.values != nil {
e, err = readGGUFString(llm, r)
} else {
err = discardGGUFString(llm, r)
}
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a = append(a, e)
if a.values != nil {
a.values[i] = e
}
}
return
return a, nil
}
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
@@ -483,6 +537,7 @@ var ggufKVOrder = map[string][]string{
"tokenizer.ggml.add_bos_token",
"tokenizer.ggml.add_eos_token",
"tokenizer.chat_template",
"bert.pooling_type",
},
}

View File

@@ -1,12 +1,13 @@
package llm
// #cgo CFLAGS: -Illama.cpp
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
// #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
// #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
// #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src
// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
// #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
// #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
// #include <stdlib.h>
// #include "llama.h"
import "C"
@@ -32,7 +33,7 @@ func Quantize(infile, outfile string, ftype fileType) error {
params.ftype = ftype.Value()
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return fmt.Errorf("llama_model_quantize: %d", rc)
return fmt.Errorf("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
}
return nil

View File

@@ -3,9 +3,10 @@ package llm
import (
"fmt"
"log/slog"
"strconv"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
return true, estimatedVRAM
@@ -30,24 +32,76 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
return false, estimatedVRAM
}
type MemoryEstimate struct {
// How many layers we predict we can load
Layers int
// The size of the graph which occupies the main GPU
Graph uint64
// How much VRAM will be allocated given the number of layers we predict
VRAMSize uint64
// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit string
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
// internal fields for logging purposes
inferenceLibrary string
layersRequested int
layersModel int
availableList []string
kv uint64
allocationsList []string
memoryWeights uint64
memoryLayerOutput uint64
graphFullOffload uint64
graphPartialOffload uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
var memoryAvailable uint64
for _, info := range gpus {
memoryAvailable += info.FreeMemory
}
if envconfig.MaxVRAM > 0 {
memoryAvailable = envconfig.MaxVRAM
}
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
// Graph size when all layers are offloaded, applies to all GPUs
var graphFullOffload uint64
// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
memoryMinimum := gpus[0].MinimumMemory
// Final graph offload once we know full or partial
var graphOffload uint64
// Projectors loaded into GPU0 only
var projectorSize uint64
// Conditional output size on GPU 0
var memoryLayerOutput uint64
// The sizes of a layer
var layerSize uint64
// The sum of all the layer sizes (just for logging)
var memoryWeights uint64
// True if all the layers are loaded
var fullyLoaded bool
// Overflow that didn't fit into the GPU
var overflow uint64
availableList := make([]string, len(gpus))
for i, gpu := range gpus {
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
}
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors {
memoryMinimum += projectorMemoryRequirements(projector)
projectorSize += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,127 +110,246 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
layers := ggml.Tensors().Layers()
// add one layer worth of memory as a buffer
if blk0, ok := layers["blk.0"]; ok {
memoryMinimum += blk0.size()
layerSize = blk0.size()
} else {
slog.Warn("model missing blk.0 layer size")
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
if graphPartialOffload == 0 {
graphPartialOffload = ggml.KV().GQA() * kv / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
}
graphFullOffload *= uint64(len(gpus))
graphPartialOffload *= uint64(len(gpus))
// on metal there's no partial offload overhead
if gpus[0].Library == "metal" {
graphPartialOffload = graphFullOffload
} else if len(gpus) > 1 {
// multigpu should always use the partial graph size
graphFullOffload = graphPartialOffload
}
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
memoryRequiredTotal := memoryMinimum + graphFullOffload
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
memoryRequiredPartial := memoryMinimum + graphPartialOffload
var memoryLayerOutput uint64
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.size()
}
if layer, ok := layers["output"]; ok {
memoryLayerOutput += layer.size()
} else if layer, ok := layers["token_embd"]; ok {
memoryLayerOutput += layer.size()
}
if gpus[0].Library == "metal" && opts.UseMMap {
// memory is preallocated for output tensors
memoryRequiredTotal += memoryLayerOutput
memoryRequiredPartial += memoryLayerOutput
// Output layer handled at the end if we have space
gpuZeroOverhead := projectorSize
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
layerCounts := make([]int, len(gpus))
gpuAllocations := make([]uint64, len(gpus))
type gs struct {
i int
g *gpu.GpuInfo
}
gpusWithSpace := []gs{}
for i := range gpus {
var gzo uint64
if len(gpusWithSpace) == 0 {
gzo = gpuZeroOverhead
}
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
continue
}
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
}
var layerCount int
var gpuZeroID int
if len(gpusWithSpace) > 0 {
gpuZeroID = gpusWithSpace[0].i
gpuAllocations[gpuZeroID] += gpuZeroOverhead
}
// For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
memoryLayer := blk.size()
layerSize = blk.size()
layerSize += kv / ggml.KV().BlockCount()
}
memoryWeights += layerSize
// KV is proportional to the number of layers
memoryLayer += kv / ggml.KV().BlockCount()
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU
continue
}
memoryRequiredTotal += memoryLayer
if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
memoryRequiredPartial += memoryLayer
// distribute the layers across the GPU(s) that have space
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[i%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > used+layerSize {
gpuAllocations[g.i] += layerSize
layerCounts[g.i]++
layerCount++
break
} else {
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
}
}
}
if gpus[0].Library != "metal" || !opts.UseMMap {
// memory was not preallocated for output tensors
memoryRequiredTotal += memoryLayerOutput
if layerCount >= int(ggml.KV().BlockCount()) {
fullyLoaded = true
} else {
for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
overflow += layerSize
}
}
if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal
// Determine if we need to consider output then find where it fits
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > used+memoryLayerOutput {
gpuAllocations[g.i] += memoryLayerOutput
layerCounts[g.i]++
layerCount++
break
}
}
if layerCount < int(ggml.KV().BlockCount())+1 {
fullyLoaded = false
overflow += memoryLayerOutput
}
}
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
// Add the applicable (full or partial) graph allocations
for i := range gpus {
if layerCounts[i] <= 0 {
continue
}
if fullyLoaded {
gpuAllocations[i] += graphFullOffload
} else {
gpuAllocations[i] += graphPartialOffload
}
}
if fullyLoaded {
graphOffload = graphFullOffload
} else {
graphOffload = graphPartialOffload
}
// Summaries for the log
var memoryRequiredPartial, memoryRequiredTotal uint64
for i := range gpuAllocations {
memoryRequiredPartial += gpuAllocations[i]
}
memoryRequiredTotal = memoryRequiredPartial + overflow
tensorSplit := ""
if len(gpus) > 1 {
splits := make([]string, len(gpus))
for i, count := range layerCounts {
splits[i] = strconv.Itoa(count)
}
tensorSplit = strings.Join(splits, ",")
}
allocationsList := []string{}
for _, a := range gpuAllocations {
allocationsList = append(allocationsList, format.HumanBytes2(a))
}
estimate := MemoryEstimate{
TotalSize: memoryRequiredTotal,
Layers: 0,
Graph: 0,
VRAMSize: 0,
GPUSizes: []uint64{},
inferenceLibrary: gpus[0].Library,
layersRequested: opts.NumGPU,
layersModel: int(ggml.KV().BlockCount()) + 1,
availableList: availableList,
kv: kv,
allocationsList: allocationsList,
memoryWeights: memoryWeights,
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
}
if gpus[0].Library == "cpu" {
return estimate
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return estimate
}
estimate.Layers = layerCount
estimate.Graph = graphOffload
estimate.VRAMSize = memoryRequiredPartial
estimate.TotalSize = memoryRequiredTotal
estimate.TensorSplit = tensorSplit
estimate.GPUSizes = gpuAllocations
return estimate
}
func (m MemoryEstimate) log() {
slog.Info(
"offload to gpu",
"offload to "+m.inferenceLibrary,
slog.Group(
"layers",
// requested number of layers to offload
"requested", opts.NumGPU,
"requested", m.layersRequested,
// The number of layers the model has (including output)
"model", m.layersModel,
// estimated number of layers that can be offloaded
"real", layerCount,
"offload", m.Layers,
// multi-gpu split for tensors
"split", m.TensorSplit,
),
slog.Group(
"memory",
// memory available for offloading
"available", format.HumanBytes2(memoryAvailable),
// memory available by GPU for offloading
"available", m.availableList,
slog.Group(
"required",
// memory required for full offloading
"full", format.HumanBytes2(memoryRequiredTotal),
"full", format.HumanBytes2(m.TotalSize),
// memory required to offload layers.estimate layers
"partial", format.HumanBytes2(memoryRequiredPartial),
"partial", format.HumanBytes2(m.VRAMSize),
// memory of KV cache
"kv", format.HumanBytes2(kv),
"kv", format.HumanBytes2(m.kv),
// Allocations across the GPUs
"allocations", m.allocationsList,
),
slog.Group(
"weights",
// memory of the weights
"total", format.HumanBytes2(memoryWeights),
"total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
// memory of non-repeating layers
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
"full", format.HumanBytes2(graphFullOffload),
"full", format.HumanBytes2(m.graphFullOffload),
// memory of graph when not fully offloaded
"partial", format.HumanBytes2(graphPartialOffload),
"partial", format.HumanBytes2(m.graphPartialOffload),
),
),
)
if gpus[0].Library == "cpu" {
return 0, 0, memoryRequiredTotal
}
if memoryRequiredPartial > memoryAvailable {
slog.Debug("insufficient VRAM to load any model layers")
return 0, 0, memoryRequiredTotal
}
return layerCount, memoryRequiredPartial, memoryRequiredTotal
}

130
llm/memory_test.go Normal file
View File

@@ -0,0 +1,130 @@
package llm
import (
"bytes"
"encoding/binary"
"fmt"
"os"
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestEstimateGPULayers(t *testing.T) {
envconfig.Debug = true
modelName := "dummy"
f, err := os.CreateTemp(t.TempDir(), modelName)
require.NoError(t, err)
defer f.Close()
gguf := NewGGUFV3(binary.LittleEndian)
inputLayerCount := 5
tensors := []Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}
assert.Len(t, tensors, inputLayerCount+1)
err = gguf.Encode(f, KV{
"general.architecture": "llama",
"general.name": "name",
"llama.context_length": uint32(32),
"llama.embedding_length": uint32(4096),
"llama.block_count": uint32(inputLayerCount),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(32),
"tokenizer.ggml.tokens": []string{" "},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, tensors)
require.NoError(t, err)
ggml, err := LoadModel(f.Name(), 0)
if err != nil {
t.Fatal(err)
}
// Simple CPU scenario
gpus := []gpu.GpuInfo{
{
Library: "cpu",
},
}
projectors := []string{}
opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) {
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
assert.Equal(t, 0, estimate.Layers)
assert.Equal(t, uint64(0), estimate.Graph)
})
// derived from the dummy ggml file above
graphPartialOffload := uint64(202377216)
graphFullOffload := uint64(171968512)
layerSize := uint64(33554436)
projectorSize := uint64(0)
memoryLayerOutput := uint64(4)
// Dual CUDA scenario with assymetry
gpuMinimumMemory := uint64(2048)
gpus = []gpu.GpuInfo{
{
Library: "cuda",
MinimumMemory: gpuMinimumMemory,
},
{
Library: "cuda",
MinimumMemory: gpuMinimumMemory,
},
}
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
for i, s := range []struct {
layer0, layer1 uint64
expect0, expect1 uint64
}{
{1, 1, 1, 1},
{2, 1, 2, 1},
{2, 2, 2, 2},
{1, 2, 1, 2},
{3, 3, 3, 3},
{4, 4, 3, 3},
{6, 6, 3, 3},
{0, 3, 0, 3},
} {
t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
gpus[0].FreeMemory = 0
gpus[1].FreeMemory = 0
gpus[0].FreeMemory += projectorSize
if s.layer0 > 0 {
gpus[0].FreeMemory += memoryLayerOutput
} else {
gpus[1].FreeMemory += memoryLayerOutput
}
gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
var layerSums uint64
for _, b := range estimate.GPUSizes {
layerSums += b
}
if estimate.Layers < inputLayerCount+1 {
assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
} else {
assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
}
})
}
}

View File

@@ -1,8 +1,8 @@
diff --git a/common/common.cpp b/common/common.cpp
index ba1ecf0e..cead57cc 100644
index 2c05a4d4..927f0e3d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
@@ -2093,6 +2093,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
index d80344f2..71e84834 100644
index 65c0ef81..ebca2c77 100644
--- a/common/common.h
+++ b/common/common.h
@@ -174,6 +174,13 @@ struct gpt_params {
// multimodal models (see examples/llava)
@@ -184,6 +184,13 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
};
void gpt_params_handle_model_default(gpt_params & params);
+
// embedding
bool embedding = false; // get only sentence embedding
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)

View File

@@ -1,17 +1,8 @@
From 544a2d2e646d39e878d87dfbb3398a356bc560ab Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 23 May 2024 11:18:45 -0700
Subject: [PATCH] throw exception on load errors
---
llama.cpp | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 15c66077..8ba90b6a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6346,7 +6346,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
diff --git a/src/llama.cpp b/src/llama.cpp
index 73f52435..58a00fb1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7241,7 +7241,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
@@ -20,7 +11,7 @@ index 15c66077..8ba90b6a 100644
}
return 0;
@@ -15600,16 +15600,23 @@ struct llama_model * llama_load_model_from_file(
@@ -17564,16 +17564,23 @@ struct llama_model * llama_load_model_from_file(
}
model->rpc_servers.push_back(servers);
}
@@ -52,6 +43,3 @@ index 15c66077..8ba90b6a 100644
}
return model;
--
2.45.1

View File

@@ -1,7 +1,7 @@
diff --git a/ggml-metal.m b/ggml-metal.m
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 0207b787..b5e9884b 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
// to the matrix-vector kernel
int ne11_mm_min = 1;

View File

@@ -1,11 +1,11 @@
diff --git a/llama.cpp b/llama.cpp
index 40d2ec2c..74f3ee9c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
// for now, only BPE models have pre-tokenizers
diff --git a/src/llama.cpp b/src/llama.cpp
index 2b9ace28..172640e2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
- if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
@@ -15,18 +15,18 @@ index 40d2ec2c..74f3ee9c 100644
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (
+ if (
tokenizer_pre == "default") {
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
tokenizer_pre == "smaug-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
tokenizer_pre == "llama3" ||
@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
tokenizer_pre == "jais") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else {
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

View File

@@ -1,7 +1,7 @@
diff --git a/llama.cpp b/llama.cpp
diff --git a/src/llama.cpp b/src/llama.cpp
index 40d2ec2c..f34eb79a 100644
--- a/llama.cpp
+++ b/llama.cpp
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);

View File

@@ -0,0 +1,45 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index 1fe2b9f7..a43312a7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
+ }
+
+ if (cparams.embeddings) {
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ embd = gf->nodes[i];
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
+ break;
+ }
}
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
- } else {
+ } else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
ggml_backend_sched_alloc_graph(lctx.sched, gf);

View File

@@ -0,0 +1,42 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 95fbe3d0..5a02a6ec 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -32,6 +33,14 @@
#include <cinttypes>
#include <limits>
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+ #define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -1055,7 +1064,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return nullptr;
}
+#ifdef _WIN32
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+ if (!wlen) {
+ return NULL;
+ }
+ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
+ if (!wlen) {
+ free(wbuf);
+ return NULL;
+ }
+ auto fin = std::ifstream(wbuf, std::ios::binary);
+ free(wbuf);
+#else
auto fin = std::ifstream(fname, std::ios::binary);
+#endif
if (!fin) {
LOG_TEE("cannot open model file for loading tensors\n");
clip_free(new_clip);

View File

@@ -0,0 +1,60 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index 721b8f4e..cfe7ac40 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8420,14 +8420,14 @@ struct llm_build_context {
}
struct ggml_tensor * build_inp_mean() {
- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
cb(lctx.inp_mean, "inp_mean", -1);
ggml_set_input(lctx.inp_mean);
return lctx.inp_mean;
}
struct ggml_tensor * build_inp_cls() {
- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
cb(lctx.inp_cls, "inp_cls", -1);
ggml_set_input(lctx.inp_cls);
return lctx.inp_cls;
@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
float * data = (float *) lctx.inp_mean->data;
- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
std::vector<uint64_t> sum(n_tokens, 0);
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
-
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
sum[seq_id] += 1;
}
- std::vector<float> div(n_tokens, 0.0f);
- for (int i = 0; i < n_tokens; ++i) {
+ std::vector<float> div(cparams.n_seq_max, 0.0f);
+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
const uint64_t s = sum[i];
if (s > 0) {
div[i] = 1.0f/float(s);
@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
const llama_pos pos = batch.pos[i];
-
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
-
if (pos == 0) {
data[seq_id] = i;
}

View File

@@ -0,0 +1,43 @@
diff --git a/include/llama.h b/include/llama.h
index bb4b05ba..a92174e0 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -92,6 +92,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
};
// note: these values should be synchronized with ggml_rope
diff --git a/src/llama.cpp b/src/llama.cpp
index 18364976..435b6fe5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5429,6 +5429,12 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "jais") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+ } else if (
+ tokenizer_pre == "tekken") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+ vocab.tokenizer_clean_spaces = false;
+ vocab.tokenizer_ignore_merges = true;
+ vocab.tokenizer_add_bos = true;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe {
" ?[^(\\s|.,!?…。,、।۔،)]+",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+ // original regex from tokenizer.json
+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ regex_exprs = {
+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {

View File

@@ -0,0 +1,19 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index 2b9ace28..e60d3d8d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6052,10 +6052,10 @@ static bool llm_load_tensors(
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

View File

@@ -38,7 +38,7 @@ func Init() error {
}
var variants []string
for v := range availableServers() {
for v := range getAvailableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
@@ -50,7 +50,7 @@ func Init() error {
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func availableServers() map[string]string {
func getAvailableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
@@ -58,7 +58,7 @@ func availableServers() map[string]string {
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*")
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(file)] = file
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
@@ -80,10 +80,10 @@ func availableServers() map[string]string {
// TODO - switch to metadata based mapping
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := availableServers()
availableServers := getAvailableServers()
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
if info.Variant != gpu.CPUCapabilityNone {
requested += "_" + info.Variant.String()
}
servers := []string{}
@@ -115,27 +115,29 @@ func serversForGpu(info gpu.GpuInfo) []string {
servers = append(servers, alt...)
}
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
servers = append(servers, cmp)
break
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
if len(servers) == 0 {
servers = []string{"cpu"}
}
}
return servers
@@ -146,11 +148,11 @@ func serverForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
variant := gpu.GetCPUVariant()
availableServers := availableServers()
if variant != "" {
variant := gpu.GetCPUCapability()
availableServers := getAvailableServers()
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
if cmp == "cpu_"+variant.String() {
return cmp
}
}

View File

@@ -33,12 +33,13 @@ type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
Embedding(ctx context.Context, prompt string) ([]float64, error)
Embed(ctx context.Context, input []string) ([][]float32, error)
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
EstimatedVRAM() uint64
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
}
// llmServer is an instance of the llama.cpp server
@@ -49,18 +50,22 @@ type llmServer struct {
status *StatusWriter
options api.Options
// TODO - this should be broken down by GPU
estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
estimatedTotal uint64 // Total size of model
totalLayers uint64
gpuCount int
loadDuration time.Duration // Record how long it took the model to load
loadProgress float32
estimate MemoryEstimate
totalLayers uint64
// gpuCount int
gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
loadDuration time.Duration // Record how long it took the model to load
loadProgress float32
sem *semaphore.Weighted
}
func LoadModel(model string) (*GGML, error) {
// LoadModel will load a model from disk. The model must be in the GGML format.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*GGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -71,52 +76,66 @@ func LoadModel(model string) (*GGML, error) {
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
ggml, _, err := DecodeGGML(f, maxArraySize)
return ggml, err
}
// NewLlamaServer will run a server for the given GPUs
// The gpu list must be a single family.
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var err error
var cpuRunner string
var estimatedVRAM uint64
var estimatedTotal uint64
var systemMemory uint64
gpuCount := len(gpus)
if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
var estimate MemoryEstimate
var systemTotalMemory uint64
var systemFreeMemory uint64
var systemSwapFreeMemory uint64
cpuRunner = serverForCpu()
gpuCount = 0
_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
systemMemInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
if gpus[0].Library == "metal" {
memInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemMemory = memInfo.TotalMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
}
}
var layers int
layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
systemTotalMemory = systemMemInfo.TotalMemory
systemFreeMemory = systemMemInfo.FreeMemory
systemSwapFreeMemory = systemMemInfo.FreeSwap
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
}
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 {
gpus = gpu.GetCPUInfo()
}
if len(gpus) == 1 && gpus[0].Library == "cpu" {
cpuRunner = serverForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else {
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
switch {
case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
opts.NumGPU = 0
case gpus[0].Library != "metal" && layers == 0:
case gpus[0].Library != "metal" && estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
cpuRunner = serverForCpu()
gpuCount = 0
case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
opts.NumGPU = layers
gpus = gpu.GetCPUInfo()
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
opts.NumGPU = estimate.Layers
}
}
// On linux, over-allocating CPU memory will almost always result in an error
if runtime.GOOS == "linux" {
systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
available := systemFreeMemory + systemSwapFreeMemory
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
}
}
estimate.log()
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
@@ -124,7 +143,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
}
availableServers := availableServers()
availableServers := getAvailableServers()
if len(availableServers) == 0 {
if runtime.GOOS != "windows" {
slog.Warn("llama server binary disappeared, reinitializing payloads")
err = Init()
if err != nil {
slog.Warn("failed to reinitialize payloads", "error", err)
return nil, err
}
availableServers = getAvailableServers()
} else {
return nil, finalErr
}
}
var servers []string
if cpuRunner != "" {
servers = []string{cpuRunner}
@@ -201,7 +233,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
opts.UseMMap = false
opts.UseMMap = new(bool)
*opts.UseMMap = false
}
}
@@ -209,7 +242,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--flash-attn")
}
if !opts.UseMMap {
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == nil) ||
(gpus[0].Library == "cpu" && opts.UseMMap == nil) ||
(opts.UseMMap != nil && !*opts.UseMMap) {
params = append(params, "--no-mmap")
}
@@ -221,17 +260,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--numa")
}
numParallel := envconfig.NumParallel
// TODO (jmorganca): multimodal models don't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
if len(projectors) > 0 {
numParallel = 1
slog.Warn("multimodal models don't support parallel requests yet")
}
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
if estimate.TensorSplit != "" {
params = append(params, "--tensor-split", estimate.TensorSplit)
}
for i := range len(servers) {
dir := availableServers[servers[i]]
if dir == "" {
@@ -242,8 +276,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
if strings.HasPrefix(servers[i], "cpu") {
// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
gpuCount = 0
gpus = gpu.GetCPUInfo()
}
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -265,8 +298,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// prepend the server directory to LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
libraryPaths := []string{dir, filepath.Dir(dir)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
@@ -299,22 +332,25 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
s := &llmServer{
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
estimatedVRAM: estimatedVRAM,
estimatedTotal: estimatedTotal,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1,
gpuCount: gpuCount,
done: make(chan error, 1),
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
estimate: estimate,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1,
gpus: gpus,
done: make(chan error, 1),
}
s.cmd.Env = os.Environ()
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
}
visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
@@ -329,6 +365,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
devicesNeeded = false
} else if len(envWorkarounds) != 0 {
for _, kv := range envWorkarounds {
if strings.EqualFold(cmp[0], kv[0]) {
s.cmd.Env[i] = kv[0] + "=" + kv[1]
}
}
}
}
if pathNeeded {
@@ -343,8 +385,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
strings.HasPrefix(ev, "GPU_") ||
strings.HasPrefix(ev, "HSA_") ||
strings.HasPrefix(ev, "GGML_") ||
strings.HasPrefix(ev, "PATH=") ||
@@ -390,7 +434,7 @@ func projectorMemoryRequirements(filename string) uint64 {
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
ggml, _, err := DecodeGGML(file, 0)
if err != nil {
return 0
}
@@ -540,6 +584,9 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
if strings.Contains(msg, "unknown model") {
return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade")
}
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
default:
}
@@ -642,7 +689,7 @@ type CompletionRequest struct {
Prompt string
Format string
Images []ImageData
Options api.Options
Options *api.Options
}
type CompletionResponse struct {
@@ -662,10 +709,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
defer s.sem.Release(1)
// only allow maximum 10 "context shifts" to avoid infinite generation
// put an upper limit on num_predict to avoid the model running on forever
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
req.Options.NumPredict = 10 * s.options.NumCtx
slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
}
request := map[string]any{
@@ -823,15 +869,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
return nil
}
type EmbeddingRequest struct {
Content string `json:"content"`
type EmbedRequest struct {
Content []string `json:"content"`
}
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
type EmbedResponse struct {
Embedding [][]float32 `json:"embedding"`
}
func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
if err := s.sem.Acquire(ctx, 1); err != nil {
slog.Error("Failed to acquire semaphore", "error", err)
return nil, err
@@ -846,7 +892,7 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
}
data, err := json.Marshal(TokenizeRequest{Content: prompt})
data, err := json.Marshal(EmbedRequest{Content: input})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
@@ -873,7 +919,7 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
return nil, fmt.Errorf("%s", body)
}
var embedding EmbeddingResponse
var embedding EmbedResponse
if err := json.Unmarshal(body, &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
@@ -1004,11 +1050,20 @@ func (s *llmServer) Close() error {
}
func (s *llmServer) EstimatedVRAM() uint64 {
return s.estimatedVRAM
return s.estimate.VRAMSize
}
func (s *llmServer) EstimatedTotal() uint64 {
return s.estimatedTotal
return s.estimate.TotalSize
}
func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
for i, gpu := range s.gpus {
if gpu.ID == gpuID {
return s.estimate.GPUSizes[i]
}
}
return 0
}
func parseDurationMs(ms float64) time.Duration {

View File

@@ -25,6 +25,7 @@ var errorPrefixes = []string{
"CUDA error",
"cudaMalloc failed",
"\"ERR\"",
"error loading model",
}
func (w *StatusWriter) Write(b []byte) (int, error) {

View File

@@ -3,15 +3,19 @@ package openai
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"log/slog"
"math/rand"
"net/http"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/types/model"
)
type Error struct {
@@ -26,8 +30,9 @@ type ErrorResponse struct {
}
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
Role string `json:"role"`
Content any `json:"content"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
}
type Choice struct {
@@ -42,6 +47,12 @@ type ChunkChoice struct {
FinishReason *string `json:"finish_reason"`
}
type CompleteChunkChoice struct {
Text string `json:"text"`
Index int `json:"index"`
FinishReason *string `json:"finish_reason"`
}
type Usage struct {
PromptTokens int `json:"prompt_tokens"`
CompletionTokens int `json:"completion_tokens"`
@@ -52,6 +63,11 @@ type ResponseFormat struct {
Type string `json:"type"`
}
type EmbedRequest struct {
Input any `json:"input"`
Model string `json:"model"`
}
type ChatCompletionRequest struct {
Model string `json:"model"`
Messages []Message `json:"messages"`
@@ -64,6 +80,7 @@ type ChatCompletionRequest struct {
PresencePenalty *float64 `json:"presence_penalty_penalty"`
TopP *float64 `json:"top_p"`
ResponseFormat *ResponseFormat `json:"response_format"`
Tools []api.Tool `json:"tools"`
}
type ChatCompletion struct {
@@ -85,6 +102,73 @@ type ChatCompletionChunk struct {
Choices []ChunkChoice `json:"choices"`
}
// TODO (https://github.com/ollama/ollama/issues/5259): support []string, []int and [][]int
type CompletionRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
FrequencyPenalty float32 `json:"frequency_penalty"`
MaxTokens *int `json:"max_tokens"`
PresencePenalty float32 `json:"presence_penalty"`
Seed *int `json:"seed"`
Stop any `json:"stop"`
Stream bool `json:"stream"`
Temperature *float32 `json:"temperature"`
TopP float32 `json:"top_p"`
Suffix string `json:"suffix"`
}
type Completion struct {
Id string `json:"id"`
Object string `json:"object"`
Created int64 `json:"created"`
Model string `json:"model"`
SystemFingerprint string `json:"system_fingerprint"`
Choices []CompleteChunkChoice `json:"choices"`
Usage Usage `json:"usage,omitempty"`
}
type CompletionChunk struct {
Id string `json:"id"`
Object string `json:"object"`
Created int64 `json:"created"`
Choices []CompleteChunkChoice `json:"choices"`
Model string `json:"model"`
SystemFingerprint string `json:"system_fingerprint"`
}
type ToolCall struct {
ID string `json:"id"`
Type string `json:"type"`
Function struct {
Name string `json:"name"`
Arguments string `json:"arguments"`
} `json:"function"`
}
type Model struct {
Id string `json:"id"`
Object string `json:"object"`
Created int64 `json:"created"`
OwnedBy string `json:"owned_by"`
}
type Embedding struct {
Object string `json:"object"`
Embedding []float32 `json:"embedding"`
Index int `json:"index"`
}
type ListCompletion struct {
Object string `json:"object"`
Data []Model `json:"data"`
}
type EmbeddingList struct {
Object string `json:"object"`
Data []Embedding `json:"data"`
Model string `json:"model"`
}
func NewError(code int, message string) ErrorResponse {
var etype string
switch code {
@@ -99,7 +183,31 @@ func NewError(code int, message string) ErrorResponse {
return ErrorResponse{Error{Type: etype, Message: message}}
}
func toolCallId() string {
const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
b := make([]byte, 8)
for i := range b {
b[i] = letterBytes[rand.Intn(len(letterBytes))]
}
return "call_" + strings.ToLower(string(b))
}
func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
toolCalls := make([]ToolCall, len(r.Message.ToolCalls))
for i, tc := range r.Message.ToolCalls {
toolCalls[i].ID = toolCallId()
toolCalls[i].Type = "function"
toolCalls[i].Function.Name = tc.Function.Name
args, err := json.Marshal(tc.Function.Arguments)
if err != nil {
slog.Error("could not marshall function arguments to json", "error", err)
continue
}
toolCalls[i].Function.Arguments = string(args)
}
return ChatCompletion{
Id: id,
Object: "chat.completion",
@@ -108,7 +216,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
SystemFingerprint: "fp_ollama",
Choices: []Choice{{
Index: 0,
Message: Message{Role: r.Message.Role, Content: r.Message.Content},
Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls},
FinishReason: func(reason string) *string {
if len(reason) > 0 {
return &reason
@@ -117,7 +225,6 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
}(r.DoneReason),
}},
Usage: Usage{
// TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count
PromptTokens: r.PromptEvalCount,
CompletionTokens: r.EvalCount,
TotalTokens: r.PromptEvalCount + r.EvalCount,
@@ -145,10 +252,169 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
}
}
func fromRequest(r ChatCompletionRequest) api.ChatRequest {
func toCompletion(id string, r api.GenerateResponse) Completion {
return Completion{
Id: id,
Object: "text_completion",
Created: r.CreatedAt.Unix(),
Model: r.Model,
SystemFingerprint: "fp_ollama",
Choices: []CompleteChunkChoice{{
Text: r.Response,
Index: 0,
FinishReason: func(reason string) *string {
if len(reason) > 0 {
return &reason
}
return nil
}(r.DoneReason),
}},
Usage: Usage{
PromptTokens: r.PromptEvalCount,
CompletionTokens: r.EvalCount,
TotalTokens: r.PromptEvalCount + r.EvalCount,
},
}
}
func toCompleteChunk(id string, r api.GenerateResponse) CompletionChunk {
return CompletionChunk{
Id: id,
Object: "text_completion",
Created: time.Now().Unix(),
Model: r.Model,
SystemFingerprint: "fp_ollama",
Choices: []CompleteChunkChoice{{
Text: r.Response,
Index: 0,
FinishReason: func(reason string) *string {
if len(reason) > 0 {
return &reason
}
return nil
}(r.DoneReason),
}},
}
}
func toListCompletion(r api.ListResponse) ListCompletion {
var data []Model
for _, m := range r.Models {
data = append(data, Model{
Id: m.Name,
Object: "model",
Created: m.ModifiedAt.Unix(),
OwnedBy: model.ParseName(m.Name).Namespace,
})
}
return ListCompletion{
Object: "list",
Data: data,
}
}
func toEmbeddingList(model string, r api.EmbedResponse) EmbeddingList {
if r.Embeddings != nil {
var data []Embedding
for i, e := range r.Embeddings {
data = append(data, Embedding{
Object: "embedding",
Embedding: e,
Index: i,
})
}
return EmbeddingList{
Object: "list",
Data: data,
Model: model,
}
}
return EmbeddingList{}
}
func toModel(r api.ShowResponse, m string) Model {
return Model{
Id: m,
Object: "model",
Created: r.ModifiedAt.Unix(),
OwnedBy: model.ParseName(m).Namespace,
}
}
func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
var messages []api.Message
for _, msg := range r.Messages {
messages = append(messages, api.Message{Role: msg.Role, Content: msg.Content})
switch content := msg.Content.(type) {
case string:
messages = append(messages, api.Message{Role: msg.Role, Content: content})
case []any:
for _, c := range content {
data, ok := c.(map[string]any)
if !ok {
return nil, fmt.Errorf("invalid message format")
}
switch data["type"] {
case "text":
text, ok := data["text"].(string)
if !ok {
return nil, fmt.Errorf("invalid message format")
}
messages = append(messages, api.Message{Role: msg.Role, Content: text})
case "image_url":
var url string
if urlMap, ok := data["image_url"].(map[string]any); ok {
if url, ok = urlMap["url"].(string); !ok {
return nil, fmt.Errorf("invalid message format")
}
} else {
if url, ok = data["image_url"].(string); !ok {
return nil, fmt.Errorf("invalid message format")
}
}
types := []string{"jpeg", "jpg", "png"}
valid := false
for _, t := range types {
prefix := "data:image/" + t + ";base64,"
if strings.HasPrefix(url, prefix) {
url = strings.TrimPrefix(url, prefix)
valid = true
break
}
}
if !valid {
return nil, fmt.Errorf("invalid image input")
}
img, err := base64.StdEncoding.DecodeString(url)
if err != nil {
return nil, fmt.Errorf("invalid message format")
}
messages = append(messages, api.Message{Role: msg.Role, Images: []api.ImageData{img}})
default:
return nil, fmt.Errorf("invalid message format")
}
}
default:
if msg.ToolCalls == nil {
return nil, fmt.Errorf("invalid message content type: %T", content)
}
toolCalls := make([]api.ToolCall, len(msg.ToolCalls))
for i, tc := range msg.ToolCalls {
toolCalls[i].Function.Name = tc.Function.Name
err := json.Unmarshal([]byte(tc.Function.Arguments), &toolCalls[i].Function.Arguments)
if err != nil {
return nil, fmt.Errorf("invalid tool call arguments")
}
}
messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls})
}
}
options := make(map[string]interface{})
@@ -156,7 +422,7 @@ func fromRequest(r ChatCompletionRequest) api.ChatRequest {
switch stop := r.Stop.(type) {
case string:
options["stop"] = []string{stop}
case []interface{}:
case []any:
var stops []string
for _, s := range stop {
if str, ok := s.(string); ok {
@@ -178,9 +444,6 @@ func fromRequest(r ChatCompletionRequest) api.ChatRequest {
if r.Seed != nil {
options["seed"] = *r.Seed
// temperature=0 is required for reproducible outputs
options["temperature"] = 0.0
}
if r.FrequencyPenalty != nil {
@@ -202,45 +465,98 @@ func fromRequest(r ChatCompletionRequest) api.ChatRequest {
format = "json"
}
return api.ChatRequest{
return &api.ChatRequest{
Model: r.Model,
Messages: messages,
Format: format,
Options: options,
Stream: &r.Stream,
}
Tools: r.Tools,
}, nil
}
type DeleteCompletion struct {
Id string `json:"id"`
Object string `json:"object"`
Deleted bool `json:"deleted"`
}
func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
options := make(map[string]any)
func toDeleteCompletion(model string) DeleteCompletion {
return DeleteCompletion{
Id: model,
Object: "model",
Deleted: true,
switch stop := r.Stop.(type) {
case string:
options["stop"] = []string{stop}
case []any:
var stops []string
for _, s := range stop {
if str, ok := s.(string); ok {
stops = append(stops, str)
} else {
return api.GenerateRequest{}, fmt.Errorf("invalid type for 'stop' field: %T", s)
}
}
options["stop"] = stops
}
if r.MaxTokens != nil {
options["num_predict"] = *r.MaxTokens
}
if r.Temperature != nil {
options["temperature"] = *r.Temperature * 2.0
} else {
options["temperature"] = 1.0
}
if r.Seed != nil {
options["seed"] = *r.Seed
}
options["frequency_penalty"] = r.FrequencyPenalty * 2.0
options["presence_penalty"] = r.PresencePenalty * 2.0
if r.TopP != 0.0 {
options["top_p"] = r.TopP
} else {
options["top_p"] = 1.0
}
return api.GenerateRequest{
Model: r.Model,
Prompt: r.Prompt,
Options: options,
Stream: &r.Stream,
Suffix: r.Suffix,
}, nil
}
type BaseWriter struct {
gin.ResponseWriter
}
type DeleteWriter struct {
type ChatWriter struct {
stream bool
id string
BaseWriter
}
type CompleteWriter struct {
stream bool
id string
BaseWriter
}
type ListWriter struct {
BaseWriter
}
type RetrieveWriter struct {
BaseWriter
model string
}
type writer struct {
stream bool
id string
gin.ResponseWriter
type EmbedWriter struct {
BaseWriter
model string
}
func (w *writer) writeError(code int, data []byte) (int, error) {
func (w *BaseWriter) writeError(code int, data []byte) (int, error) {
var serr api.StatusError
err := json.Unmarshal(data, &serr)
if err != nil {
@@ -256,7 +572,7 @@ func (w *writer) writeError(code int, data []byte) (int, error) {
return len(data), nil
}
func (w *writer) writeResponse(data []byte) (int, error) {
func (w *ChatWriter) writeResponse(data []byte) (int, error) {
var chatResponse api.ChatResponse
err := json.Unmarshal(data, &chatResponse)
if err != nil {
@@ -296,7 +612,7 @@ func (w *writer) writeResponse(data []byte) (int, error) {
return len(data), nil
}
func (w *writer) Write(data []byte) (int, error) {
func (w *ChatWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
@@ -305,7 +621,244 @@ func (w *writer) Write(data []byte) (int, error) {
return w.writeResponse(data)
}
func Middleware() gin.HandlerFunc {
func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
var generateResponse api.GenerateResponse
err := json.Unmarshal(data, &generateResponse)
if err != nil {
return 0, err
}
// completion chunk
if w.stream {
d, err := json.Marshal(toCompleteChunk(w.id, generateResponse))
if err != nil {
return 0, err
}
w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
if err != nil {
return 0, err
}
if generateResponse.Done {
_, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
if err != nil {
return 0, err
}
}
return len(data), nil
}
// completion
w.ResponseWriter.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w.ResponseWriter).Encode(toCompletion(w.id, generateResponse))
if err != nil {
return 0, err
}
return len(data), nil
}
func (w *CompleteWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
}
return w.writeResponse(data)
}
func (w *ListWriter) writeResponse(data []byte) (int, error) {
var listResponse api.ListResponse
err := json.Unmarshal(data, &listResponse)
if err != nil {
return 0, err
}
w.ResponseWriter.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w.ResponseWriter).Encode(toListCompletion(listResponse))
if err != nil {
return 0, err
}
return len(data), nil
}
func (w *ListWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
}
return w.writeResponse(data)
}
func (w *RetrieveWriter) writeResponse(data []byte) (int, error) {
var showResponse api.ShowResponse
err := json.Unmarshal(data, &showResponse)
if err != nil {
return 0, err
}
// retrieve completion
w.ResponseWriter.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w.ResponseWriter).Encode(toModel(showResponse, w.model))
if err != nil {
return 0, err
}
return len(data), nil
}
func (w *RetrieveWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
}
return w.writeResponse(data)
}
func (w *EmbedWriter) writeResponse(data []byte) (int, error) {
var embedResponse api.EmbedResponse
err := json.Unmarshal(data, &embedResponse)
if err != nil {
return 0, err
}
w.ResponseWriter.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w.ResponseWriter).Encode(toEmbeddingList(w.model, embedResponse))
if err != nil {
return 0, err
}
return len(data), nil
}
func (w *EmbedWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
}
return w.writeResponse(data)
}
func ListMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
w := &ListWriter{
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
}
c.Writer = w
c.Next()
}
}
func RetrieveMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(api.ShowRequest{Name: c.Param("model")}); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
return
}
c.Request.Body = io.NopCloser(&b)
// response writer
w := &RetrieveWriter{
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
model: c.Param("model"),
}
c.Writer = w
c.Next()
}
}
func CompletionsMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
var req CompletionRequest
err := c.ShouldBindJSON(&req)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
return
}
var b bytes.Buffer
genReq, err := fromCompleteRequest(req)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
return
}
if err := json.NewEncoder(&b).Encode(genReq); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
return
}
c.Request.Body = io.NopCloser(&b)
w := &CompleteWriter{
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
stream: req.Stream,
id: fmt.Sprintf("cmpl-%d", rand.Intn(999)),
}
c.Writer = w
c.Next()
}
}
func EmbeddingsMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
var req EmbedRequest
err := c.ShouldBindJSON(&req)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
return
}
if req.Input == "" {
req.Input = []string{""}
}
if req.Input == nil {
c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "invalid input"))
return
}
if v, ok := req.Input.([]any); ok && len(v) == 0 {
c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "invalid input"))
return
}
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
return
}
c.Request.Body = io.NopCloser(&b)
w := &EmbedWriter{
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
model: req.Model,
}
c.Writer = w
c.Next()
}
}
func ChatMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
var req ChatCompletionRequest
err := c.ShouldBindJSON(&req)
@@ -320,87 +873,28 @@ func Middleware() gin.HandlerFunc {
}
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(fromRequest(req)); err != nil {
chatReq, err := fromChatRequest(req)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
return
}
if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
return
}
c.Request.Body = io.NopCloser(&b)
w := &writer{
ResponseWriter: c.Writer,
stream: req.Stream,
id: fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
}
c.Writer = w
c.Next()
}
}
func (w *DeleteWriter) writeError(code int, data []byte) (int, error) {
var serr api.StatusError
err := json.Unmarshal(data, &serr)
if err != nil {
return 0, err
}
w.ResponseWriter.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w.ResponseWriter).Encode(NewError(http.StatusInternalServerError, serr.Error()))
if err != nil {
return 0, err
}
return len(data), nil
}
func (w *DeleteWriter) writeResponse(data []byte) (int, error) {
// delete completion
w.ResponseWriter.Header().Set("Content-Type", "application/json")
err := json.NewEncoder(w.ResponseWriter).Encode(toDeleteCompletion(w.model))
if err != nil {
return 0, err
}
return len(data), nil
}
func (w *DeleteWriter) Write(data []byte) (int, error) {
code := w.ResponseWriter.Status()
if code != http.StatusOK {
return w.writeError(code, data)
}
return w.writeResponse(data)
}
func DeleteMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(api.DeleteRequest{Model: c.Param("model")}); err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
return
}
c.Request.Body = io.NopCloser(&b)
// response writer
w := &DeleteWriter{
w := &ChatWriter{
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
model: c.Param("model"),
stream: req.Stream,
id: fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
}
c.Writer = w
c.Next()
// If the status code is OK, write the DeleteCompletion response
if c.Writer.Status() == http.StatusOK {
_, err := w.writeResponse(nil)
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
}
}
}
}

496
openai/openai_test.go Normal file
View File

@@ -0,0 +1,496 @@
package openai
import (
"bytes"
"encoding/base64"
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/assert"
)
const prefix = `data:image/jpeg;base64,`
const image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
const imageURL = prefix + image
func prepareRequest(req *http.Request, body any) {
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
}
func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {
return func(c *gin.Context) {
bodyBytes, _ := io.ReadAll(c.Request.Body)
c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes))
err := json.Unmarshal(bodyBytes, capturedRequest)
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, "failed to unmarshal request")
}
c.Next()
}
}
func TestChatMiddleware(t *testing.T) {
type testCase struct {
Name string
Setup func(t *testing.T, req *http.Request)
Expected func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder)
}
var capturedRequest *api.ChatRequest
testCases := []testCase{
{
Name: "chat handler",
Setup: func(t *testing.T, req *http.Request) {
body := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{{Role: "user", Content: "Hello"}},
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
if resp.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", resp.Code)
}
if req.Messages[0].Role != "user" {
t.Fatalf("expected 'user', got %s", req.Messages[0].Role)
}
if req.Messages[0].Content != "Hello" {
t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content)
}
},
},
{
Name: "chat handler with image content",
Setup: func(t *testing.T, req *http.Request) {
body := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{
{
Role: "user", Content: []map[string]any{
{"type": "text", "text": "Hello"},
{"type": "image_url", "image_url": map[string]string{"url": imageURL}},
},
},
},
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
if resp.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", resp.Code)
}
if req.Messages[0].Role != "user" {
t.Fatalf("expected 'user', got %s", req.Messages[0].Role)
}
if req.Messages[0].Content != "Hello" {
t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content)
}
img, _ := base64.StdEncoding.DecodeString(imageURL[len(prefix):])
if req.Messages[1].Role != "user" {
t.Fatalf("expected 'user', got %s", req.Messages[1].Role)
}
if !bytes.Equal(req.Messages[1].Images[0], img) {
t.Fatalf("expected image encoding, got %s", req.Messages[1].Images[0])
}
},
},
{
Name: "chat handler with tools",
Setup: func(t *testing.T, req *http.Request) {
body := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{
{Role: "user", Content: "What's the weather like in Paris Today?"},
{Role: "assistant", ToolCalls: []ToolCall{{
ID: "id",
Type: "function",
Function: struct {
Name string `json:"name"`
Arguments string `json:"arguments"`
}{
Name: "get_current_weather",
Arguments: "{\"location\": \"Paris, France\", \"format\": \"celsius\"}",
},
}}},
},
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
if resp.Code != 200 {
t.Fatalf("expected 200, got %d", resp.Code)
}
if req.Messages[0].Content != "What's the weather like in Paris Today?" {
t.Fatalf("expected What's the weather like in Paris Today?, got %s", req.Messages[0].Content)
}
if req.Messages[1].ToolCalls[0].Function.Arguments["location"] != "Paris, France" {
t.Fatalf("expected 'Paris, France', got %v", req.Messages[1].ToolCalls[0].Function.Arguments["location"])
}
if req.Messages[1].ToolCalls[0].Function.Arguments["format"] != "celsius" {
t.Fatalf("expected celsius, got %v", req.Messages[1].ToolCalls[0].Function.Arguments["format"])
}
},
},
{
Name: "chat handler error forwarding",
Setup: func(t *testing.T, req *http.Request) {
body := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{{Role: "user", Content: 2}},
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
if resp.Code != http.StatusBadRequest {
t.Fatalf("expected 400, got %d", resp.Code)
}
if !strings.Contains(resp.Body.String(), "invalid message content type") {
t.Fatalf("error was not forwarded")
}
},
},
}
endpoint := func(c *gin.Context) {
c.Status(http.StatusOK)
}
gin.SetMode(gin.TestMode)
router := gin.New()
router.Use(ChatMiddleware(), captureRequestMiddleware(&capturedRequest))
router.Handle(http.MethodPost, "/api/chat", endpoint)
for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
req, _ := http.NewRequest(http.MethodPost, "/api/chat", nil)
tc.Setup(t, req)
resp := httptest.NewRecorder()
router.ServeHTTP(resp, req)
tc.Expected(t, capturedRequest, resp)
capturedRequest = nil
})
}
}
func TestCompletionsMiddleware(t *testing.T) {
type testCase struct {
Name string
Setup func(t *testing.T, req *http.Request)
Expected func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder)
}
var capturedRequest *api.GenerateRequest
testCases := []testCase{
{
Name: "completions handler",
Setup: func(t *testing.T, req *http.Request) {
temp := float32(0.8)
body := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
Temperature: &temp,
Stop: []string{"\n", "stop"},
Suffix: "suffix",
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) {
if req.Prompt != "Hello" {
t.Fatalf("expected 'Hello', got %s", req.Prompt)
}
if req.Options["temperature"] != 1.6 {
t.Fatalf("expected 1.6, got %f", req.Options["temperature"])
}
stopTokens, ok := req.Options["stop"].([]any)
if !ok {
t.Fatalf("expected stop tokens to be a list")
}
if stopTokens[0] != "\n" || stopTokens[1] != "stop" {
t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens)
}
if req.Suffix != "suffix" {
t.Fatalf("expected 'suffix', got %s", req.Suffix)
}
},
},
{
Name: "completions handler error forwarding",
Setup: func(t *testing.T, req *http.Request) {
body := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
Temperature: nil,
Stop: []int{1, 2},
Suffix: "suffix",
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) {
if resp.Code != http.StatusBadRequest {
t.Fatalf("expected 400, got %d", resp.Code)
}
if !strings.Contains(resp.Body.String(), "invalid type for 'stop' field") {
t.Fatalf("error was not forwarded")
}
},
},
}
endpoint := func(c *gin.Context) {
c.Status(http.StatusOK)
}
gin.SetMode(gin.TestMode)
router := gin.New()
router.Use(CompletionsMiddleware(), captureRequestMiddleware(&capturedRequest))
router.Handle(http.MethodPost, "/api/generate", endpoint)
for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
req, _ := http.NewRequest(http.MethodPost, "/api/generate", nil)
tc.Setup(t, req)
resp := httptest.NewRecorder()
router.ServeHTTP(resp, req)
tc.Expected(t, capturedRequest, resp)
capturedRequest = nil
})
}
}
func TestEmbeddingsMiddleware(t *testing.T) {
type testCase struct {
Name string
Setup func(t *testing.T, req *http.Request)
Expected func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder)
}
var capturedRequest *api.EmbedRequest
testCases := []testCase{
{
Name: "embed handler single input",
Setup: func(t *testing.T, req *http.Request) {
body := EmbedRequest{
Input: "Hello",
Model: "test-model",
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
if req.Input != "Hello" {
t.Fatalf("expected 'Hello', got %s", req.Input)
}
if req.Model != "test-model" {
t.Fatalf("expected 'test-model', got %s", req.Model)
}
},
},
{
Name: "embed handler batch input",
Setup: func(t *testing.T, req *http.Request) {
body := EmbedRequest{
Input: []string{"Hello", "World"},
Model: "test-model",
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
input, ok := req.Input.([]any)
if !ok {
t.Fatalf("expected input to be a list")
}
if input[0].(string) != "Hello" {
t.Fatalf("expected 'Hello', got %s", input[0])
}
if input[1].(string) != "World" {
t.Fatalf("expected 'World', got %s", input[1])
}
if req.Model != "test-model" {
t.Fatalf("expected 'test-model', got %s", req.Model)
}
},
},
{
Name: "embed handler error forwarding",
Setup: func(t *testing.T, req *http.Request) {
body := EmbedRequest{
Model: "test-model",
}
prepareRequest(req, body)
},
Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
if resp.Code != http.StatusBadRequest {
t.Fatalf("expected 400, got %d", resp.Code)
}
if !strings.Contains(resp.Body.String(), "invalid input") {
t.Fatalf("error was not forwarded")
}
},
},
}
endpoint := func(c *gin.Context) {
c.Status(http.StatusOK)
}
gin.SetMode(gin.TestMode)
router := gin.New()
router.Use(EmbeddingsMiddleware(), captureRequestMiddleware(&capturedRequest))
router.Handle(http.MethodPost, "/api/embed", endpoint)
for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
req, _ := http.NewRequest(http.MethodPost, "/api/embed", nil)
tc.Setup(t, req)
resp := httptest.NewRecorder()
router.ServeHTTP(resp, req)
tc.Expected(t, capturedRequest, resp)
capturedRequest = nil
})
}
}
func TestMiddlewareResponses(t *testing.T) {
type testCase struct {
Name string
Method string
Path string
TestPath string
Handler func() gin.HandlerFunc
Endpoint func(c *gin.Context)
Setup func(t *testing.T, req *http.Request)
Expected func(t *testing.T, resp *httptest.ResponseRecorder)
}
testCases := []testCase{
{
Name: "list handler",
Method: http.MethodGet,
Path: "/api/tags",
TestPath: "/api/tags",
Handler: ListMiddleware,
Endpoint: func(c *gin.Context) {
c.JSON(http.StatusOK, api.ListResponse{
Models: []api.ListModelResponse{
{
Name: "Test Model",
},
},
})
},
Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
var listResp ListCompletion
if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil {
t.Fatal(err)
}
if listResp.Object != "list" {
t.Fatalf("expected list, got %s", listResp.Object)
}
if len(listResp.Data) != 1 {
t.Fatalf("expected 1, got %d", len(listResp.Data))
}
if listResp.Data[0].Id != "Test Model" {
t.Fatalf("expected Test Model, got %s", listResp.Data[0].Id)
}
},
},
{
Name: "retrieve model",
Method: http.MethodGet,
Path: "/api/show/:model",
TestPath: "/api/show/test-model",
Handler: RetrieveMiddleware,
Endpoint: func(c *gin.Context) {
c.JSON(http.StatusOK, api.ShowResponse{
ModifiedAt: time.Date(2024, 6, 17, 13, 45, 0, 0, time.UTC),
})
},
Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
var retrieveResp Model
if err := json.NewDecoder(resp.Body).Decode(&retrieveResp); err != nil {
t.Fatal(err)
}
if retrieveResp.Object != "model" {
t.Fatalf("Expected object to be model, got %s", retrieveResp.Object)
}
if retrieveResp.Id != "test-model" {
t.Fatalf("Expected id to be test-model, got %s", retrieveResp.Id)
}
},
},
}
gin.SetMode(gin.TestMode)
router := gin.New()
for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
router = gin.New()
router.Use(tc.Handler())
router.Handle(tc.Method, tc.Path, tc.Endpoint)
req, _ := http.NewRequest(tc.Method, tc.TestPath, nil)
if tc.Setup != nil {
tc.Setup(t, req)
}
resp := httptest.NewRecorder()
router.ServeHTTP(resp, req)
assert.Equal(t, http.StatusOK, resp.Code)
tc.Expected(t, resp)
})
}
}

View File

@@ -124,7 +124,7 @@ func ParseFile(r io.Reader) (*File, error) {
case stateComment, stateNil:
// pass
case stateValue:
s, ok := unquote(b.String())
s, ok := unquote(strings.TrimSpace(b.String()))
if !ok || isSpace(r) {
if _, err := b.WriteRune(r); err != nil {
return nil, err
@@ -158,7 +158,7 @@ func ParseFile(r io.Reader) (*File, error) {
case stateComment, stateNil:
// pass; nothing to flush
case stateValue:
s, ok := unquote(b.String())
s, ok := unquote(strings.TrimSpace(b.String()))
if !ok {
return nil, io.ErrUnexpectedEOF
}

View File

@@ -22,7 +22,13 @@ ADAPTER adapter1
LICENSE MIT
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE template1
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
{{ .Response }}<|eot_id|>"""
`
reader := strings.NewReader(input)
@@ -36,7 +42,40 @@ TEMPLATE template1
{Name: "license", Args: "MIT"},
{Name: "param1", Args: "value1"},
{Name: "param2", Args: "value2"},
{Name: "template", Args: "template1"},
{Name: "template", Args: "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>"},
}
assert.Equal(t, expectedCommands, modelfile.Commands)
}
func TestParseFileTrimSpace(t *testing.T) {
input := `
FROM " model 1"
ADAPTER adapter3
LICENSE "MIT "
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE """ {{ if .System }}<|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
{{ .Response }}<|eot_id|> """
`
reader := strings.NewReader(input)
modelfile, err := ParseFile(reader)
require.NoError(t, err)
expectedCommands := []Command{
{Name: "model", Args: " model 1"},
{Name: "adapter", Args: "adapter3"},
{Name: "license", Args: "MIT "},
{Name: "param1", Args: "value1"},
{Name: "param2", Args: "value2"},
{Name: "template", Args: " {{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|> "},
}
assert.Equal(t, expectedCommands, modelfile.Commands)
@@ -48,6 +87,26 @@ func TestParseFileFrom(t *testing.T) {
expected []Command
err error
}{
{
"FROM \"FOO BAR \"",
[]Command{{Name: "model", Args: "FOO BAR "}},
nil,
},
{
"FROM \"FOO BAR\"\nPARAMETER param1 value1",
[]Command{{Name: "model", Args: "FOO BAR"}, {Name: "param1", Args: "value1"}},
nil,
},
{
"FROM FOOO BAR ",
[]Command{{Name: "model", Args: "FOOO BAR"}},
nil,
},
{
"FROM /what/is/the path ",
[]Command{{Name: "model", Args: "/what/is/the path"}},
nil,
},
{
"FROM foo",
[]Command{{Name: "model", Args: "foo"}},
@@ -86,6 +145,11 @@ func TestParseFileFrom(t *testing.T) {
[]Command{{Name: "param1", Args: "value1"}, {Name: "model", Args: "foo"}},
nil,
},
{
"PARAMETER what the \nFROM lemons make lemonade ",
[]Command{{Name: "what", Args: "the"}, {Name: "model", Args: "lemons make lemonade"}},
nil,
},
}
for _, c := range cases {
@@ -399,7 +463,7 @@ func TestParseFileParameters(t *testing.T) {
"mirostat_eta 1.0": {"mirostat_eta", "1.0"},
"penalize_newline true": {"penalize_newline", "true"},
"stop ### User:": {"stop", "### User:"},
"stop ### User: ": {"stop", "### User: "},
"stop ### User: ": {"stop", "### User:"},
"stop \"### User:\"": {"stop", "### User:"},
"stop \"### User: \"": {"stop", "### User: "},
"stop \"\"\"### User:\"\"\"": {"stop", "### User:"},

View File

@@ -103,19 +103,22 @@ function buildApp() {
function gatherDependencies() {
write-host "Gathering runtime dependencies"
cd "${script:SRC_DIR}"
md "${script:DEPS_DIR}" -ea 0 > $null
md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
}
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
write-host "about to sign"
foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file

View File

@@ -159,8 +159,8 @@ check_gpu() {
esac ;;
lshw)
case $2 in
nvidia) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
amdgpu) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[1002\]' || return 1 ;;
nvidia) available lshw && $SUDO lshw -c display -numeric -disable network | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
amdgpu) available lshw && $SUDO lshw -c display -numeric -disable network | grep -q 'vendor: .* \[1002\]' || return 1 ;;
esac ;;
nvidia-smi) available nvidia-smi || return 1 ;;
esac
@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
amzn) install_cuda_driver_yum 'fedora' '37' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;

View File

@@ -6,10 +6,21 @@ set -ex
MACHINE=$(uname -m)
if grep -i "centos" /etc/system-release >/dev/null; then
# As of 7/1/2024 mirrorlist.centos.org has been taken offline, so adjust accordingly
sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
# Centos 7 derivatives have too old of a git version to run our generate script
# uninstall and ignore failures
yum remove -y git
yum -y install epel-release centos-release-scl
# The release packages reinstate the mirrors, undo that again
sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
yum -y install dnf
if [ "${MACHINE}" = "x86_64" ]; then
yum -y install https://repo.ius.io/ius-release-el7.rpm

View File

@@ -28,11 +28,27 @@ import (
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
var (
errCapabilities = errors.New("does not support")
errCapabilityCompletion = errors.New("completion")
errCapabilityTools = errors.New("tools")
errCapabilityInsert = errors.New("insert")
)
type Capability string
const (
CapabilityCompletion = Capability("completion")
CapabilityTools = Capability("tools")
CapabilityInsert = Capability("insert")
)
type registryOptions struct {
Insecure bool
Username string
@@ -48,16 +64,59 @@ type Model struct {
ParentModel string
AdapterPaths []string
ProjectorPaths []string
Template string
System string
License []string
Digest string
Options map[string]interface{}
Messages []Message
Template *template.Template
}
func (m *Model) IsEmbedding() bool {
return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
// CheckCapabilities checks if the model has the specified capabilities returning an error describing
// any missing or unknown capabilities
func (m *Model) CheckCapabilities(caps ...Capability) error {
var errs []error
for _, cap := range caps {
switch cap {
case CapabilityCompletion:
f, err := os.Open(m.ModelPath)
if err != nil {
slog.Error("couldn't open model file", "error", err)
continue
}
defer f.Close()
// TODO(mxyng): decode the GGML into model to avoid doing this multiple times
ggml, _, err := llm.DecodeGGML(f, 0)
if err != nil {
slog.Error("couldn't decode ggml", "error", err)
continue
}
if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok {
errs = append(errs, errCapabilityCompletion)
}
case CapabilityTools:
if !slices.Contains(m.Template.Vars(), "tools") {
errs = append(errs, errCapabilityTools)
}
case CapabilityInsert:
vars := m.Template.Vars()
if !slices.Contains(vars, "suffix") {
errs = append(errs, errCapabilityInsert)
}
default:
slog.Error("unknown capability", "capability", cap)
return fmt.Errorf("unknown capability: %s", cap)
}
}
if err := errors.Join(errs...); err != nil {
return fmt.Errorf("%w %w", errCapabilities, errors.Join(errs...))
}
return nil
}
func (m *Model) String() string {
@@ -82,10 +141,10 @@ func (m *Model) String() string {
})
}
if m.Template != "" {
if m.Template != nil {
modelfile.Commands = append(modelfile.Commands, parser.Command{
Name: "template",
Args: m.Template,
Args: m.Template.String(),
})
}
@@ -135,13 +194,6 @@ type Message struct {
Content string `json:"content"`
}
type ManifestV2 struct {
SchemaVersion int `json:"schemaVersion"`
MediaType string `json:"mediaType"`
Config *Layer `json:"config"`
Layers []*Layer `json:"layers"`
}
type ConfigV2 struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
@@ -160,7 +212,7 @@ type RootFS struct {
DiffIDs []string `json:"diff_ids"`
}
func GetManifest(mp ModelPath) (*ManifestV2, string, error) {
func GetManifest(mp ModelPath) (*Manifest, string, error) {
fp, err := mp.GetManifestPath()
if err != nil {
return nil, "", err
@@ -170,7 +222,7 @@ func GetManifest(mp ModelPath) (*ManifestV2, string, error) {
return nil, "", err
}
var manifest *ManifestV2
var manifest *Manifest
bts, err := os.ReadFile(fp)
if err != nil {
@@ -198,8 +250,7 @@ func GetModel(name string) (*Model, error) {
Name: mp.GetFullTagname(),
ShortName: mp.GetShortTagname(),
Digest: digest,
Template: "{{ .Prompt }}",
License: []string{},
Template: template.DefaultTemplate,
}
filename, err := GetBlobsPath(manifest.Config.Digest)
@@ -235,13 +286,17 @@ func GetModel(name string) (*Model, error) {
model.AdapterPaths = append(model.AdapterPaths, filename)
case "application/vnd.ollama.image.projector":
model.ProjectorPaths = append(model.ProjectorPaths, filename)
case "application/vnd.ollama.image.template":
case "application/vnd.ollama.image.prompt",
"application/vnd.ollama.image.template":
bts, err := os.ReadFile(filename)
if err != nil {
return nil, err
}
model.Template = string(bts)
model.Template, err = template.Parse(string(bts))
if err != nil {
return nil, err
}
case "application/vnd.ollama.image.system":
bts, err := os.ReadFile(filename)
if err != nil {
@@ -249,13 +304,6 @@ func GetModel(name string) (*Model, error) {
}
model.System = string(bts)
case "application/vnd.ollama.image.prompt":
bts, err := os.ReadFile(filename)
if err != nil {
return nil, err
}
model.Template = string(bts)
case "application/vnd.ollama.image.params":
params, err := os.Open(filename)
if err != nil {
@@ -414,17 +462,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}
layers, err := parseFromFile(ctx, temp, "", fn)
layer, err := NewLayer(temp, baseLayer.MediaType)
if err != nil {
return err
}
if len(layers) != 1 {
return errors.New("quantization failed")
if _, err := temp.Seek(0, io.SeekStart); err != nil {
return err
}
baseLayer.Layer = layers[0].Layer
baseLayer.GGML = layers[0].GGML
ggml, _, err := llm.DecodeGGML(temp, 0)
if err != nil {
return err
}
baseLayer.Layer = layer
baseLayer.GGML = ggml
}
}
@@ -439,6 +492,12 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
layers = append(layers, baseLayer.Layer)
}
case "license", "template", "system":
if c.Name == "template" {
if _, err := template.Parse(c.Args); err != nil {
return fmt.Errorf("%w: %s", errBadTemplate, err)
}
}
if c.Name != "license" {
// replace
layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
@@ -817,7 +876,7 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name)
var manifest *ManifestV2
var manifest *Manifest
var err error
var noprune string
@@ -924,7 +983,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
return nil
}
func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptions) (*ManifestV2, error) {
func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptions) (*Manifest, error) {
requestURL := mp.BaseURL().JoinPath("v2", mp.GetNamespaceRepository(), "manifests", mp.Tag)
headers := make(http.Header)
@@ -935,7 +994,7 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
}
defer resp.Body.Close()
var m *ManifestV2
var m *Manifest
if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
return nil, err
}

View File

@@ -14,7 +14,10 @@ import (
)
type Manifest struct {
ManifestV2
SchemaVersion int `json:"schemaVersion"`
MediaType string `json:"mediaType"`
Config *Layer `json:"config"`
Layers []*Layer `json:"layers"`
filepath string
fi os.FileInfo
@@ -66,7 +69,7 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) {
p := filepath.Join(manifests, n.Filepath())
var m ManifestV2
var m Manifest
f, err := os.Open(p)
if err != nil {
return nil, err
@@ -83,12 +86,11 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) {
return nil, err
}
return &Manifest{
ManifestV2: m,
filepath: p,
fi: fi,
digest: fmt.Sprintf("%x", sha256sum.Sum(nil)),
}, nil
m.filepath = p
m.fi = fi
m.digest = fmt.Sprintf("%x", sha256sum.Sum(nil))
return &m, nil
}
func WriteManifest(name model.Name, config *Layer, layers []*Layer) error {
@@ -108,7 +110,7 @@ func WriteManifest(name model.Name, config *Layer, layers []*Layer) error {
}
defer f.Close()
m := ManifestV2{
m := Manifest{
SchemaVersion: 2,
MediaType: "application/vnd.docker.distribution.manifest.v2+json",
Config: config,

View File

@@ -25,7 +25,7 @@ func createManifest(t *testing.T, path, name string) {
}
defer f.Close()
if err := json.NewEncoder(f).Encode(ManifestV2{}); err != nil {
if err := json.NewEncoder(f).Encode(Manifest{}); err != nil {
t.Fatal(err)
}
}

View File

@@ -4,6 +4,7 @@ import (
"archive/zip"
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
@@ -11,11 +12,14 @@ import (
"net/http"
"os"
"path/filepath"
"slices"
"strings"
"text/template/parse"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/templates"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/model"
)
@@ -63,7 +67,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
}
defer blob.Close()
ggml, _, err := llm.DecodeGGML(blob)
ggml, _, err := llm.DecodeGGML(blob, 0)
if err != nil {
return nil, err
}
@@ -77,62 +81,79 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
return layers, nil
}
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
stat, err := file.Stat()
if err != nil {
return nil, err
return err
}
r, err := zip.NewReader(file, stat.Size())
if err != nil {
return nil, err
return err
}
tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempdir)
fn(api.ProgressResponse{Status: "unpacking model metadata"})
for _, f := range r.File {
if !filepath.IsLocal(f.Name) {
return fmt.Errorf("%w: %s", zip.ErrInsecurePath, f.Name)
}
n := filepath.Join(p, f.Name)
if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
return err
}
// TODO(mxyng): this should not write out all files to disk
outfile, err := os.Create(filepath.Join(tempdir, f.Name))
outfile, err := os.Create(n)
if err != nil {
return nil, err
return err
}
defer outfile.Close()
infile, err := f.Open()
if err != nil {
return nil, err
return err
}
defer infile.Close()
if _, err = io.Copy(outfile, infile); err != nil {
return nil, err
return err
}
if err := outfile.Close(); err != nil {
return nil, err
return err
}
if err := infile.Close(); err != nil {
return nil, err
return err
}
}
mf, err := convert.GetModelFormat(tempdir)
return nil
}
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempDir)
if err := extractFromZipFile(tempDir, file, fn); err != nil {
return nil, err
}
mf, err := convert.GetModelFormat(tempDir)
if err != nil {
return nil, err
}
params, err := mf.GetParams(tempdir)
params, err := mf.GetParams(tempDir)
if err != nil {
return nil, err
}
mArch, err := mf.GetModelArch("", tempdir, params)
mArch, err := mf.GetModelArch("", tempDir, params)
if err != nil {
return nil, err
}
@@ -150,7 +171,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
// TODO(mxyng): this should write directly into a layer
// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
temp, err := os.CreateTemp(tempdir, "fp16")
temp, err := os.CreateTemp(tempDir, "fp16")
if err != nil {
return nil, err
}
@@ -176,7 +197,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
}
defer bin.Close()
ggml, _, err := llm.DecodeGGML(bin)
ggml, _, err := llm.DecodeGGML(bin, 0)
if err != nil {
return nil, err
}
@@ -210,7 +231,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
var offset int64
for offset < stat.Size() {
ggml, n, err := llm.DecodeGGML(file)
ggml, n, err := llm.DecodeGGML(file, 0)
if errors.Is(err, io.EOF) {
break
} else if err != nil {
@@ -239,7 +260,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
func detectChatTemplate(layers []*layerGGML) ([]*layerGGML, error) {
for _, layer := range layers {
if s := layer.GGML.KV().ChatTemplate(); s != "" {
if t, err := templates.NamedTemplate(s); err != nil {
if t, err := template.Named(s); err != nil {
slog.Debug("template detection", "error", err)
} else {
tmpl, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
@@ -272,3 +293,92 @@ func detectContentType(r io.Reader) (string, error) {
return "unknown", nil
}
// parseToolCalls attempts to parse a JSON string into a slice of ToolCalls.
// mxyng: this only really works if the input contains tool calls in some JSON format
func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
// create a subtree from the node that ranges over .ToolCalls
tmpl := m.Template.Subtree(func(n parse.Node) bool {
if t, ok := n.(*parse.RangeNode); ok {
return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls")
}
return false
})
if tmpl == nil {
return nil, false
}
var b bytes.Buffer
if err := tmpl.Execute(&b, map[string][]api.ToolCall{
"ToolCalls": {
{
Function: api.ToolCallFunction{
Name: "@@name@@",
Arguments: api.ToolCallFunctionArguments{
"@@argument@@": 1,
},
},
},
},
}); err != nil {
return nil, false
}
var kv map[string]any
// execute the subtree with placeholders to identify the keys
// trim any commands that might exist in the template
if err := json.Unmarshal(bytes.TrimSuffix(b.Bytes(), []byte(",")), &kv); err != nil {
return nil, false
}
// find the keys that correspond to the name and arguments fields
var name, arguments string
for k, v := range kv {
switch v.(type) {
case string:
name = k
case map[string]any:
arguments = k
}
}
var objs []map[string]any
for offset := 0; offset < len(s); {
var obj map[string]any
decoder := json.NewDecoder(strings.NewReader(s[offset:]))
if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
break
} else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
// skip over any syntax errors
offset += int(syntax.Offset)
} else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) {
// skip over any unmarshalable types
offset += int(unmarshalType.Offset)
} else if err != nil {
slog.Error("parseToolCalls", "error", err)
return nil, false
} else {
offset += int(decoder.InputOffset())
objs = append(objs, obj)
}
}
var toolCalls []api.ToolCall
for _, kv := range objs {
var call api.ToolCall
for k, v := range kv {
switch k {
case name:
call.Function.Name = v.(string)
case arguments:
call.Function.Arguments = v.(map[string]any)
}
}
toolCalls = append(toolCalls, call)
}
return toolCalls, len(toolCalls) > 0
}

235
server/model_test.go Normal file
View File

@@ -0,0 +1,235 @@
package server
import (
"archive/zip"
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"slices"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/template"
)
func createZipFile(t *testing.T, name string) *os.File {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "")
if err != nil {
t.Fatal(err)
}
zf := zip.NewWriter(f)
defer zf.Close()
zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
if err != nil {
t.Fatal(err)
}
if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
t.Fatal(err)
}
return f
}
func TestExtractFromZipFile(t *testing.T) {
cases := []struct {
name string
expect []string
err error
}{
{
name: "good",
expect: []string{"good"},
},
{
name: strings.Join([]string{"path", "..", "to", "good"}, string(os.PathSeparator)),
expect: []string{filepath.Join("to", "good")},
},
{
name: strings.Join([]string{"path", "..", "to", "..", "good"}, string(os.PathSeparator)),
expect: []string{"good"},
},
{
name: strings.Join([]string{"path", "to", "..", "..", "good"}, string(os.PathSeparator)),
expect: []string{"good"},
},
{
name: strings.Join([]string{"..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"}, string(os.PathSeparator)),
err: zip.ErrInsecurePath,
},
{
name: strings.Join([]string{"path", "..", "..", "to", "bad"}, string(os.PathSeparator)),
err: zip.ErrInsecurePath,
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
f := createZipFile(t, tt.name)
defer f.Close()
tempDir := t.TempDir()
if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); !errors.Is(err, tt.err) {
t.Fatal(err)
}
var matches []string
if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
if err != nil {
return err
}
if !fi.IsDir() {
matches = append(matches, p)
}
return nil
}); err != nil {
t.Fatal(err)
}
var actual []string
for _, match := range matches {
rel, err := filepath.Rel(tempDir, match)
if err != nil {
t.Error(err)
}
actual = append(actual, rel)
}
if !slices.Equal(actual, tt.expect) {
t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches))
}
})
}
}
func readFile(t *testing.T, base, name string) *bytes.Buffer {
t.Helper()
bts, err := os.ReadFile(filepath.Join(base, name))
if err != nil {
t.Fatal(err)
}
return bytes.NewBuffer(bts)
}
func TestExecuteWithTools(t *testing.T) {
p := filepath.Join("testdata", "tools")
cases := []struct {
model string
output string
ok bool
}{
{"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
{"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]
The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true},
{"mistral", `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function:
[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
{"mistral", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
{"command-r-plus", "Action: ```json" + `
[
{
"tool_name": "get_current_weather",
"parameters": {
"format": "fahrenheit",
"location": "San Francisco, CA"
}
},
{
"tool_name": "get_current_weather",
"parameters": {
"format": "celsius",
"location": "Toronto, Canada"
}
}
]
` + "```", true},
{"command-r-plus", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
{"firefunction", ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
{"firefunction", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
{"llama3-groq-tool-use", `<tool_call>
{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
</tool_call>`, true},
}
var tools []api.Tool
if err := json.Unmarshal(readFile(t, p, "tools.json").Bytes(), &tools); err != nil {
t.Fatal(err)
}
var messages []api.Message
if err := json.Unmarshal(readFile(t, p, "messages.json").Bytes(), &messages); err != nil {
t.Fatal(err)
}
calls := []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"format": "fahrenheit",
"location": "San Francisco, CA",
},
},
},
{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"format": "celsius",
"location": "Toronto, Canada",
},
},
},
}
for _, tt := range cases {
t.Run(tt.model, func(t *testing.T) {
tmpl, err := template.Parse(readFile(t, p, fmt.Sprintf("%s.gotmpl", tt.model)).String())
if err != nil {
t.Fatal(err)
}
t.Run("template", func(t *testing.T) {
var actual bytes.Buffer
if err := tmpl.Execute(&actual, template.Values{Tools: tools, Messages: messages}); err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(actual.String(), readFile(t, p, fmt.Sprintf("%s.out", tt.model)).String()); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("parse", func(t *testing.T) {
m := &Model{Template: tmpl}
actual, ok := m.parseToolCalls(tt.output)
if ok != tt.ok {
t.Fatalf("expected %t, got %t", tt.ok, ok)
}
if tt.ok {
if diff := cmp.Diff(actual, calls); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
})
})
}
}

View File

@@ -103,18 +103,9 @@ func (mp ModelPath) GetShortTagname() string {
return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
}
// modelsDir returns the value of the OLLAMA_MODELS environment variable or the user's home directory if OLLAMA_MODELS is not set.
// The models directory is where Ollama stores its model files and manifests.
func modelsDir() (string, error) {
return envconfig.ModelsDir, nil
}
// GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
func (mp ModelPath) GetManifestPath() (string, error) {
dir, err := modelsDir()
if err != nil {
return "", err
}
dir := envconfig.ModelsDir
return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
}
@@ -127,10 +118,7 @@ func (mp ModelPath) BaseURL() *url.URL {
}
func GetManifestPath() (string, error) {
dir, err := modelsDir()
if err != nil {
return "", err
}
dir := envconfig.ModelsDir
path := filepath.Join(dir, "manifests")
if err := os.MkdirAll(path, 0o755); err != nil {
@@ -141,10 +129,7 @@ func GetManifestPath() (string, error) {
}
func GetBlobsPath(digest string) (string, error) {
dir, err := modelsDir()
if err != nil {
return "", err
}
dir := envconfig.ModelsDir
// only accept actual sha256 digests
pattern := "^sha256[:-][0-9a-fA-F]{64}$"

View File

@@ -1,221 +1,74 @@
package server
import (
"fmt"
"bytes"
"context"
"log/slog"
"strings"
"text/template"
"text/template/parse"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/template"
)
// isResponseNode checks if the node contains .Response
func isResponseNode(node *parse.ActionNode) bool {
for _, cmd := range node.Pipe.Cmds {
for _, arg := range cmd.Args {
if fieldNode, ok := arg.(*parse.FieldNode); ok && len(fieldNode.Ident) > 0 {
if fieldNode.Ident[0] == "Response" {
return true
}
type tokenizeFunc func(context.Context, string) ([]int, error)
// chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
// chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
// latest message and 2) system messages
func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
var system []api.Message
// always include the last message
n := len(msgs) - 1
// in reverse, find all messages that fit into context window
for i := n - 1; i >= 0; i-- {
system = make([]api.Message, 0)
for j := range i {
if msgs[j].Role == "system" {
system = append(system, msgs[j])
}
}
}
return false
}
// formatTemplateForResponse formats the template AST to:
// 1. remove all nodes after the first .Response (if generate=true)
// 2. add a .Response node to the end if it doesn't exist
// TODO(jmorganca): this should recursively cut the template before the first .Response
func formatTemplateForResponse(tmpl *template.Template, generate bool) {
var found bool
for i, node := range tmpl.Tree.Root.Nodes {
if actionNode, ok := node.(*parse.ActionNode); ok {
if isResponseNode(actionNode) {
found = true
if generate {
tmpl.Tree.Root.Nodes = tmpl.Tree.Root.Nodes[:i+1]
break
}
}
var b bytes.Buffer
if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools}); err != nil {
return "", nil, err
}
}
if !found {
// add the response node if it doesn't exist
responseFieldNode := &parse.FieldNode{NodeType: parse.NodeField, Ident: []string{"Response"}}
responsePipeNode := &parse.PipeNode{NodeType: parse.NodePipe, Cmds: []*parse.CommandNode{{NodeType: parse.NodeCommand, Args: []parse.Node{responseFieldNode}}}}
responseActionNode := &parse.ActionNode{NodeType: parse.NodeAction, Pipe: responsePipeNode}
tmpl.Tree.Root.Nodes = append(tmpl.Tree.Root.Nodes, responseActionNode)
}
}
// Prompt renders a prompt from a template. If generate is set to true,
// the response and parts of the template following it are not rendered
func Prompt(tmpl, system, prompt, response string, generate bool) (string, error) {
parsed, err := template.New("").Option("missingkey=zero").Parse(tmpl)
if err != nil {
return "", err
}
formatTemplateForResponse(parsed, generate)
vars := map[string]any{
"System": system,
"Prompt": prompt,
"Response": response,
}
var sb strings.Builder
if err := parsed.Execute(&sb, vars); err != nil {
return "", err
}
return sb.String(), nil
}
func countTokens(tmpl string, system string, prompt string, response string, encode func(string) ([]int, error)) (int, error) {
rendered, err := Prompt(tmpl, system, prompt, response, false)
if err != nil {
return 0, err
}
tokens, err := encode(rendered)
if err != nil {
slog.Error("failed to encode prompt", "err", err)
return 0, err
}
return len(tokens), err
}
// ChatPrompt builds up a prompt from a series of messages, truncating based on context window size
func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(string) ([]int, error)) (string, error) {
type prompt struct {
System string
Prompt string
Response string
images []int
tokens int
}
var p prompt
// iterate through messages to build up {system,user,response} prompts
var imgId int
var prompts []prompt
for _, msg := range messages {
switch strings.ToLower(msg.Role) {
case "system":
if p.System != "" || p.Prompt != "" || p.Response != "" {
prompts = append(prompts, p)
p = prompt{}
}
p.System = msg.Content
case "user":
if p.Prompt != "" || p.Response != "" {
prompts = append(prompts, p)
p = prompt{}
}
var sb strings.Builder
for range msg.Images {
fmt.Fprintf(&sb, "[img-%d] ", imgId)
p.images = append(p.images, imgId)
imgId += 1
}
sb.WriteString(msg.Content)
p.Prompt = sb.String()
case "assistant":
if p.Response != "" {
prompts = append(prompts, p)
p = prompt{}
}
p.Response = msg.Content
default:
return "", fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
}
}
// add final prompt
if p.System != "" || p.Prompt != "" || p.Response != "" {
prompts = append(prompts, p)
}
// calculate token lengths for each prompt, estimating 768 tokens per images
for i, p := range prompts {
tokens, err := countTokens(tmpl, p.System, p.Prompt, p.Response, encode)
s, err := tokenize(ctx, b.String())
if err != nil {
return "", err
return "", nil, err
}
prompts[i].tokens = tokens + len(prompts[i].images)*768
}
// truncate images and prompts starting from the beginning of the list
// until either one prompt remains or the total tokens fits the context window
// TODO (jmorganca): this doesn't account for the context window room required for the response
for {
var required int
for _, p := range prompts {
required += p.tokens
c := len(s)
if m.ProjectorPaths != nil {
for _, m := range msgs[i:] {
// images are represented as 768 sized embeddings
// TODO: get embedding length from project metadata
c += 768 * len(m.Images)
}
}
required += 1 // for bos token
if required <= window {
slog.Debug("prompt now fits in context window", "required", required, "window", window)
if c > opts.NumCtx {
slog.Debug("truncating input messages which exceed context length", "truncated", len(msgs[i:]))
break
} else {
n = i
}
prompt := &prompts[0]
if len(prompt.images) > 1 {
img := prompt.images[0]
slog.Debug("prompt longer than context window, removing image", "id", img, "required", required, "window", window)
prompt.images = prompt.images[1:]
prompt.Prompt = strings.Replace(prompt.Prompt, fmt.Sprintf(" [img-%d]", img), "", 1)
prompt.tokens -= 768
continue
}
if len(prompts) > 1 {
slog.Debug("required tokens longer than context window, removing first prompt", "prompt", prompts[0].tokens, "required", required, "window", window)
system := prompt.System
prompts = prompts[1:]
if system != "" && prompts[0].System == "" {
prompts[0].System = system
tokens, err := countTokens(tmpl, prompts[0].System, prompts[0].Prompt, prompts[0].Response, encode)
if err != nil {
return "", err
}
prompts[0].tokens = tokens + len(prompts[0].images)*768
}
continue
}
// stop truncating if there's only one prompt left
break
}
var sb strings.Builder
for i, p := range prompts {
// last prompt should leave the response unrendered (for completion)
rendered, err := Prompt(tmpl, p.System, p.Prompt, p.Response, i == len(prompts)-1)
if err != nil {
return "", err
}
sb.WriteString(rendered)
// truncate any messages that do not fit into the context window
var b bytes.Buffer
if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[n:]...), Tools: tools}); err != nil {
return "", nil, err
}
return sb.String(), nil
for _, m := range msgs[n:] {
for _, i := range m.Images {
images = append(images, llm.ImageData{
ID: len(images),
Data: i,
})
}
}
return b.String(), images, nil
}

View File

@@ -1,204 +1,209 @@
package server
import (
"strings"
"bytes"
"context"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/template"
)
func TestPrompt(t *testing.T) {
tests := []struct {
name string
template string
system string
prompt string
response string
generate bool
want string
}{
{
name: "simple prompt",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
system: "You are a Wizard.",
prompt: "What are the potion ingredients?",
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
},
{
name: "implicit response",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
system: "You are a Wizard.",
prompt: "What are the potion ingredients?",
response: "I don't know.",
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]I don't know.",
},
{
name: "response",
template: "[INST] {{ .System }} {{ .Prompt }} [/INST] {{ .Response }}",
system: "You are a Wizard.",
prompt: "What are the potion ingredients?",
response: "I don't know.",
want: "[INST] You are a Wizard. What are the potion ingredients? [/INST] I don't know.",
},
{
name: "cut",
template: "<system>{{ .System }}</system><user>{{ .Prompt }}</user><assistant>{{ .Response }}</assistant>",
system: "You are a Wizard.",
prompt: "What are the potion ingredients?",
response: "I don't know.",
generate: true,
want: "<system>You are a Wizard.</system><user>What are the potion ingredients?</user><assistant>I don't know.",
},
{
name: "nocut",
template: "<system>{{ .System }}</system><user>{{ .Prompt }}</user><assistant>{{ .Response }}</assistant>",
system: "You are a Wizard.",
prompt: "What are the potion ingredients?",
response: "I don't know.",
want: "<system>You are a Wizard.</system><user>What are the potion ingredients?</user><assistant>I don't know.</assistant>",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
got, err := Prompt(tc.template, tc.system, tc.prompt, tc.response, tc.generate)
if err != nil {
t.Errorf("error = %v", err)
}
if got != tc.want {
t.Errorf("got = %v, want %v", got, tc.want)
}
})
}
}
func TestChatPrompt(t *testing.T) {
tests := []struct {
name string
template string
messages []api.Message
window int
want string
type expect struct {
prompt string
images [][]byte
}
cases := []struct {
name string
limit int
msgs []api.Message
expect
}{
{
name: "simple prompt",
template: "[INST] {{ .Prompt }} [/INST]",
messages: []api.Message{
{Role: "user", Content: "Hello"},
name: "messages",
limit: 64,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
},
window: 1024,
want: "[INST] Hello [/INST]",
},
{
name: "with system message",
template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST]",
messages: []api.Message{
{Role: "system", Content: "You are a Wizard."},
{Role: "user", Content: "Hello"},
name: "truncate messages",
limit: 1,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "A test. And a thumping good one at that, I'd wager. ",
},
window: 1024,
want: "[INST] <<SYS>>You are a Wizard.<</SYS>> Hello [/INST]",
},
{
name: "with response",
template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST] {{ .Response }}",
messages: []api.Message{
{Role: "system", Content: "You are a Wizard."},
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "I am?"},
name: "truncate messages with image",
limit: 64,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("something")}},
},
expect: expect{
prompt: "[img-0] A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{
[]byte("something"),
},
},
window: 1024,
want: "[INST] <<SYS>>You are a Wizard.<</SYS>> Hello [/INST] I am?",
},
{
name: "with implicit response",
template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST]",
messages: []api.Message{
{Role: "system", Content: "You are a Wizard."},
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "I am?"},
name: "truncate messages with images",
limit: 64,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("somethingelse")}},
},
expect: expect{
prompt: "[img-0] A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{
[]byte("somethingelse"),
},
},
window: 1024,
want: "[INST] <<SYS>>You are a Wizard.<</SYS>> Hello [/INST]I am?",
},
{
name: "with conversation",
template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST] {{ .Response }} ",
messages: []api.Message{
{Role: "system", Content: "You are a Wizard."},
{Role: "user", Content: "What are the potion ingredients?"},
{Role: "assistant", Content: "sugar"},
{Role: "user", Content: "Anything else?"},
name: "messages with images",
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("somethingelse")}},
},
expect: expect{
prompt: "[img-0] You're a test, Harry! I-I'm a what? [img-1] A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{
[]byte("something"),
[]byte("somethingelse"),
},
},
window: 1024,
want: "[INST] <<SYS>>You are a Wizard.<</SYS>> What are the potion ingredients? [/INST] sugar [INST] Anything else? [/INST] ",
},
{
name: "with truncation",
template: "{{ .System }} {{ .Prompt }} {{ .Response }} ",
messages: []api.Message{
{Role: "system", Content: "You are a Wizard."},
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "I am?"},
{Role: "user", Content: "Why is the sky blue?"},
{Role: "assistant", Content: "The sky is blue from rayleigh scattering"},
name: "message with image tag",
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry! [img]", Images: []api.ImageData{[]byte("something")}},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("somethingelse")}},
},
expect: expect{
prompt: "You're a test, Harry! [img-0] I-I'm a what? [img-1] A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{
[]byte("something"),
[]byte("somethingelse"),
},
},
window: 10,
want: "You are a Wizard. Why is the sky blue? The sky is blue from rayleigh scattering",
},
{
name: "images",
template: "{{ .System }} {{ .Prompt }}",
messages: []api.Message{
{Role: "system", Content: "You are a Wizard."},
{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("base64")}},
name: "messages with interleaved images",
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "user", Images: []api.ImageData{[]byte("something")}},
{Role: "user", Images: []api.ImageData{[]byte("somethingelse")}},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "You're a test, Harry!\n\n[img-0]\n\n[img-1] I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{
[]byte("something"),
[]byte("somethingelse"),
},
},
window: 1024,
want: "You are a Wizard. [img-0] Hello",
},
{
name: "images truncated",
template: "{{ .System }} {{ .Prompt }}",
messages: []api.Message{
{Role: "system", Content: "You are a Wizard."},
{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("img1"), []byte("img2")}},
name: "truncate message with interleaved images",
limit: 1024,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "user", Images: []api.ImageData{[]byte("something")}},
{Role: "user", Images: []api.ImageData{[]byte("somethingelse")}},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "[img-0] I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{
[]byte("somethingelse"),
},
},
window: 1024,
want: "You are a Wizard. [img-0] [img-1] Hello",
},
{
name: "empty list",
template: "{{ .System }} {{ .Prompt }}",
messages: []api.Message{},
window: 1024,
want: "",
name: "message with system prompt",
limit: 2048,
msgs: []api.Message{
{Role: "system", Content: "You are the Test Who Lived."},
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "You are the Test Who Lived. You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
},
},
{
name: "empty prompt",
template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST] {{ .Response }} ",
messages: []api.Message{
{Role: "user", Content: ""},
name: "out of order system",
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "system", Content: "You are the Test Who Lived."},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ",
},
window: 1024,
want: "",
},
}
encode := func(s string) ([]int, error) {
words := strings.Fields(s)
return make([]int, len(words)), nil
tmpl, err := template.Parse(`
{{- if .System }}{{ .System }} {{ end }}
{{- if .Prompt }}{{ .Prompt }} {{ end }}
{{- if .Response }}{{ .Response }} {{ end }}`)
if err != nil {
t.Fatal(err)
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
got, err := ChatPrompt(tc.template, tc.messages, tc.window, encode)
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
model := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
prompt, images, err := chatPrompt(context.TODO(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil)
if err != nil {
t.Errorf("error = %v", err)
t.Fatal(err)
}
if got != tc.want {
t.Errorf("got: %q, want: %q", got, tc.want)
if diff := cmp.Diff(prompt, tt.prompt); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
if len(images) != len(tt.images) {
t.Fatalf("expected %d images, got %d", len(tt.images), len(images))
}
for i := range images {
if images[i].ID != i {
t.Errorf("expected ID %d, got %d", i, images[i].ID)
}
if !bytes.Equal(images[i].Data, tt.images[i]) {
t.Errorf("expected %q, got %q", tt.images[i], images[i])
}
}
})
}

File diff suppressed because it is too large Load Diff

View File

@@ -85,6 +85,8 @@ func checkFileExists(t *testing.T, p string, expect []string) {
}
func TestCreateFromBin(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -111,6 +113,8 @@ func TestCreateFromBin(t *testing.T) {
}
func TestCreateFromModel(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -152,6 +156,8 @@ func TestCreateFromModel(t *testing.T) {
}
func TestCreateRemovesLayers(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -199,6 +205,8 @@ func TestCreateRemovesLayers(t *testing.T) {
}
func TestCreateUnsetsSystem(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -255,6 +263,8 @@ func TestCreateUnsetsSystem(t *testing.T) {
}
func TestCreateMergeParameters(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -358,6 +368,8 @@ func TestCreateMergeParameters(t *testing.T) {
}
func TestCreateReplacesMessages(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -434,6 +446,8 @@ func TestCreateReplacesMessages(t *testing.T) {
}
func TestCreateTemplateSystem(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -477,9 +491,47 @@ func TestCreateTemplateSystem(t *testing.T) {
if string(system) != "Say bye!" {
t.Errorf("expected \"Say bye!\", actual %s", system)
}
t.Run("incomplete template", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt", createBinFile(t, nil, nil)),
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("template with unclosed if", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ if .Prompt }}", createBinFile(t, nil, nil)),
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
t.Run("template with undefined function", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
Name: "test",
Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ Prompt }}", createBinFile(t, nil, nil)),
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status code 400, actual %d", w.Code)
}
})
}
func TestCreateLicenses(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -526,6 +578,8 @@ func TestCreateLicenses(t *testing.T) {
}
func TestCreateDetectTemplate(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -545,9 +599,9 @@ func TestCreateDetectTemplate(t *testing.T) {
}
checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
filepath.Join(p, "blobs", "sha256-2f8e594e6f34b1b4d36a246628eeb3365ce442303d656f1fcc69e821722acea0"),
filepath.Join(p, "blobs", "sha256-542b217f179c7825eeb5bca3c77d2b75ed05bafbd3451d9188891a60a85337c6"),
filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"),
filepath.Join(p, "blobs", "sha256-f836ee110db21567f826332e4cedd746c06d10664fd5a9ea3659e3683a944510"),
})
})

View File

@@ -8,12 +8,15 @@ import (
"path/filepath"
"testing"
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)
func TestDelete(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
@@ -77,6 +80,8 @@ func TestDelete(t *testing.T) {
}
func TestDeleteDuplicateLayers(t *testing.T) {
gin.SetMode(gin.TestMode)
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
var s Server

View File

@@ -0,0 +1,714 @@
package server
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"testing"
"time"
"github.com/gin-gonic/gin"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm"
)
type mockRunner struct {
llm.LlamaServer
// CompletionRequest is only valid until the next call to Completion
llm.CompletionRequest
llm.CompletionResponse
}
func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
m.CompletionRequest = r
fn(m.CompletionResponse)
return nil
}
func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error) {
for range strings.Fields(s) {
tokens = append(tokens, len(tokens))
}
return
}
func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return mock, nil
}
}
func TestGenerateChat(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: "stop",
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: gpu.GetGPUInfo,
getCpuFn: gpu.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
},
},
}
go s.sched.Run(context.TODO())
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
Model: "test",
Modelfile: fmt.Sprintf(`FROM %s
TEMPLATE """
{{- if .System }}System: {{ .System }} {{ end }}
{{- if .Prompt }}User: {{ .Prompt }} {{ end }}
{{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
`, createBinFile(t, llm.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})),
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
t.Run("missing body", func(t *testing.T) {
w := createRequest(t, s.ChatHandler, nil)
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("missing model", func(t *testing.T) {
w := createRequest(t, s.ChatHandler, api.ChatRequest{})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("missing capabilities chat", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
Model: "bert",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []llm.Tensor{})),
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
w = createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "bert",
})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"\"bert\" does not support chat"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("load model", func(t *testing.T) {
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test",
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
var actual api.ChatResponse
if err := json.NewDecoder(w.Body).Decode(&actual); err != nil {
t.Fatal(err)
}
if actual.Model != "test" {
t.Errorf("expected model test, got %s", actual.Model)
}
if !actual.Done {
t.Errorf("expected done true, got false")
}
if actual.DoneReason != "load" {
t.Errorf("expected done reason load, got %s", actual.DoneReason)
}
})
checkChatResponse := func(t *testing.T, body io.Reader, model, content string) {
t.Helper()
var actual api.ChatResponse
if err := json.NewDecoder(body).Decode(&actual); err != nil {
t.Fatal(err)
}
if actual.Model != model {
t.Errorf("expected model test, got %s", actual.Model)
}
if !actual.Done {
t.Errorf("expected done false, got true")
}
if actual.DoneReason != "stop" {
t.Errorf("expected done reason stop, got %s", actual.DoneReason)
}
if diff := cmp.Diff(actual.Message, api.Message{
Role: "assistant",
Content: content,
}); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
if actual.PromptEvalCount == 0 {
t.Errorf("expected prompt eval count > 0, got 0")
}
if actual.PromptEvalDuration == 0 {
t.Errorf("expected prompt eval duration > 0, got 0")
}
if actual.EvalCount == 0 {
t.Errorf("expected eval count > 0, got 0")
}
if actual.EvalDuration == 0 {
t.Errorf("expected eval duration > 0, got 0")
}
if actual.LoadDuration == 0 {
t.Errorf("expected load duration > 0, got 0")
}
if actual.TotalDuration == 0 {
t.Errorf("expected total duration > 0, got 0")
}
}
mock.CompletionResponse.Content = "Hi!"
t.Run("messages", func(t *testing.T) {
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test",
Messages: []api.Message{
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! "); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkChatResponse(t, w.Body, "test", "Hi!")
})
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
Model: "test-system",
Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
t.Run("messages with model system", func(t *testing.T) {
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-system",
Messages: []api.Message{
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkChatResponse(t, w.Body, "test-system", "Hi!")
})
mock.CompletionResponse.Content = "Abra kadabra!"
t.Run("messages with system", func(t *testing.T) {
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-system",
Messages: []api.Message{
{Role: "system", Content: "You can perform magic tricks."},
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkChatResponse(t, w.Body, "test-system", "Abra kadabra!")
})
t.Run("messages with interleaved system", func(t *testing.T) {
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-system",
Messages: []api.Message{
{Role: "user", Content: "Hello!"},
{Role: "assistant", Content: "I can help you with that."},
{Role: "system", Content: "You can perform magic tricks."},
{Role: "user", Content: "Help me write tests."},
},
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! Assistant: I can help you with that. System: You can perform magic tricks. User: Help me write tests. "); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkChatResponse(t, w.Body, "test-system", "Abra kadabra!")
})
}
func TestGenerate(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: "stop",
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: gpu.GetGPUInfo,
getCpuFn: gpu.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
},
},
}
go s.sched.Run(context.TODO())
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
Model: "test",
Modelfile: fmt.Sprintf(`FROM %s
TEMPLATE """
{{- if .System }}System: {{ .System }} {{ end }}
{{- if .Prompt }}User: {{ .Prompt }} {{ end }}
{{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
`, createBinFile(t, llm.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})),
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
t.Run("missing body", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, nil)
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("missing model", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("missing capabilities generate", func(t *testing.T) {
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
Model: "bert",
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []llm.Tensor{})),
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
w = createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "bert",
})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"\"bert\" does not support generate"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("missing capabilities suffix", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test",
Prompt: "def add(",
Suffix: " return c",
})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"test does not support insert"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("load model", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test",
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
var actual api.GenerateResponse
if err := json.NewDecoder(w.Body).Decode(&actual); err != nil {
t.Fatal(err)
}
if actual.Model != "test" {
t.Errorf("expected model test, got %s", actual.Model)
}
if !actual.Done {
t.Errorf("expected done true, got false")
}
if actual.DoneReason != "load" {
t.Errorf("expected done reason load, got %s", actual.DoneReason)
}
})
checkGenerateResponse := func(t *testing.T, body io.Reader, model, content string) {
t.Helper()
var actual api.GenerateResponse
if err := json.NewDecoder(body).Decode(&actual); err != nil {
t.Fatal(err)
}
if actual.Model != model {
t.Errorf("expected model test, got %s", actual.Model)
}
if !actual.Done {
t.Errorf("expected done false, got true")
}
if actual.DoneReason != "stop" {
t.Errorf("expected done reason stop, got %s", actual.DoneReason)
}
if actual.Response != content {
t.Errorf("expected response %s, got %s", content, actual.Response)
}
if actual.Context == nil {
t.Errorf("expected context not nil")
}
if actual.PromptEvalCount == 0 {
t.Errorf("expected prompt eval count > 0, got 0")
}
if actual.PromptEvalDuration == 0 {
t.Errorf("expected prompt eval duration > 0, got 0")
}
if actual.EvalCount == 0 {
t.Errorf("expected eval count > 0, got 0")
}
if actual.EvalDuration == 0 {
t.Errorf("expected eval duration > 0, got 0")
}
if actual.LoadDuration == 0 {
t.Errorf("expected load duration > 0, got 0")
}
if actual.TotalDuration == 0 {
t.Errorf("expected total duration > 0, got 0")
}
}
mock.CompletionResponse.Content = "Hi!"
t.Run("prompt", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test",
Prompt: "Hello!",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! "); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkGenerateResponse(t, w.Body, "test", "Hi!")
})
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
Model: "test-system",
Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
t.Run("prompt with model system", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test-system",
Prompt: "Hello!",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkGenerateResponse(t, w.Body, "test-system", "Hi!")
})
mock.CompletionResponse.Content = "Abra kadabra!"
t.Run("prompt with system", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test-system",
Prompt: "Hello!",
System: "You can perform magic tricks.",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!")
})
t.Run("prompt with template", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test-system",
Prompt: "Help me write tests.",
System: "You can perform magic tricks.",
Template: `{{- if .System }}{{ .System }} {{ end }}
{{- if .Prompt }}### USER {{ .Prompt }} {{ end }}
{{- if .Response }}### ASSISTANT {{ .Response }} {{ end }}`,
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "You can perform magic tricks. ### USER Help me write tests. "); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!")
})
w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
Model: "test-suffix",
Modelfile: `FROM test
TEMPLATE """{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
{{- else }}{{ .Prompt }}
{{- end }}"""`,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
t.Run("prompt with suffix", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test-suffix",
Prompt: "def add(",
Suffix: " return c",
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "<PRE> def add( <SUF> return c <MID>"); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("prompt without suffix", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test-suffix",
Prompt: "def add(",
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "def add("); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("raw", func(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test-system",
Prompt: "Help me write tests.",
Raw: true,
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d", w.Code)
}
if diff := cmp.Diff(mock.CompletionRequest.Prompt, "Help me write tests."); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}

View File

@@ -7,11 +7,14 @@ import (
"slices"
"testing"
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
)
func TestList(t *testing.T) {
gin.SetMode(gin.TestMode)
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()

Some files were not shown because too many files have changed in this diff Show More