Compare commits

...

336 Commits

Author SHA1 Message Date
Jeffrey Morgan
aae31dc6ed naive whitespace detection 2024-02-20 00:15:51 -05:00
BADR
199e79ec0c docs: add tenere to terminal clients (#2329) 2024-02-19 23:13:03 -05:00
Jeffrey Morgan
8125ce4cb6 Update import.md
Add instructions to get public key on windows
2024-02-19 22:48:24 -05:00
Daniel
636d6eea99 Add ShellOracle to community terminal integrations (#1767) 2024-02-19 22:18:05 -05:00
Jeffrey Morgan
df56f1ee5e Update faq.md 2024-02-19 22:16:42 -05:00
Jean-Baptiste Detroyes
0b6c6c9092 feat: add Helm Chart link to Package managers list (#1673) 2024-02-19 22:05:14 -05:00
Jakob Hoeg Mørk
cb60389de7 NextJS web interface for Ollama (#2466) 2024-02-19 21:57:36 -05:00
lulz
ce0c95d097 [fix] /bye and /exit are now treated as prefixes (#2381)
* [fix] /bye and /exit are now treated as prefixes
instead of being treated as entire lines which doesn't align with the way the rest of the commands are treated

* Update cmd/interactive.go

Fixing whitespace

---------

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-02-19 21:56:49 -05:00
Eddú Meléndez Gonzales
a9bc1e1c37 Add LangChain4J (#2164) 2024-02-19 21:17:32 -05:00
Branislav Gerazov
62c71f4cb1 add ollama-chat.nvim (#2188) 2024-02-19 21:14:29 -05:00
Jeffrey Morgan
41aca5c2d0 Update faq.md 2024-02-19 21:11:01 -05:00
Jeffrey Morgan
753724d867 Update api.md to include examples for reproducible outputs 2024-02-19 20:36:16 -05:00
Jeffrey Morgan
e4576c2ee1 Update README.md 2024-02-19 20:15:24 -05:00
Patrick Devine
9a7a4b9533 add faqs for memory pre-loading and the keep_alive setting (#2601) 2024-02-19 14:45:25 -08:00
Daniel Hiltgen
2653191222 Merge pull request #2600 from dhiltgen/refined_win_docs
Document setting server vars for windows
2024-02-19 13:46:37 -08:00
Daniel Hiltgen
b338c0635f Document setting server vars for windows 2024-02-19 13:30:46 -08:00
Daniel Hiltgen
4fcbf1cde6 Merge pull request #2599 from dhiltgen/fix_avx
Explicitly disable AVX2 on GPU builds
2024-02-19 13:13:05 -08:00
Daniel Hiltgen
9220b4fa91 Merge pull request #2585 from dhiltgen/cuda_leaks
Fix cuda leaks
2024-02-19 12:48:00 -08:00
Daniel Hiltgen
fc39a6cd7a Fix cuda leaks
This should resolve the problem where we don't fully unload from the GPU
when we go idle.
2024-02-18 18:37:20 -08:00
Justin Hayes
1e23e82324 Update Web UI link to new project name (#2563)
Ollama WebUI is now known as Open WebUI.
2024-02-17 20:05:20 -08:00
Daniel Hiltgen
f9fd08040b Merge pull request #2552 from dhiltgen/dup_update_menus
Fix duplicate menus on update and exit on signals
2024-02-16 17:23:37 -08:00
Daniel Hiltgen
4318e35ee3 Merge pull request #2553 from dhiltgen/amdgpu_version
Harden AMD driver lookup logic
2024-02-16 17:23:12 -08:00
Daniel Hiltgen
9754c6d9d8 Harden AMD driver lookup logic
It looks like the version file doesnt exist on older(?) drivers
2024-02-16 16:20:16 -08:00
Daniel Hiltgen
a497235a55 Fix view logs menu 2024-02-16 15:42:53 -08:00
Daniel Hiltgen
df6dc4fd96 Fix duplicate menus on update and exit on signals
Also fixes a few fit-and-finish items for better developer experience
2024-02-16 15:33:16 -08:00
Bruce MacDonald
88622847c6 fix: chat system prompting overrides (#2542) 2024-02-16 14:42:43 -05:00
Tristan Rhodes
9774663013 Update faq.md with the location of models on Windows (#2545) 2024-02-16 11:04:19 -08:00
Daniel Hiltgen
a468ae0459 Merge pull request #2499 from ollama/windows-preview
Windows Preview
2024-02-15 16:06:32 -08:00
Daniel Hiltgen
c3e62ba38a Merge pull request #2516 from dhiltgen/single_tray_app
Fix a couple duplicate instance bugs
2024-02-15 15:52:43 -08:00
Daniel Hiltgen
117369aa73 Exit if we detect another copy of Ollama running 2024-02-15 14:58:29 -08:00
Daniel Hiltgen
1ba734de67 typo 2024-02-15 14:56:55 -08:00
Daniel Hiltgen
5208cf09b1 clean up some logging 2024-02-15 14:56:55 -08:00
Daniel Hiltgen
bb9de6037c Prevent multiple installers running concurrently 2024-02-15 14:56:55 -08:00
Daniel Hiltgen
272e53a1f5 Prepare to distribute standalone windows executable
This will be useful for our automated test riggig, and may be useful for
advanced users who want to "roll their own" system service
2024-02-15 14:56:55 -08:00
Daniel Hiltgen
db2a9ad1fe Explicitly disable AVX2 on GPU builds
Even though we weren't setting it to on, somewhere in the cmake config
it was getting toggled on.  By explicitly setting it to off, we get `/arch:AVX`
as intended.
2024-02-15 14:50:11 -08:00
Daniel Hiltgen
c9ab1aead3 Merge pull request #2526 from dhiltgen/harden_for_quotes
Harden the OLLAMA_HOST lookup for quotes
2024-02-15 14:13:40 -08:00
Daniel Hiltgen
4a10e7a7fa Harden the OLLAMA_HOST lookup for quotes 2024-02-15 13:46:56 -08:00
Michael Yang
86808f80a8 remove unused import 2024-02-15 12:09:11 -08:00
Michael Yang
4240b045e6 always enable view logs 2024-02-15 12:08:27 -08:00
Michael Yang
e547378893 disable default debug 2024-02-15 12:05:13 -08:00
Michael Yang
fd77dbec4d do not print update request headers 2024-02-15 11:36:35 -08:00
Michael
fefb3e77d1 Update README.md 2024-02-15 10:32:40 -08:00
Jeffrey Morgan
ed5489a96e higher resolution tray icons 2024-02-14 22:55:03 -08:00
jmorganca
76113742cf update installer title 2024-02-15 05:56:45 +00:00
Jeffrey Morgan
57e60c836f better windows app and tray icons 2024-02-15 05:56:45 +00:00
jmorganca
622b1f3e67 update installer and app.exe metadata 2024-02-15 05:56:45 +00:00
jmorganca
7ad9844ac0 set exe metadata using resource files 2024-02-15 05:56:45 +00:00
Michael Yang
e43648afe5 rerefactor 2024-02-15 05:56:45 +00:00
Daniel Hiltgen
823a520266 Fix lint error on ignored error for win console 2024-02-15 05:56:45 +00:00
vinjn
66ef308abd Import "containerd/console" lib to support colorful output in Windows terminal 2024-02-15 05:56:45 +00:00
Daniel Hiltgen
29e90cc13b Implement new Go based Desktop app
This focuses on Windows first, but coudl be used for Mac
and possibly linux in the future.
2024-02-15 05:56:45 +00:00
Daniel Hiltgen
f397e0e988 Move hub auth out to new package 2024-02-15 05:56:45 +00:00
Daniel Hiltgen
9da9e8fb72 Move Mac App to a new dir 2024-02-15 05:56:45 +00:00
Patrick Devine
42e77e2a69 handle race condition while setting raw mode in windows (#2509) 2024-02-14 21:28:35 -08:00
Jeffrey Morgan
9241a29336 Revert "Revert "bump submodule to 6c00a06 (#2479)"" (#2485)
This reverts commit 6920964b87.
2024-02-13 18:18:41 -08:00
Jeffrey Morgan
f7231ad9ad set shutting_down to false once shutdown is complete (#2484) 2024-02-13 17:48:41 -08:00
Jeffrey Morgan
6920964b87 Revert "bump submodule to 6c00a06 (#2479)"
This reverts commit 2f9ed52bbd.
2024-02-13 17:23:05 -08:00
Jeffrey Morgan
2f9ed52bbd bump submodule to 6c00a06 (#2479) 2024-02-13 17:12:42 -08:00
bnorick
caf2b13c10 Fix infinite keep_alive (#2480) 2024-02-13 15:40:32 -08:00
lebrunel
1d263449ff Update README.md to include link to Ollama-ex Elixir library (#2477) 2024-02-13 11:40:44 -08:00
Jeffrey Morgan
48a273f80b Fix issues with templating prompt in chat mode (#2460) 2024-02-12 15:06:57 -08:00
Daniel Hiltgen
939c60473f Merge pull request #2422 from dhiltgen/better_kill
More robust shutdown
2024-02-12 14:05:06 -08:00
Jeffrey Morgan
f76ca04f9e update submodule to 099afc6 (#2468) 2024-02-12 14:01:16 -08:00
Daniel Hiltgen
76b8728f0c Merge pull request #2465 from dhiltgen/block_rocm_pre_9
Detect AMD GPU info via sysfs and block old cards
2024-02-12 12:41:43 -08:00
Jeffrey Morgan
1f9078d6ae Check image filetype in api handlers (#2467) 2024-02-12 11:16:20 -08:00
Daniel Hiltgen
6d84f07505 Detect AMD GPU info via sysfs and block old cards
This wires up some new logic to start using sysfs to discover AMD GPU
information and detects old cards we can't yet support so we can fallback to CPU mode.
2024-02-12 08:19:41 -08:00
Jeffrey Morgan
26b13fc33c patch: always add token to cache_tokens (#2459) 2024-02-12 08:10:16 -08:00
Jeffrey Morgan
1c8435ffa9 Update domain name references in docs and install script (#2435) 2024-02-09 15:19:30 -08:00
Daniel Hiltgen
6680761596 Shutdown faster
Make sure that when a shutdown signal comes, we shutdown quickly instead
of waiting for a potentially long exchange to wrap up.
2024-02-08 22:22:50 -08:00
Jeffrey Morgan
42b797ed9c Update openai.md 2024-02-08 15:03:23 -05:00
Jeffrey Morgan
336aa43f3c Update openai.md 2024-02-08 12:48:28 -05:00
Daniel Hiltgen
69f392c9b7 Merge pull request #2403 from dhiltgen/handle_tmp_cleanup
Ensure the libraries are present
2024-02-07 17:55:31 -08:00
Daniel Hiltgen
a1dfab43b9 Ensure the libraries are present
When we store our libraries in a temp dir, a reaper might clean
them when we are idle, so make sure to check for them before
we reload.
2024-02-07 17:27:49 -08:00
Jeffrey Morgan
a0a199b108 Fix hanging issue when sending empty content (#2399) 2024-02-07 19:30:33 -05:00
Jeffrey Morgan
ab0d37fde4 Update openai.md 2024-02-07 17:25:33 -05:00
Jeffrey Morgan
14e71350c8 Update openai.md 2024-02-07 17:25:24 -05:00
Jeffrey Morgan
453f572f83 Initial OpenAI /v1/chat/completions API compatibility (#2376) 2024-02-07 17:24:29 -05:00
Daniel Hiltgen
c9dfa6e571 Merge pull request #2377 from dhiltgen/bump_llamacpp
Bump llama.cpp to b2081
2024-02-07 12:04:38 -08:00
Michael Yang
3dcbcd367d Merge pull request #2394 from ollama/mxyng/fix-error-response 2024-02-07 11:47:31 -08:00
Michael Yang
e805ac1d59 fix response on token error 2024-02-07 11:05:49 -08:00
Michael Yang
b9229ffca5 Merge pull request #2378 from ollama/mxyng/runners
runners
2024-02-06 13:49:58 -08:00
Michael Yang
46c847c4ad enable rocm builds 2024-02-06 13:36:13 -08:00
Michael Yang
92b1a21f79 use linux runners 2024-02-06 13:36:04 -08:00
Daniel Hiltgen
de76b95dd4 Bump llama.cpp to b2081 2024-02-06 12:06:43 -08:00
Michael Yang
59ec837ef6 Merge pull request #2374 from ollama/mxyng/rocm-builds
disable rocm builds
2024-02-06 09:41:02 -08:00
Michael Yang
f06b99a461 disable rocm builds 2024-02-06 09:29:42 -08:00
Bruce MacDonald
128fce5495 docs: keep_alive (#2258) 2024-02-06 11:00:05 -05:00
Daniel Hiltgen
27aa2d4a19 Merge pull request #1849 from mraiser/main
Accomodate split cuda lib dir
2024-02-05 16:01:16 -08:00
Jeffrey Morgan
b9f91a0b36 Update import instructions to use convert and quantize tooling from llama.cpp submodule (#2247) 2024-02-05 00:50:44 -05:00
Erik S
b538dc3858 Add llm-ollama plugin for Datasette's LLM CLI to README (#2340)
Co-authored-by: Erik Sp <git@aschwa.com>
2024-02-03 15:40:50 -08:00
Jeffrey Morgan
f0e9496c85 Update api.md 2024-02-02 12:17:24 -08:00
Jeffrey Morgan
09a6f76f4c fix error on ollama run with a non-existent model 2024-02-01 23:11:52 -08:00
Jeffrey Morgan
e135167484 Add multimodel support to ollama run in noninteractive mopde (#2317) 2024-02-01 21:33:06 -08:00
Jeffrey Morgan
38296ab352 clear previous images when submitting an image to ollama run (#2316) 2024-02-01 21:30:26 -08:00
Daniel Hiltgen
f43dea68d1 Merge pull request #2318 from dhiltgen/more_clean
Harden generate patching model
2024-02-01 20:41:29 -08:00
Daniel Hiltgen
e1f50377f4 Harden generate patching model
Only apply patches if we have any, and make sure to cleanup
every file we patched at the end to leave the tree clean
2024-02-01 19:34:36 -08:00
Jeffrey Morgan
7913104527 Improvements to ollama run for multimodal models (#2300) 2024-02-01 17:09:51 -08:00
Michael Yang
bfbf2f7cf7 Merge pull request #2296 from ollama/mxyng/img-tags
append image tags to user content
2024-02-01 13:16:59 -08:00
Michael Yang
fe3cbd014f Merge pull request #2298 from ollama/mxyng/debug-prompt
structured debug prompt
2024-02-01 13:16:49 -08:00
Michael Yang
3d6f48507a structured debug prompt 2024-02-01 11:56:28 -08:00
Michael Yang
f3761405c8 use image id 2024-02-01 11:52:42 -08:00
Michael Yang
e49dc9f3d8 fix tests 2024-02-01 11:48:11 -08:00
Michael Yang
d125510b4b remove image tags 2024-02-01 11:32:51 -08:00
Russell Canfield
1ca386aa9e Feature - Add Wingman Extension (#2313) 2024-02-01 11:16:24 -08:00
Michael Yang
fb56988014 account for image projection in token count 2024-02-01 09:50:48 -08:00
Michael Yang
d046bee790 use llm.ImageData for chat 2024-01-31 19:18:25 -08:00
Jeffrey Morgan
f11bf0740b use llm.ImageData 2024-01-31 19:13:48 -08:00
Michael Yang
8450bf66e6 trim images 2024-01-31 19:13:47 -08:00
Michael Yang
b4e11be8ef append image tags to user content 2024-01-31 19:13:10 -08:00
Bruce MacDonald
a896079705 preserve last system message from modelfile (#2289) 2024-01-31 21:45:01 -05:00
Michael Yang
583950c828 Merge pull request #2294 from ollama/mxyng/slog-source
update slog handler options
2024-01-31 15:29:11 -08:00
Michael Yang
8ac08a0eec update slog handler options
- consistent format by using text handler for debug and non-debug
- truncate source file to just the file name
2024-01-31 15:15:00 -08:00
Michael Yang
60f47be64c Merge pull request #2284 from ollama/mxyng/parse-raw
remove unnecessary parse raw
2024-01-31 09:40:48 -08:00
Daniel Hiltgen
6e56077ada Merge pull request #2263 from dhiltgen/bump_llamacpp
Bump llama.cpp to b1999
2024-01-31 08:39:41 -08:00
Hoang Nguyen
98ae9467bb Added MindMac to Community Integrations -> Web & Desktop section (#1957) 2024-01-31 07:48:37 -08:00
Richard Macarthy
b7a24af083 Add twinny vscode extension to Extensions and Plugins (#1950) 2024-01-31 06:25:06 -08:00
Michael Yang
c8b1f2369e remove unnecessary parse raw 2024-01-30 17:00:53 -08:00
Daniel Hiltgen
72b12c3be7 Bump llama.cpp to b1999
This requires an upstream change to support graceful termination,
carried as a patch.
2024-01-30 16:52:12 -08:00
Bruce MacDonald
0632dff3f8 trim chat prompt based on llm context size (#1963) 2024-01-30 15:59:29 -05:00
Maximilian Weber
509e2dec8a Update README.md (#2252)
Added - [Ollama for R - rollama](https://github.com/JBGruber/rollama) in Libraries in README.md
2024-01-30 11:56:51 -08:00
Daniel Hiltgen
78a48de804 Merge pull request #2256 from dhiltgen/container_logs
Add container hints for troubleshooting
2024-01-30 08:12:48 -08:00
Daniel Hiltgen
e7dbb00331 Add container hints for troubleshooting
Some users are new to containers and unsure where the server logs go
2024-01-29 08:53:41 -08:00
Marc Raiser
c3f9538636 remove default.nix 2024-01-29 00:05:07 -05:00
Jeffrey Morgan
2e06ed01d5 remove unknown CPPFLAGS option 2024-01-28 17:51:23 -08:00
Daniel Hiltgen
4072b5879b Merge pull request #2246 from dhiltgen/reject_cuda_without_avx
Don't disable GPUs on arm without AVX
2024-01-28 16:26:55 -08:00
Daniel Hiltgen
15562e887d Don't disable GPUs on arm without AVX
AVX is an x86 feature, so ARM should be excluded from
the check.
2024-01-28 15:22:38 -08:00
Jeffrey Morgan
f2245c7c77 print prompt with OLLAMA_DEBUG=1 (#2245) 2024-01-28 15:22:35 -08:00
Jeffrey Morgan
e4b9b72f2a Do not repeat system prompt for chat templating (#2241) 2024-01-28 14:15:56 -08:00
Daniel Hiltgen
311f8e0c3f Merge pull request #2243 from dhiltgen/harden_zero_gpus
Harden for zero detected GPUs
2024-01-28 13:30:44 -08:00
Daniel Hiltgen
f07f8b7a9e Harden for zero detected GPUs
At least with the ROCm libraries, its possible to have the library
present with zero GPUs.  This fix avoids a divide by zero bug in llm.go
when we try to calculate GPU memory with zero GPUs.
2024-01-28 13:13:10 -08:00
mraiser
4c4c730a0a Merge branch 'ollama:main' into main 2024-01-27 21:56:11 -05:00
Daniel Hiltgen
e02ecfb6c8 Merge pull request #2116 from dhiltgen/cc_50_80
Add support for CUDA 5.0 cards
2024-01-27 10:28:38 -08:00
Daniel Hiltgen
c8059b4dcf Merge pull request #2224 from jaglinux/fix_rocm_get_version_message
ROCm: Correct the response string in rocm_get_version function
2024-01-27 07:29:32 -08:00
Jagadish Krishnamoorthy
59d87127f5 Update gpu_info_rocm.c 2024-01-26 22:08:27 -08:00
Patrick Devine
b5cf31b460 add keep_alive to generate/chat/embedding api endpoints (#2146) 2024-01-26 14:28:02 -08:00
Daniel Hiltgen
cc4915e262 Merge pull request #2214 from dhiltgen/reject_cuda_without_avx
Detect lack of AVX and fallback to CPU mode
2024-01-26 12:06:44 -08:00
Daniel Hiltgen
667a2ba18a Detect lack of AVX and fallback to CPU mode
We build the GPU libraries with AVX enabled to ensure that if not all
layers fit on the GPU we get better performance in a mixed mode.
If the user is using a virtualization/emulation system that lacks AVX
this used to result in an illegal instruction error and crash before this
fix.  Now we will report a warning in the server log, and just use
CPU mode to ensure we don't crash.
2024-01-26 11:36:03 -08:00
Michael Yang
e054ebe059 Merge pull request #2212 from ollama/mxyng/fix-build
fix build
2024-01-26 11:19:08 -08:00
Michael Yang
9d3dcfd0ec fix logging 2024-01-26 11:04:27 -08:00
Michael Yang
6e0ea5ecc8 Merge pull request #1916 from ollama/mxyng/inactivity-monitor
download: add inactivity monitor
2024-01-26 10:56:00 -08:00
Daniel Hiltgen
a47d8b2557 Merge pull request #2197 from dhiltgen/remove_rocm_image
Add back ROCm container support
2024-01-26 09:34:23 -08:00
Daniel Hiltgen
30c43c285c Merge pull request #2195 from dhiltgen/rocm_real_gpus
Ignore AMD integrated GPUs
2024-01-26 09:30:24 -08:00
Daniel Hiltgen
23a7ea593b Merge pull request #2209 from dhiltgen/harden_mgmt
Fix crash on cuda ml init failure
2024-01-26 09:30:13 -08:00
Daniel Hiltgen
75c44aa319 Add back ROCm container support
This adds ROCm support back as a discrete image.
2024-01-26 09:24:29 -08:00
Daniel Hiltgen
9d7b5d6c91 Ignore AMD integrated GPUs
Detect and ignore integrated GPUs reported by rocm.
2024-01-26 09:21:35 -08:00
Daniel Hiltgen
5d9c4a5f5a Fix crash on cuda ml init failure
The new driver lookup code was triggering after init failure due to a missing return
2024-01-26 09:18:33 -08:00
Daniel Hiltgen
197e420a97 Merge pull request #2196 from dhiltgen/remove_rocm_image
Switch back to ubuntu base
2024-01-25 16:50:32 -08:00
Daniel Hiltgen
a34e1ad3cf Switch back to ubuntu base
The size increase for rocm support in the standard image is problematic
We'll revisit multiple tags for rocm support in a follow up PR.
2024-01-25 16:46:01 -08:00
Michael Yang
2ae0556292 Merge pull request #1679 from ollama/mxyng/build-gpus
build cuda and rocm
2024-01-25 16:38:14 -08:00
Jeffrey Morgan
5be9bdd444 Update modelfile.md 2024-01-25 16:29:48 -08:00
Jeffrey Morgan
b706794905 Update modelfile.md to include MESSAGE 2024-01-25 16:29:32 -08:00
Michael Yang
a8c5413d06 only generate gpu libs 2024-01-25 15:41:56 -08:00
Michael Yang
5580de4571 archive ollama binaries 2024-01-25 15:40:16 -08:00
Michael Yang
946431d5b0 build cuda and rocm 2024-01-25 15:40:15 -08:00
Michael Yang
0610126049 remove env setting 2024-01-25 15:39:43 -08:00
Jeffrey Morgan
3ebd6a83fc update submodule to cd4fddb29f81d6a1f6d51a0c016bc6b486d68def 2024-01-25 13:54:11 -08:00
Jeffrey Morgan
a64570dcae Fix clearing kv cache between requests with the same prompt (#2186)
* Fix clearing kv cache between requests with the same prompt

* fix powershell script
2024-01-25 13:46:20 -08:00
Patrick Devine
7c40a67841 Save and load sessions (#2063) 2024-01-25 12:12:36 -08:00
Michael Yang
e64b5b07a2 Merge pull request #2181 from ollama/mxyng/stub-lint
stub generate outputs for lint
2024-01-25 11:55:15 -08:00
Michael Yang
9e1e295cdc Merge pull request #2175 from ollama/mxyng/refactor-tensor-read
refactor tensor read
2024-01-25 09:22:42 -08:00
Marc Raiser
6eb3cddcb6 To build on NixOS: nix-shell --run 'go generate ./... && go build .' 2024-01-25 10:17:22 -05:00
mraiser
a4564232a4 Update gen_linux.sh to find libcudart in separate directory 2024-01-25 09:49:35 -05:00
Jeffrey Morgan
a643823f86 Update README.md 2024-01-24 21:36:56 -08:00
Michael Yang
8e5d359a03 stub generate outputs for lint 2024-01-24 17:36:10 -08:00
Daniel Hiltgen
a170888dd4 Merge pull request #2174 from dhiltgen/rocm_real_gpus
More logging for gpu management
2024-01-24 11:09:17 -08:00
Michael Yang
cd22855ef8 refactor tensor read 2024-01-24 10:48:31 -08:00
Daniel Hiltgen
013fd07139 More logging for gpu management
Fix an ordering glitch of dlerr/dlclose and add more logging to help
root cause some crashes users are hitting. This also refines the
function pointer names to use the underlying function names instead
of simplified names for readability.
2024-01-24 10:32:36 -08:00
Daniel Hiltgen
f63dc2db5c Merge pull request #2162 from dhiltgen/rocm_real_gpus
Report more information about GPUs in verbose mode
2024-01-23 17:45:40 -08:00
Jeffrey Morgan
eaa5a396d9 Update README.md 2024-01-23 16:08:15 -08:00
Jeffrey Morgan
8ed22f5d72 Update README.md 2024-01-23 14:38:01 -08:00
Daniel Hiltgen
987c16b2f7 Report more information about GPUs in verbose mode
This adds additional calls to both CUDA and ROCm management libraries to
discover additional attributes about the GPU(s) detected in the system, and
wires up runtime verbosity selection.  When users hit problems with GPUs we can
ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results.
2024-01-23 11:37:02 -08:00
Jeffrey Morgan
950f636d64 Update README.md 2024-01-23 10:29:10 -08:00
Jeffrey Morgan
4458efb73a Load all layers on arm64 macOS if model is small enough (#2149) 2024-01-22 17:40:06 -08:00
Daniel Hiltgen
ceea599494 Merge pull request #2150 from dhiltgen/default_version
Set a default version using git describe
2024-01-22 17:38:27 -08:00
Daniel Hiltgen
3005ec74b3 Set a default version using git describe
If a VERSION is not specified, this will generate a version string that
represents the state of the repo.  For example `0.1.21-12-gffaf52e-dirty`
representing 12 commits away from 0.1.21 tag, on commit gffaf52e
and the tree is dirty.
2024-01-22 17:12:20 -08:00
Daniel Hiltgen
0759d8996e Merge pull request #2148 from dhiltgen/intel_mac
Refine Accelerate usage on mac
2024-01-22 16:56:58 -08:00
Daniel Hiltgen
0f5b843319 Refine Accelerate usage on mac
For old macs, accelerate seems to cause crashes, but for
AVX2 capable macs, it does not.
2024-01-22 16:25:56 -08:00
Jeffrey Morgan
ffaf52e1e9 update submodule to 011e8ec577fd135cbc02993d3ea9840c516d6a1c 2024-01-22 15:16:54 -08:00
Michael Yang
940b10b036 Merge pull request #2144 from jmorganca/mxyng/update-faq
faq: update to use launchctl setenv
2024-01-22 13:46:57 -08:00
Daniel Hiltgen
3bc28736cd Merge pull request #2143 from dhiltgen/llm_verbosity
Refine debug logging for llm
2024-01-22 13:19:16 -08:00
Michael Yang
93a756266c faq: update to use launchctl setenv 2024-01-22 13:10:13 -08:00
Daniel Hiltgen
a0a829bf7a Merge pull request #2142 from dhiltgen/debug_on_fail
Debug logging on init failure
2024-01-22 12:29:22 -08:00
Daniel Hiltgen
730dcfcc7a Refine debug logging for llm
This wires up logging in llama.cpp to always go to stderr, and also
turns up logging if OLLAMA_DEBUG is set.
2024-01-22 12:26:49 -08:00
Daniel Hiltgen
27a2d5af54 Debug logging on init failure 2024-01-22 12:08:22 -08:00
Jeffrey Morgan
5f81a33f43 update submodule to 6f9939d (#2115) 2024-01-22 11:56:40 -08:00
Michael Yang
6225fde046 Merge pull request #2102 from jmorganca/mxyng/fix-create-override
fix: remove overwritten model layers
2024-01-22 09:37:48 -08:00
Meng Zhuo
069184562b readline: drop not use min function (#2134) 2024-01-22 08:15:08 -08:00
Daniel Hiltgen
5576bb2348 Merge pull request #2130 from dhiltgen/more_faster
Make CPU builds parallel and customizable AMD GPUs
2024-01-21 16:14:12 -08:00
Daniel Hiltgen
2738837786 Merge pull request #2131 from dhiltgen/probe_cards_at_init
Probe GPUs before backend init
2024-01-21 16:13:47 -08:00
Daniel Hiltgen
ec3764538d Probe GPUs before backend init
Detect potential error scenarios so we can fallback to CPU mode without
hitting asserts.
2024-01-21 15:59:38 -08:00
Daniel Hiltgen
df54c723ae Make CPU builds parallel and customizable AMD GPUs
The linux build now support parallel CPU builds to speed things up.
This also exposes AMD GPU targets as an optional setting for advaced
users who want to alter our default set.
2024-01-21 15:12:21 -08:00
Daniel Hiltgen
fa8c990e58 Merge pull request #2127 from dhiltgen/rocm_container
Combine the 2 Dockerfiles and add ROCm
2024-01-21 11:49:01 -08:00
Daniel Hiltgen
da72235ebf Combine the 2 Dockerfiles and add ROCm
This renames Dockerfile.build to Dockerfile, and adds some new stages
to support 2 modes of building - the build_linux.sh script uses
intermediate stages to extract the artifacts for ./dist, and the default
build generates a container image usable by both cuda and rocm cards.
This required transitioniing the x86 base to the rocm image to avoid
layer bloat.
2024-01-21 11:37:11 -08:00
Jeffrey Morgan
89c4aee29e Unlock mutex when failing to load model (#2117) 2024-01-20 20:54:46 -05:00
Daniel Hiltgen
a447a083f2 Add compute capability 5.0, 7.5, and 8.0 2024-01-20 14:24:05 -08:00
Jeffrey Morgan
f32ea81b21 increase minimum overhead to 1024MiB (#2114) 2024-01-20 17:11:38 -05:00
Daniel Hiltgen
681a914990 Add support for CUDA 5.2 cards 2024-01-20 10:48:43 -08:00
Jeffrey Morgan
4c54f0ddeb sign dylibs on macOS (#2101) 2024-01-19 19:24:11 -05:00
Michael Yang
c08dfaa23d fix: remove overwritten model layers
if create overrides a manifest, first add the older manifest's layers to
the delete map so they can be cleaned up
2024-01-19 14:58:37 -08:00
Daniel Hiltgen
3b76e736ae Merge pull request #2100 from dhiltgen/more_wsl_globs
More WSL paths
2024-01-19 13:41:08 -08:00
Daniel Hiltgen
552db98bf1 More WSL paths 2024-01-19 13:23:29 -08:00
Daniel Hiltgen
fdcdfef620 Merge pull request #2099 from dhiltgen/fix_cuda_model_swap
Switch to local dlopen symbols
2024-01-19 12:22:04 -08:00
Daniel Hiltgen
6a042438af Switch to local dlopen symbols 2024-01-19 11:37:02 -08:00
Jeffrey Morgan
dc88cc3981 use gzip for runner embedding (#2067) 2024-01-19 13:23:03 -05:00
Daniel Hiltgen
62976087c6 Merge pull request #1999 from lainedfles/termux_android_cpu_only
Fix CPU-only build under Android Termux enviornment.
2024-01-18 17:16:53 -08:00
Self Denial
344342abdf Restore dyn_ext_server.c since RTLD_DEEPBIND has been removed 2024-01-18 17:30:42 -07:00
Self Denial
eb76f3e379 Fix CPU-only build under Android Termux enviornment.
Update gpu.go initGPUHandles() to declare gpuHandles variable before
reading it. This resolves an "invalid memory address or nil pointer
dereference" error.

Update dyn_ext_server.c to avoid setting the RTLD_DEEPBIND flag under
__TERMUX__ (Android).
2024-01-18 17:25:33 -07:00
Michael Yang
d017e3d0a6 Merge pull request #2060 from jmorganca/mxyng/fix-show
fix show handler
2024-01-18 16:02:27 -08:00
Michael Yang
aac9ab4db7 fix show handler 2024-01-18 15:36:50 -08:00
Michael Yang
1f5b7ff976 Merge pull request #1932 from jmorganca/mxyng/api-fields
api: add model for all requests
2024-01-18 14:56:51 -08:00
Michael Yang
e299831e2c Merge pull request #1958 from purificant/ci
ci: update setup-go action
2024-01-18 14:53:36 -08:00
Michael Yang
745b5934fa add model to ModelResponse 2024-01-18 14:32:55 -08:00
Michael Yang
a38d88d828 api: add model for all requests
prefer using req.Model and fallback to req.Name
2024-01-18 14:31:37 -08:00
Daniel Hiltgen
abec7f06e5 Merge pull request #2056 from dhiltgen/slog
Mechanical switch from log to slog
2024-01-18 14:27:24 -08:00
Michael Yang
e5da190bac Merge pull request #2020 from jmorganca/mxyng/install-fedora
install: pin fedora to max 37
2024-01-18 14:23:42 -08:00
Daniel Hiltgen
ecbfc0182f Go bump to v1.21 to pick up slog 2024-01-18 14:12:57 -08:00
Daniel Hiltgen
fedd705aea Mechanical switch from log to slog
A few obvious levels were adjusted, but generally everything mapped to "info" level.
2024-01-18 14:12:57 -08:00
Mike Bird
82ee019bfc add open interpreter to list of extensions (#2016) 2024-01-18 13:59:39 -08:00
Sachin Sachdeva
ad9dbc2a04 Haystack Ollama Integration (#2021)
Updated readme with the web link for haystack ollama integration
2024-01-18 13:38:32 -08:00
Daniel Hiltgen
fccdf4c635 Merge pull request #1987 from xyproto/archlinux
Let gpu.go and gen_linux.sh also find CUDA on Arch Linux
2024-01-18 13:32:10 -08:00
Daniel Hiltgen
d450fb1d1e Merge pull request #2055 from dhiltgen/cuda_docs
Refine the linux cuda/rocm developer docs
2024-01-18 12:07:31 -08:00
Daniel Hiltgen
df40b11d03 Merge pull request #2007 from dhiltgen/cpu_fallback
Add multiple CPU variants for Intel Mac
2024-01-18 11:32:29 -08:00
Daniel Hiltgen
9cd20b0ec8 Refine the linux cuda/rocm developer docs 2024-01-18 09:44:44 -08:00
Daniel Hiltgen
b992bf65fc Disable arm64 for test phase
The runners are x86 so we can only run binaries that match.
2024-01-17 19:26:13 -08:00
Daniel Hiltgen
1b249748ab Add multiple CPU variants for Intel Mac
This also refines the build process for the ext_server build.
2024-01-17 15:08:54 -08:00
Alexander F. Rødseth
cbe2adc78a Merge branch 'main' into archlinux 2024-01-17 12:50:11 +01:00
Michael Yang
d5a7353357 Merge pull request #2026 from jmorganca/mxyng/fix-windows
fix: normalize name path before splitting
2024-01-16 16:58:42 -08:00
Michael Yang
96cfb62641 fix: normalize name path before splitting 2024-01-16 16:48:29 -08:00
Daniel Hiltgen
7d00b5d110 Merge pull request #1915 from dhiltgen/bump_llama_with_new_dep
Bump llama.cpp to b1842 and add new cuda lib dep
2024-01-16 13:36:49 -08:00
Daniel Hiltgen
795674dd90 Bump llama.cpp to b1842 and add new cuda lib dep
Upstream llama.cpp has added a new dependency with the
NVIDIA CUDA Driver Libraries (libcuda.so) which is part of the
driver distribution, not the general cuda libraries, and is not
available as an archive, so we can not statically link it.  This may
introduce some additional compatibility challenges which we'll
need to keep an eye on.
2024-01-16 12:53:52 -08:00
Daniel Hiltgen
e282bdccdd Merge pull request #1990 from dhiltgen/ci_mac_cross
Add macos cross-compile CI coverage
2024-01-16 12:31:37 -08:00
Michael Yang
d9bfb2f08f install: pin fedora to max 37
repos for fedora 38 and newer do not exist as of this commit

```
$ dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora38/x86_64/cuda-fedora38.repo
Adding repo from: https://developer.download.nvidia.com/compute/cuda/repos/fedora38/x86_64/cuda-fedora38.repo
Status code: 404 for https://developer.download.nvidia.com/compute/cuda/repos/fedora38/x86_64/cuda-fedora38.repo (IP: 152.195.19.142)
Error: Configuration of repo failed
```
2024-01-16 11:45:21 -08:00
Michael Yang
598d6d5572 Merge pull request #1937 from jmorganca/mxyng/remove-client-py
remove client.py
2024-01-16 11:01:41 -08:00
Bruce MacDonald
a897e833b8 do not cache prompt (#2018)
- prompt cache causes inferance to hang after some time
2024-01-16 13:48:05 -05:00
Patrick Devine
eef50accb4 Fix show parameters (#2017) 2024-01-16 10:34:44 -08:00
Michael Yang
05d53de7a1 Merge pull request #1968 from jmorganca/mxyng/fix-request-retry
fix: request retry with error
2024-01-16 10:33:50 -08:00
Daniel Hiltgen
8795447dad Merge pull request #1966 from fpreiss/fpreiss/gen_linux_cuda_detection
improve cuda detection (rel. issue #1704)
2024-01-14 18:00:11 -08:00
Daniel Hiltgen
b3035112a1 Add macos cross-compile CI coverage 2024-01-14 10:38:59 -08:00
Daniel Hiltgen
95ad9a9fc8 Merge pull request #1988 from dhiltgen/fix_intel_mac
Fix typo in arm mac arch script
2024-01-14 08:45:18 -08:00
Daniel Hiltgen
3ca5f69ce8 Fix typo in arm mac arch script 2024-01-14 08:32:57 -08:00
Daniel Hiltgen
cfa6337960 Merge pull request #1982 from dhiltgen/fix_intel_mac
Fix intel mac build
2024-01-14 08:26:46 -08:00
Alexander F. Rødseth
f4bf1d514f Let gpu.go and gen_linux.sh also find CUDA on Arch Linux 2024-01-14 13:40:36 +01:00
Jeffrey Morgan
557110d0ba Disable mmap with lora layers (#1985) 2024-01-13 23:36:31 -05:00
Daniel Hiltgen
2ecb247276 Fix intel mac build
Make sure we're building an x86 ext_server lib when cross-compiling
2024-01-13 14:46:34 -08:00
Jeffrey Morgan
288ef8ff95 add gcc -lstdc++ flag for linux cpu (#1974) 2024-01-13 03:53:00 -05:00
Jeffrey Morgan
4cf17990f7 use g++ to build libext_server.so on linux (#1972) 2024-01-13 03:12:42 -05:00
Michael Yang
27331ae3a8 download: add inactivity monitor
if a download part is inactive for some time, restart it
2024-01-12 15:23:15 -08:00
Michael Yang
b6c0ef1e70 Merge pull request #1961 from jmorganca/mxyng/rm-double-newline
remove double newlines in /set parameter
2024-01-12 15:18:19 -08:00
Michael Yang
356d178f6e Merge pull request #1971 from jmorganca/mxyng/max-context-length
add max context length check
2024-01-12 15:10:25 -08:00
Michael Yang
eaed6f8c45 add max context length check 2024-01-12 14:54:07 -08:00
purificant
6a5bfc2ed6 update actions/setup-go 2024-01-12 22:27:25 +00:00
Michael Yang
cf29bd2d72 fix: request retry with error
this fixes a subtle bug with makeRequestWithRetry where an HTTP status
error on a retried request will potentially not return the right err
2024-01-12 13:32:27 -08:00
Fabian Preiss
905862e17b improve cuda detection (rel. issue #1704) 2024-01-12 21:59:19 +01:00
Patrick Devine
565f8a3c44 Convert the REPL to use /api/chat for interactive responses (#1936) 2024-01-12 12:05:52 -08:00
Michael Yang
5121b7ac9c remove double newlines in /set parameter 2024-01-12 11:21:15 -08:00
Michael Yang
a70262c6b2 Update README.md
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-01-12 09:43:04 -08:00
Tristram Oaten
40a0a90a88 Add group delete to uninstall instructions (#1924)
After executing the `userdel ollama` command, I saw this message:

```sh
$ sudo userdel ollama
userdel: group ollama not removed because it has other members.
```

Which reminded me that I had to remove the dangling group too. For completeness, the uninstall instructions should do this too.

Thanks!
2024-01-12 00:07:00 -05:00
Michael Yang
cbe20c4375 update readme 2024-01-11 16:24:37 -08:00
Michael Yang
5ffbbea1d7 remove client.py 2024-01-11 15:53:10 -08:00
Daniel Hiltgen
3773fb6465 Merge pull request #1935 from dhiltgen/cpu_fallback
Fix up the CPU fallback selection
2024-01-11 15:52:32 -08:00
Daniel Hiltgen
7427fa1387 Fix up the CPU fallback selection
The memory changes and multi-variant change had some merge
glitches I missed.  This fixes them so we actually get the cpu llm lib
and best variant for the given system.
2024-01-11 15:27:06 -08:00
Michael Yang
f84537e0e0 Merge pull request #1934 from jmorganca/mxyng/fix-slices
fix build and lint
2024-01-11 14:36:20 -08:00
Michael Yang
d2be6387c9 fix typo 2024-01-11 14:25:21 -08:00
Michael Yang
d7af35d3d0 import fmt 2024-01-11 14:22:32 -08:00
Michael Yang
defc1dbd6e use x/exp/slices 2024-01-11 14:20:13 -08:00
Daniel Hiltgen
de2fbdec99 Merge pull request #1819 from dhiltgen/multi_variant
Support multiple LLM libs; ROCm v5 and v6; Rosetta, AVX, and AVX2 compatible CPU builds
2024-01-11 14:00:48 -08:00
Eduard van Valkenburg
f5faf79aa1 Add semantic kernel to Readme (#1931) 2024-01-11 14:40:23 -05:00
Michael Yang
f4f939de28 Merge pull request #1552 from jmorganca/mxyng/lint-test
add lint and test on pull_request
2024-01-11 09:37:45 -08:00
Daniel Hiltgen
39928a42e8 Always dynamically load the llm server library
This switches darwin to dynamic loading, and refactors the code now that no
static linking of the library is used on any platform
2024-01-11 08:42:47 -08:00
Daniel Hiltgen
d88c527be3 Build multiple CPU variants and pick the best
This reduces the built-in linux version to not use any vector extensions
which enables the resulting builds to run under Rosetta on MacOS in
Docker.  Then at runtime it checks for the actual CPU vector
extensions and loads the best CPU library available
2024-01-11 08:42:47 -08:00
Fabian Preiß
3bc8b9832b fix gpu_test.go Error (same type) uint64->uint32 (#1921) 2024-01-11 08:22:23 -05:00
Jeffrey Morgan
ab6be852c7 revisit memory allocation to account for full kv cache on main gpu 2024-01-11 01:45:31 -05:00
Daniel Hiltgen
052b33b81b DRY out the Dockefile.build 2024-01-10 17:27:51 -08:00
Daniel Hiltgen
8da7bef05f Support multiple variants for a given llm lib type
In some cases we may want multiple variants for a given GPU type or CPU.
This adds logic to have an optional Variant which we can use to select
an optimal library, but also allows us to try multiple variants in case
some fail to load.

This can be useful for scenarios such as ROCm v5 vs v6 incompatibility
or potentially CPU features.
2024-01-10 17:27:51 -08:00
Jeffrey Morgan
b24e8d17b2 Increase minimum CUDA memory allocation overhead and fix minimum overhead for multi-gpu (#1896)
* increase minimum cuda overhead and fix minimum overhead for multi-gpu

* fix multi gpu overhead

* limit overhead to 10% of all gpus

* better wording

* allocate fixed amount before layers

* fixed only includes graph alloc
2024-01-10 19:08:51 -05:00
Jeffrey Morgan
f83881390f revert submodule back to 328b83de23b33240e28f4e74900d1d06726f5eb1 2024-01-10 18:42:39 -05:00
Daniel Hiltgen
ac70ab6761 Merge pull request #1914 from dhiltgen/smarter_cuda_detection
Smarter GPU Management library detection
2024-01-10 15:21:56 -08:00
Daniel Hiltgen
3c49c3ab0d Harden GPU mgmt library lookup
When there are multiple management libraries installed on a system
not every one will be compatible with the current driver.  This change
improves our management library algorithm to build up a set of discovered
libraries based on glob patterns, and then try all of them until we're able to
load one without error.
2024-01-10 15:06:41 -08:00
Daniel Hiltgen
9754ae4c89 Support optional override of the target archictures
This can help speed up incremental builds when you're only testing one
archicture, like amd64.  E.g.
BUILD_ARCH=amd64 ./scripts/build_linux.sh && scp ./dist/ollama-linux-amd64 test-system:
2024-01-10 14:43:24 -08:00
Jeffrey Morgan
224fbf2795 update submodule to commit 1fc2f265ff9377a37fd2c61eae9cd813a3491bea until its main branch is fixed 2024-01-10 17:03:15 -05:00
Jeffrey Morgan
2c6e8f5248 Update submodule to 6efb8eb30e7025b168f3fda3ff83b9b386428ad6 (#1885)
* update submodule to `6efb8eb30e7025b168f3fda3ff83b9b386428ad6`
* unblock condition variable in `update_slots` when closing server
2024-01-10 16:48:38 -05:00
Jeffrey Morgan
34344d801c clean up cmake build directory when cross compiling macOS builds 2024-01-09 17:13:56 -05:00
Robin Glauser
e868c8a5c7 Update api.md (#1878)
Fixed assistant in the example response.
2024-01-09 16:21:17 -05:00
Jeffrey Morgan
c336693f07 calculate overhead based number of gpu devices (#1875) 2024-01-09 15:53:33 -05:00
Daniel Hiltgen
e89dc1d54b Merge pull request #1874 from dhiltgen/correct_cuda_min
Set corret CUDA minimum compute capability version
2024-01-09 11:37:22 -08:00
Daniel Hiltgen
1961a81f03 Set corret CUDA minimum compute capability version
If you attempt to run the current CUDA build on compute capability 5.2
cards, you'll hit the following failure:
cuBLAS error 15 at ggml-cuda.cu:7956: the requested functionality is not supported
2024-01-09 11:28:24 -08:00
Jeffrey Morgan
8a8c7e7f8d only build for metal on arm64 2024-01-09 13:51:08 -05:00
Jeffrey Morgan
6df83e6daa update rough cuda overhead estimate to 15% + 384MiB 2024-01-09 13:51:08 -05:00
Michael Yang
f921e2696e typo 2024-01-09 09:45:42 -08:00
Michael Yang
4a33cede20 remove unused fields and functions 2024-01-09 09:37:40 -08:00
Michael Yang
f95d2f25f3 fix temporary history file permissions 2024-01-09 09:36:58 -08:00
Michael Yang
2b9892a808 fix(windows): modelpath and list 2024-01-09 09:36:58 -08:00
Michael Yang
2bb2bdd5d4 fix lint 2024-01-09 09:36:58 -08:00
Michael Yang
acfc376efd add .golangci.yaml 2024-01-09 09:36:58 -08:00
Michael Yang
997253143f add lint and test on pull_request 2024-01-09 09:36:58 -08:00
Michael Yang
62023177f6 Merge pull request #1614 from jmorganca/mxyng/fix-set-template
fix: set template without triple quotes
2024-01-09 09:36:24 -08:00
Jeffrey Morgan
6164f378f2 revert cuda overhead to 20% 2024-01-09 00:54:29 -05:00
Jeffrey Morgan
f387e9631b use runner if cuda alloc won't fit 2024-01-09 00:44:34 -05:00
Jeffrey Morgan
6566387ae3 add TODO for cuda overhead 2024-01-09 00:28:03 -05:00
Jeffrey Morgan
37708931fb update cuda overhead to 20% to fix crashes when switching between models and large context sizes 2024-01-09 00:05:23 -05:00
Jeffrey Morgan
f6cb0a553c update cuda overhead to 15% or 400MiB 2024-01-08 23:45:45 -05:00
Jeffrey Morgan
2680078c13 fix build on linux 2024-01-08 23:44:13 -05:00
Jeffrey Morgan
f1b7e5f560 update overhead to 15% 2024-01-08 23:37:45 -05:00
Jeffrey Morgan
cb534e6ac2 use 10% vram overhead for cuda 2024-01-08 23:17:44 -05:00
Jeffrey Morgan
58ce2d8273 better estimate scratch buffer size 2024-01-08 21:32:44 -05:00
Jeffrey Morgan
18ddf6d57d fix windows build 2024-01-08 20:04:01 -05:00
Michael Yang
61e6502449 Merge pull request #1818 from jmorganca/mxyng/fix-alt-prompt
fix(cmd): history in alt prompt
2024-01-08 13:48:34 -08:00
Jeffrey Morgan
08f1e18965 Offload layers to GPU based on new model size estimates (#1850)
* select layers based on estimated model memory usage

* always account for scratch vram

* dont load +1 layers

* better estmation for graph alloc

* Update gpu/gpu_darwin.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update llm/llm.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update llm/llm.go

* add overhead for cuda memory

* Update llm/llm.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* fix build error on linux

* address comments

---------

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-01-08 16:42:00 -05:00
Bruce MacDonald
7e8f7c8358 remove ggml automatic re-pull (#1856) 2024-01-08 14:41:01 -05:00
Bruce MacDonald
3f3eb19a3b document response in modelfile template variables (#1428) 2024-01-08 14:38:51 -05:00
Daniel Hiltgen
059ae4585e Merge pull request #1834 from dhiltgen/old_cuda
Detect very old CUDA GPUs and fall back to CPU
2024-01-07 10:39:49 -08:00
Daniel Hiltgen
6347f501ca Merge pull request #1828 from dhiltgen/fix_llava
Accept windows paths for image processing
2024-01-07 09:05:46 -08:00
Jeffrey Morgan
5feec959ad dont use -Wall in static build (#1833) 2024-01-07 10:39:19 -05:00
Jeffrey Morgan
dbdd50b283 add -DCMAKE_SYSTEM_NAME=Darwin cmake flag (#1832) 2024-01-07 00:46:17 -05:00
Daniel Hiltgen
d74ce6bd4f Detect very old CUDA GPUs and fall back to CPU
If we try to load the CUDA library on an old GPU, it panics and crashes
the server.  This checks the compute capability before we load the
library so we can gracefully fall back to CPU mode.
2024-01-06 21:40:29 -08:00
Guilherme Baptista
57942b4676 Update README.md - Community Integrations - Ollama for Ruby (#1830) 2024-01-06 22:31:39 -05:00
Daniel Hiltgen
e0d05b0f1e Accept windows paths for image processing
This enhances our regex to support windows style paths.  The regex will
match invalid path specifications, but we'll still validate file
existence and filter out mismatches
2024-01-06 10:50:27 -08:00
Daniel Hiltgen
2d9dd14f27 Merge pull request #1697 from dhiltgen/win_docs
Add windows native build instructions
2024-01-05 19:34:20 -08:00
Jeffrey Morgan
1caa56128f add cuda lib path for nvidia container toolkit 2024-01-05 21:10:37 -05:00
Michael Yang
0101e76dbe Merge pull request #1797 from sublimator/nd-allow-extension-origins-still-needs-explicit-listing-2024-01-05
fix: allow extension origins (still needs explicit listing), fixes #1686
2024-01-05 17:20:09 -08:00
Michael Yang
2ef9352b94 fix(cmd): history in alt mode 2024-01-05 16:20:02 -08:00
Michael Yang
5580ae2472 fix: set template without triple quotes 2024-01-05 15:51:33 -08:00
Bruce MacDonald
3a9f447141 only pull gguf model if already exists (#1817) 2024-01-05 18:50:00 -05:00
Patrick Devine
9c2941e61b switch api for ShowRequest to use the name field (#1816) 2024-01-05 15:06:43 -08:00
Patrick Devine
238ac5e765 Add unit tests for Parser (#1815) 2024-01-05 14:04:31 -08:00
Bruce MacDonald
4f4980b66b simplify ggml update logic (#1814)
- additional information is now available in show response, use this to pull gguf before running
- make gguf updates cancellable
2024-01-05 15:22:32 -05:00
Patrick Devine
22e93efa41 add show info command and fix the modelfile 2024-01-05 12:20:05 -08:00
Patrick Devine
2909dce894 split up interactive generation 2024-01-05 12:20:05 -08:00
Jeffrey Morgan
df32537312 gpu: read memory info from all cuda devices (#1802)
* gpu: read memory info from all cuda devices

* add `LOOKUP_SIZE` constant

* better constant name

* address comments
2024-01-05 11:25:58 -05:00
Bruce MacDonald
3367b5f3df remove unused generate patches (#1810) 2024-01-05 11:25:45 -05:00
Matt Williams
46edbbc518 Merge pull request #1801 from jmorganca/mattw/correctdockerlink 2024-01-04 19:20:45 -08:00
Michael Yang
d2ff18cd6b Merge pull request #1791 from jmorganca/mxyng/update-build
update Dockerfile.build
2024-01-04 19:13:44 -08:00
Matt Williams
df086d3c8c fix docker doc to point to hub
Signed-off-by: Matt Williams <m@technovangelist.com>
2024-01-04 18:42:23 -08:00
Nicholas Dudfield
8baaaa39c0 Allow extension origins (still needs explicit listing), fixes #1686 2024-01-05 09:06:47 +07:00
Michael Yang
f9961c70ae update build 2024-01-04 17:34:38 -08:00
Daniel Hiltgen
e201efa14b Add windows native build instructions 2023-12-25 08:31:34 -08:00
176 changed files with 8575 additions and 3390 deletions

162
.github/workflows/test.yaml vendored Normal file
View File

@@ -0,0 +1,162 @@
name: test
on:
pull_request:
jobs:
generate:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
arch: arm64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: '1.21'
cache: true
- run: go get ./...
- run: go generate -x ./...
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build/**/lib/*
generate-cuda:
strategy:
matrix:
cuda-version:
- '11.8.0'
runs-on: linux
container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04
steps:
- run: |
apt-get update && apt-get install -y git build-essential curl
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
| tar -zx -C /usr --strip-components 1
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
git config --global --add safe.directory /__w/ollama/ollama
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
- uses: actions/upload-artifact@v4
with:
name: cuda-${{ matrix.cuda-version }}-libraries
path: llm/llama.cpp/build/**/lib/*
generate-rocm:
strategy:
matrix:
rocm-version:
- '5.7.1'
- '6.0'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
- run: |
apt-get update && apt-get install -y git build-essential curl rocm-libs
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
| tar -zx -C /usr --strip-components 1
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
git config --global --add safe.directory /__w/ollama/ollama
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
- uses: actions/upload-artifact@v4
with:
name: rocm-${{ matrix.rocm-version }}-libraries
path: llm/llama.cpp/build/**/lib/*
lint:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
arch: arm64
- os: macos-latest
arch: amd64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: '1.21'
cache: false
- run: |
mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/stub.dylib
touch llm/llama.cpp/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }}
- run: |
mkdir -p llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }}
- uses: golangci/golangci-lint-action@v3
test:
needs: generate
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
arch: arm64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: '1.21'
cache: true
- run: go get
- uses: actions/download-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build
- run: go build
- run: go test -v ./...
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-binaries
path: ollama

3
.gitignore vendored
View File

@@ -9,4 +9,5 @@ ggml-metal.metal
.cache
*.exe
.idea
test_data
test_data
*.crt

27
.golangci.yaml Normal file
View File

@@ -0,0 +1,27 @@
run:
timeout: 5m
linters:
enable:
- asasalint
- bidichk
- bodyclose
- containedctx
- contextcheck
- exportloopref
- gocheckcompilerdirectives
# FIXME: for some reason this errors on windows
# - gofmt
# - goimports
- misspell
- nilerr
- unused
linters-settings:
errcheck:
# exclude the following functions since we don't generally
# need to be concerned with the returned errors
exclude-functions:
- encoding/binary.Read
- (*os.File).Seek
- (*bufio.Writer).WriteString
- (*github.com/spf13/pflag.FlagSet).Set
- (*github.com/jmorganca/ollama/llm.readSeekOffset).Seek

View File

@@ -1,27 +1,135 @@
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
ARG GOLANG_VERSION=1.21.3
ARG CMAKE_VERSION=3.22.1
ARG CUDA_VERSION=11.3.1
ARG TARGETARCH
ARG GOFLAGS="'-ldflags=-w -s'"
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
COPY .git .git
COPY .gitmodules .gitmodules
COPY llm llm
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/jmorganca/ollama
RUN apt-get update && apt-get install -y git build-essential cmake
ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
COPY . .
ENV GOARCH=$TARGETARCH
ENV GOFLAGS=$GOFLAGS
RUN /usr/local/go/bin/go generate ./... \
&& /usr/local/go/bin/go build .
COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build .
FROM ubuntu:22.04
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build .
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
# set some environment variable for better NVIDIA compatibility
ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

View File

@@ -1,74 +0,0 @@
# Ubuntu 20.04 amd64 dependencies
FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
ARG CUDA_VERSION=11.3.1-1
ARG CMAKE_VERSION=3.22.1
# ROCm only supports amd64
ARG ROCM_VERSION=6.0
ARG CLBLAST_VER=1.6.1
# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
RUN apt-get update && \
apt-get install -y wget gnupg && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
mkdir --parents --mode=0755 /etc/apt/keyrings && \
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
# CLBlast
RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install
ENV ROCM_PATH=/opt/rocm
# Ubuntu 22.04 arm64 dependencies
FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64
ARG CUDA_VERSION=11.3.1-1
ARG CMAKE_VERSION=3.27.6
RUN apt-get update && \
apt-get install -y wget gnupg && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa//3bf863cc.pub && \
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
apt-get update && \
apt-cache madison cuda && \
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION}
FROM base-${TARGETARCH}
ARG TARGETARCH
ARG GOFLAGS="'-ldflags -w -s'"
ARG CGO_CFLAGS
ARG GOLANG_VERSION=1.21.3
# Common toolchain
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10
# install go
ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go${GOLANG_VERSION}.tar.gz
# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
ENV GOOS=linux
ENV GOARCH=$TARGETARCH
ENV GOFLAGS=$GOFLAGS
ENV CGO_CFLAGS=${CGO_CFLAGS}
RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build .

View File

@@ -1,8 +1,5 @@
<div align="center">
<picture>
<source media="(prefers-color-scheme: dark)" height="200px" srcset="https://github.com/jmorganca/ollama/assets/3325447/56ea1849-1284-4645-8970-956de6e51c3c">
<img alt="logo" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</picture>
<img alt="ollama" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</div>
# Ollama
@@ -13,16 +10,16 @@ Get up and running with large language models locally.
### macOS
[Download](https://ollama.ai/download/Ollama-darwin.zip)
[Download](https://ollama.com/download/Ollama-darwin.zip)
### Windows
### Windows preview
Coming soon! For now, you can install Ollama on Windows via WSL2.
[Download](https://ollama.com/download/OllamaSetup.exe)
### Linux & WSL2
### Linux
```
curl https://ollama.ai/install.sh | sh
curl -fsSL https://ollama.com/install.sh | sh
```
[Manual install instructions](https://github.com/jmorganca/ollama/blob/main/docs/linux.md)
@@ -31,9 +28,14 @@ curl https://ollama.ai/install.sh | sh
The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub.
### Libraries
- [ollama-python](https://github.com/ollama/ollama-python)
- [ollama-js](https://github.com/ollama/ollama-js)
## Quickstart
To run and chat with [Llama 2](https://ollama.ai/library/llama2):
To run and chat with [Llama 2](https://ollama.com/library/llama2):
```
ollama run llama2
@@ -41,9 +43,9 @@ ollama run llama2
## Model library
Ollama supports a list of open-source models available on [ollama.ai/library](https://ollama.ai/library 'ollama model library')
Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library 'ollama model library')
Here are some example open-source models that can be downloaded:
Here are some example models that can be downloaded:
| Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | ------------------------------ |
@@ -198,18 +200,21 @@ brew install cmake go
```
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```
More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
### Running local builds
Next, start the server:
```
@@ -251,19 +256,21 @@ See the [API documentation](./docs/api.md) for all endpoints.
## Community Integrations
### Web & Desktop
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Web UI](https://github.com/ollama-webui/ollama-webui)
- [Open WebUI](https://github.com/open-webui/open-webui)
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-agi/blob/main/docs/config-ollama.md)
- [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
- [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
- [MindMac](https://mindmac.app)
- [NextJS Web Interface for Ollama](https://github.com/jakobhoeg/nextjs-ollama-llm-ui)
### Terminal
@@ -272,10 +279,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Emacs client](https://github.com/zweifisch/ollama)
- [gen.nvim](https://github.com/David-Kunz/gen.nvim)
- [ollama.nvim](https://github.com/nomnivore/ollama.nvim)
- [ollama-chat.nvim](https://github.com/gerazov/ollama-chat.nvim)
- [ogpt.nvim](https://github.com/huynle/ogpt.nvim)
- [gptel Emacs client](https://github.com/karthink/gptel)
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
- [cmdh](https://github.com/pgibler/cmdh)
- [tenere](https://github.com/pythops/tenere)
- [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
- [ShellOracle](https://github.com/djcopley/ShellOracle)
### Database
@@ -284,14 +295,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Package managers
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
- [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
### Libraries
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LangChain4j](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-ollama)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
@@ -299,6 +313,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama for Dart](https://github.com/breitburg/dart-ollama)
- [Ollama for Laravel](https://github.com/cloudstudio/ollama-laravel)
- [LangChainDart](https://github.com/davidmigloz/langchain_dart)
- [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
### Mobile
@@ -319,3 +337,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)

View File

@@ -1,284 +0,0 @@
import os
import json
import requests
import os
import hashlib
import json
from pathlib import Path
BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
# The final response object will include statistics and additional data from the request. Use the callback function to override
# the default handler.
def generate(model_name, prompt, system=None, template=None, format="", context=None, options=None, callback=None):
try:
url = f"{BASE_URL}/api/generate"
payload = {
"model": model_name,
"prompt": prompt,
"system": system,
"template": template,
"context": context,
"options": options,
"format": format,
}
# Remove keys with None values
payload = {k: v for k, v in payload.items() if v is not None}
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Creating a variable to hold the context history of the final chunk
final_context = None
# Variable to hold concatenated response strings if no callback is provided
full_response = ""
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# If this is not the last chunk, add the "response" field value to full_response and print it
if not chunk.get("done"):
response_piece = chunk.get("response", "")
full_response += response_piece
print(response_piece, end="", flush=True)
# Check if it's the last chunk (done is true)
if chunk.get("done"):
final_context = chunk.get("context")
# Return the full response and the final context
return full_response, final_context
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None, None
# Create a blob file on the server if it doesn't exist.
def create_blob(digest, file_path):
url = f"{BASE_URL}/api/blobs/{digest}"
# Check if the blob exists
response = requests.head(url)
if response.status_code != 404:
return # Blob already exists, no need to upload
response.raise_for_status()
# Upload the blob
with open(file_path, 'rb') as file_data:
requests.post(url, data=file_data)
# Create a model from a Modelfile. Use the callback function to override the default handler.
def create(model_name, filename, callback=None):
try:
file_path = Path(filename).expanduser().resolve()
processed_lines = []
# Read and process the modelfile
with open(file_path, 'r') as f:
for line in f:
# Skip empty or whitespace-only lines
if not line.strip():
continue
command, args = line.split(maxsplit=1)
if command.upper() in ["FROM", "ADAPTER"]:
path = Path(args.strip()).expanduser()
# Check if path is relative and resolve it
if not path.is_absolute():
path = (file_path.parent / path)
# Skip if file does not exist for "model", this is handled by the server
if not path.exists():
processed_lines.append(line)
continue
# Calculate SHA-256 hash
with open(path, 'rb') as bin_file:
hash = hashlib.sha256()
hash.update(bin_file.read())
blob = f"sha256:{hash.hexdigest()}"
# Add the file to the remote server
create_blob(blob, path)
# Replace path with digest in the line
line = f"{command} @{blob}\n"
processed_lines.append(line)
# Combine processed lines back into a single string
modelfile_content = '\n'.join(processed_lines)
url = f"{BASE_URL}/api/create"
payload = {"name": model_name, "modelfile": modelfile_content}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the status
for line in response.iter_lines():
if line:
chunk = json.loads(line)
if callback:
callback(chunk)
else:
print(f"Status: {chunk.get('status')}")
except Exception as e:
print(f"An error occurred: {e}")
# Pull a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple
# calls to will share the same download progress. Use the callback function to override the default handler.
def pull(model_name, insecure=False, callback=None):
try:
url = f"{BASE_URL}/api/pull"
payload = {
"name": model_name,
"insecure": insecure
}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# Print the status message directly to the console
print(chunk.get('status', ''), end='', flush=True)
# If there's layer data, you might also want to print that (adjust as necessary)
if 'digest' in chunk:
print(f" - Digest: {chunk['digest']}", end='', flush=True)
print(f" - Total: {chunk['total']}", end='', flush=True)
print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
else:
print()
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
# Push a model to the model registry. Use the callback function to override the default handler.
def push(model_name, insecure=False, callback=None):
try:
url = f"{BASE_URL}/api/push"
payload = {
"name": model_name,
"insecure": insecure
}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# Print the status message directly to the console
print(chunk.get('status', ''), end='', flush=True)
# If there's layer data, you might also want to print that (adjust as necessary)
if 'digest' in chunk:
print(f" - Digest: {chunk['digest']}", end='', flush=True)
print(f" - Total: {chunk['total']}", end='', flush=True)
print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
else:
print()
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
# List models that are available locally.
def list():
try:
response = requests.get(f"{BASE_URL}/api/tags")
response.raise_for_status()
data = response.json()
models = data.get('models', [])
return models
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Copy a model. Creates a model with another name from an existing model.
def copy(source, destination):
try:
# Create the JSON payload
payload = {
"source": source,
"destination": destination
}
response = requests.post(f"{BASE_URL}/api/copy", json=payload)
response.raise_for_status()
# If the request was successful, return a message indicating that the copy was successful
return "Copy successful"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Delete a model and its data.
def delete(model_name):
try:
url = f"{BASE_URL}/api/delete"
payload = {"name": model_name}
response = requests.delete(url, json=payload)
response.raise_for_status()
return "Delete successful"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Show info about a model.
def show(model_name):
try:
url = f"{BASE_URL}/api/show"
payload = {"name": model_name}
response = requests.post(url, json=payload)
response.raise_for_status()
# Parse the JSON response and return it
data = response.json()
return data
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
def heartbeat():
try:
url = f"{BASE_URL}/"
response = requests.head(url)
response.raise_for_status()
return "Ollama is running"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return "Ollama is not running"

View File

@@ -34,24 +34,26 @@ func (e StatusError) Error() string {
type ImageData []byte
type GenerateRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
System string `json:"system"`
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Stream *bool `json:"stream,omitempty"`
Raw bool `json:"raw,omitempty"`
Format string `json:"format"`
Images []ImageData `json:"images,omitempty"`
Model string `json:"model"`
Prompt string `json:"prompt"`
System string `json:"system"`
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Stream *bool `json:"stream,omitempty"`
Raw bool `json:"raw,omitempty"`
Format string `json:"format"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Images []ImageData `json:"images,omitempty"`
Options map[string]interface{} `json:"options"`
}
type ChatRequest struct {
Model string `json:"model"`
Messages []Message `json:"messages"`
Stream *bool `json:"stream,omitempty"`
Format string `json:"format"`
Model string `json:"model"`
Messages []Message `json:"messages"`
Stream *bool `json:"stream,omitempty"`
Format string `json:"format"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Options map[string]interface{} `json:"options"`
}
@@ -126,8 +128,9 @@ type Runner struct {
}
type EmbeddingRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
Model string `json:"model"`
Prompt string `json:"prompt"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Options map[string]interface{} `json:"options"`
}
@@ -137,17 +140,30 @@ type EmbeddingResponse struct {
}
type CreateRequest struct {
Name string `json:"name"`
Model string `json:"model"`
Path string `json:"path"`
Modelfile string `json:"modelfile"`
Stream *bool `json:"stream,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
}
type DeleteRequest struct {
Model string `json:"model"`
// Name is deprecated, see Model
Name string `json:"name"`
}
type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
Template string `json:"template"`
Options map[string]interface{} `json:"options"`
// Name is deprecated, see Model
Name string `json:"name"`
}
@@ -158,6 +174,7 @@ type ShowResponse struct {
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
}
type CopyRequest struct {
@@ -166,11 +183,14 @@ type CopyRequest struct {
}
type PullRequest struct {
Name string `json:"name"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
}
type ProgressResponse struct {
@@ -181,11 +201,14 @@ type ProgressResponse struct {
}
type PushRequest struct {
Name string `json:"name"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
}
type ListResponse struct {
@@ -194,6 +217,7 @@ type ListResponse struct {
type ModelResponse struct {
Name string `json:"name"`
Model string `json:"model"`
ModifiedAt time.Time `json:"modified_at"`
Size int64 `json:"size"`
Digest string `json:"digest"`
@@ -216,6 +240,7 @@ type GenerateResponse struct {
}
type ModelDetails struct {
ParentModel string `json:"parent_model"`
Format string `json:"format"`
Family string `json:"family"`
Families []string `json:"families"`
@@ -390,15 +415,18 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
switch t := v.(type) {
case float64:
if t < 0 {
t = math.MaxFloat64
d.Duration = time.Duration(math.MaxInt64)
} else {
d.Duration = time.Duration(t * float64(time.Second))
}
d.Duration = time.Duration(t)
case string:
d.Duration, err = time.ParseDuration(t)
if err != nil {
return err
}
if d.Duration < 0 {
d.Duration = time.Duration(math.MaxInt64)
}
}
return nil

93
app/.gitignore vendored
View File

@@ -1,92 +1 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
.DS_Store
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Webpack
.webpack/
# Vite
.vite/
# Electron-Forge
out/
ollama.syso

View File

@@ -1,21 +1,22 @@
# Desktop
# Ollama App
This app builds upon Ollama to provide a desktop experience for running models.
## Linux
## Developing
TODO
First, build the `ollama` binary:
## MacOS
TODO
## Windows
If you want to build the installer, youll need to install
- https://jrsoftware.org/isinfo.php
In the top directory of this repo, run the following powershell script
to build the ollama CLI, ollama app, and ollama installer.
```
cd ..
go build .
powershell -ExecutionPolicy Bypass -File .\scripts\build_windows.ps1
```
Then run the desktop app with `npm start`:
```
cd app
npm install
npm start
```

BIN
app/assets/app.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.3 KiB

17
app/assets/assets.go Normal file
View File

@@ -0,0 +1,17 @@
package assets
import (
"embed"
"io/fs"
)
//go:embed *.ico
var icons embed.FS
func ListIcons() ([]string, error) {
return fs.Glob(icons, "*")
}
func GetIcon(filename string) ([]byte, error) {
return icons.ReadFile(filename)
}

BIN
app/assets/setup.bmp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

BIN
app/assets/tray.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 89 KiB

BIN
app/assets/tray_upgrade.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 91 KiB

View File

@@ -0,0 +1,9 @@
//go:build !windows
package lifecycle
import "fmt"
func GetStarted() error {
return fmt.Errorf("GetStarted not implemented")
}

View File

@@ -0,0 +1,44 @@
package lifecycle
import (
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"syscall"
)
func GetStarted() error {
const CREATE_NEW_CONSOLE = 0x00000010
var err error
bannerScript := filepath.Join(AppDir, "ollama_welcome.ps1")
args := []string{
// TODO once we're signed, the execution policy bypass should be removed
"powershell", "-noexit", "-ExecutionPolicy", "Bypass", "-nologo", "-file", bannerScript,
}
args[0], err = exec.LookPath(args[0])
if err != nil {
return err
}
// Make sure the script actually exists
_, err = os.Stat(bannerScript)
if err != nil {
return fmt.Errorf("getting started banner script error %s", err)
}
slog.Info(fmt.Sprintf("opening getting started terminal with %v", args))
attrs := &os.ProcAttr{
Files: []*os.File{os.Stdin, os.Stdout, os.Stderr},
Sys: &syscall.SysProcAttr{CreationFlags: CREATE_NEW_CONSOLE, HideWindow: false},
}
proc, err := os.StartProcess(args[0], args, attrs)
if err != nil {
return fmt.Errorf("unable to start getting started shell %w", err)
}
slog.Debug(fmt.Sprintf("getting started terminal PID: %d", proc.Pid))
return proc.Release()
}

View File

@@ -0,0 +1,92 @@
package lifecycle
import (
"context"
"fmt"
"log"
"log/slog"
"os"
"os/signal"
"syscall"
"github.com/jmorganca/ollama/app/store"
"github.com/jmorganca/ollama/app/tray"
)
func Run() {
InitLogging()
ctx, cancel := context.WithCancel(context.Background())
var done chan int
t, err := tray.NewTray()
if err != nil {
log.Fatalf("Failed to start: %s", err)
}
callbacks := t.GetCallbacks()
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
slog.Debug("starting callback loop")
for {
select {
case <-callbacks.Quit:
slog.Debug("quit called")
t.Quit()
case <-signals:
slog.Debug("shutting down due to signal")
t.Quit()
case <-callbacks.Update:
err := DoUpgrade(cancel, done)
if err != nil {
slog.Warn(fmt.Sprintf("upgrade attempt failed: %s", err))
}
case <-callbacks.ShowLogs:
ShowLogs()
case <-callbacks.DoFirstUse:
err := GetStarted()
if err != nil {
slog.Warn(fmt.Sprintf("Failed to launch getting started shell: %s", err))
}
}
}
}()
// Are we first use?
if !store.GetFirstTimeRun() {
slog.Debug("First time run")
err = t.DisplayFirstUseNotification()
if err != nil {
slog.Debug(fmt.Sprintf("XXX failed to display first use notification %v", err))
}
store.SetFirstTimeRun(true)
} else {
slog.Debug("Not first time, skipping first run notification")
}
if IsServerRunning(ctx) {
slog.Info("Detected another instance of ollama running, exiting")
os.Exit(1)
} else {
done, err = SpawnServer(ctx, CLIName)
if err != nil {
// TODO - should we retry in a backoff loop?
// TODO - should we pop up a warning and maybe add a menu item to view application logs?
slog.Error(fmt.Sprintf("Failed to spawn ollama server %s", err))
done = make(chan int, 1)
done <- 1
}
}
StartBackgroundUpdaterChecker(ctx, t.UpdateAvailable)
t.Run()
cancel()
slog.Info("Waiting for ollama server to shutdown...")
if done != nil {
<-done
}
slog.Info("Ollama app exiting")
}

46
app/lifecycle/logging.go Normal file
View File

@@ -0,0 +1,46 @@
package lifecycle
import (
"fmt"
"log/slog"
"os"
"path/filepath"
)
func InitLogging() {
level := slog.LevelInfo
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
level = slog.LevelDebug
}
var logFile *os.File
var err error
// Detect if we're a GUI app on windows, and if not, send logs to console
if os.Stderr.Fd() != 0 {
// Console app detected
logFile = os.Stderr
// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
} else {
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
slog.Error(fmt.Sprintf("failed to create server log %v", err))
return
}
}
handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
if attr.Key == slog.SourceKey {
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
})
slog.SetDefault(slog.New(handler))
slog.Info("ollama app started")
}

View File

@@ -0,0 +1,9 @@
//go:build !windows
package lifecycle
import "log/slog"
func ShowLogs() {
slog.Warn("ShowLogs not yet implemented")
}

View File

@@ -0,0 +1,19 @@
package lifecycle
import (
"fmt"
"log/slog"
"os/exec"
"syscall"
)
func ShowLogs() {
cmd_path := "c:\\Windows\\system32\\cmd.exe"
slog.Debug(fmt.Sprintf("viewing logs with start %s", AppDataDir))
cmd := exec.Command(cmd_path, "/c", "start", AppDataDir)
cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: false, CreationFlags: 0x08000000}
err := cmd.Start()
if err != nil {
slog.Error(fmt.Sprintf("Failed to open log dir: %s", err))
}
}

79
app/lifecycle/paths.go Normal file
View File

@@ -0,0 +1,79 @@
package lifecycle
import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
)
var (
AppName = "ollama app"
CLIName = "ollama"
AppDir = "/opt/Ollama"
AppDataDir = "/opt/Ollama"
// TODO - should there be a distinct log dir?
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
)
func init() {
if runtime.GOOS == "windows" {
AppName += ".exe"
CLIName += ".exe"
// Logs, configs, downloads go to LOCALAPPDATA
localAppData := os.Getenv("LOCALAPPDATA")
AppDataDir = filepath.Join(localAppData, "Ollama")
UpdateStageDir = filepath.Join(AppDataDir, "updates")
AppLogFile = filepath.Join(AppDataDir, "app.log")
ServerLogFile = filepath.Join(AppDataDir, "server.log")
UpgradeLogFile = filepath.Join(AppDataDir, "upgrade.log")
// Executables are stored in APPDATA
AppDir = filepath.Join(localAppData, "Programs", "Ollama")
// Make sure we have PATH set correctly for any spawned children
paths := strings.Split(os.Getenv("PATH"), ";")
// Start with whatever we find in the PATH/LD_LIBRARY_PATH
found := false
for _, path := range paths {
d, err := filepath.Abs(path)
if err != nil {
continue
}
if strings.EqualFold(AppDir, d) {
found = true
}
}
if !found {
paths = append(paths, AppDir)
pathVal := strings.Join(paths, ";")
slog.Debug("setting PATH=" + pathVal)
err := os.Setenv("PATH", pathVal)
if err != nil {
slog.Error(fmt.Sprintf("failed to update PATH: %s", err))
}
}
// Make sure our logging dir exists
_, err := os.Stat(AppDataDir)
if errors.Is(err, os.ErrNotExist) {
if err := os.MkdirAll(AppDataDir, 0o755); err != nil {
slog.Error(fmt.Sprintf("create ollama dir %s: %v", AppDataDir, err))
}
}
} else if runtime.GOOS == "darwin" {
// TODO
AppName += ".app"
// } else if runtime.GOOS == "linux" {
// TODO
}
}

139
app/lifecycle/server.go Normal file
View File

@@ -0,0 +1,139 @@
package lifecycle
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"os"
"os/exec"
"path/filepath"
"time"
"github.com/jmorganca/ollama/api"
)
func getCLIFullPath(command string) string {
cmdPath := ""
appExe, err := os.Executable()
if err == nil {
cmdPath = filepath.Join(filepath.Dir(appExe), command)
_, err := os.Stat(cmdPath)
if err == nil {
return cmdPath
}
}
cmdPath, err = exec.LookPath(command)
if err == nil {
_, err := os.Stat(cmdPath)
if err == nil {
return cmdPath
}
}
pwd, err := os.Getwd()
if err == nil {
cmdPath = filepath.Join(pwd, command)
_, err = os.Stat(cmdPath)
if err == nil {
return cmdPath
}
}
return command
}
func SpawnServer(ctx context.Context, command string) (chan int, error) {
done := make(chan int)
logDir := filepath.Dir(ServerLogFile)
_, err := os.Stat(logDir)
if errors.Is(err, os.ErrNotExist) {
if err := os.MkdirAll(logDir, 0o755); err != nil {
return done, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
}
}
cmd := getCmd(ctx, getCLIFullPath(command))
// send stdout and stderr to a file
stdout, err := cmd.StdoutPipe()
if err != nil {
return done, fmt.Errorf("failed to spawn server stdout pipe %s", err)
}
stderr, err := cmd.StderrPipe()
if err != nil {
return done, fmt.Errorf("failed to spawn server stderr pipe %s", err)
}
stdin, err := cmd.StdinPipe()
if err != nil {
return done, fmt.Errorf("failed to spawn server stdin pipe %s", err)
}
// TODO - rotation
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
return done, fmt.Errorf("failed to create server log %w", err)
}
go func() {
defer logFile.Close()
io.Copy(logFile, stdout) //nolint:errcheck
}()
go func() {
defer logFile.Close()
io.Copy(logFile, stderr) //nolint:errcheck
}()
// run the command and wait for it to finish
if err := cmd.Start(); err != nil {
return done, fmt.Errorf("failed to start server %w", err)
}
if cmd.Process != nil {
slog.Info(fmt.Sprintf("started ollama server with pid %d", cmd.Process.Pid))
}
slog.Info(fmt.Sprintf("ollama server logs %s", ServerLogFile))
go func() {
// Keep the server running unless we're shuttind down the app
crashCount := 0
for {
cmd.Wait() //nolint:errcheck
stdin.Close()
var code int
if cmd.ProcessState != nil {
code = cmd.ProcessState.ExitCode()
}
select {
case <-ctx.Done():
slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code))
done <- code
return
default:
crashCount++
slog.Warn(fmt.Sprintf("server crash %d - exit code %d - respawning", crashCount, code))
time.Sleep(500 * time.Millisecond)
if err := cmd.Start(); err != nil {
slog.Error(fmt.Sprintf("failed to restart server %s", err))
// Keep trying, but back off if we keep failing
time.Sleep(time.Duration(crashCount) * time.Second)
}
}
}
}()
return done, nil
}
func IsServerRunning(ctx context.Context) bool {
client, err := api.ClientFromEnvironment()
if err != nil {
slog.Info("unable to connect to server")
return false
}
err = client.Heartbeat(ctx)
if err != nil {
slog.Debug(fmt.Sprintf("heartbeat from server: %s", err))
slog.Info("unable to connect to server")
return false
}
return true
}

View File

@@ -0,0 +1,12 @@
//go:build !windows
package lifecycle
import (
"context"
"os/exec"
)
func getCmd(ctx context.Context, cmd string) *exec.Cmd {
return exec.CommandContext(ctx, cmd, "serve")
}

View File

@@ -0,0 +1,13 @@
package lifecycle
import (
"context"
"os/exec"
"syscall"
)
func getCmd(ctx context.Context, exePath string) *exec.Cmd {
cmd := exec.CommandContext(ctx, exePath, "serve")
cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: true, CreationFlags: 0x08000000}
return cmd
}

238
app/lifecycle/updater.go Normal file
View File

@@ -0,0 +1,238 @@
package lifecycle
import (
"context"
"crypto/rand"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"mime"
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"runtime"
"strings"
"time"
"github.com/jmorganca/ollama/auth"
"github.com/jmorganca/ollama/version"
)
var (
UpdateCheckURLBase = "https://ollama.com/api/update"
UpdateDownloaded = false
UpdateCheckInterval = 60 * 60 * time.Second
)
// TODO - maybe move up to the API package?
type UpdateResponse struct {
UpdateURL string `json:"url"`
UpdateVersion string `json:"version"`
}
func getClient(req *http.Request) http.Client {
proxyURL, err := http.ProxyFromEnvironment(req)
if err != nil {
slog.Warn(fmt.Sprintf("failed to handle proxy: %s", err))
return http.Client{}
}
return http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(proxyURL),
},
}
}
func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
var updateResp UpdateResponse
requestURL, err := url.Parse(UpdateCheckURLBase)
if err != nil {
return false, updateResp
}
query := requestURL.Query()
query.Add("os", runtime.GOOS)
query.Add("arch", runtime.GOARCH)
query.Add("version", version.Version)
query.Add("ts", fmt.Sprintf("%d", time.Now().Unix()))
nonce, err := auth.NewNonce(rand.Reader, 16)
if err != nil {
return false, updateResp
}
query.Add("nonce", nonce)
requestURL.RawQuery = query.Encode()
data := []byte(fmt.Sprintf("%s,%s", http.MethodGet, requestURL.RequestURI()))
signature, err := auth.Sign(ctx, data)
if err != nil {
return false, updateResp
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, requestURL.String(), nil)
if err != nil {
slog.Warn(fmt.Sprintf("failed to check for update: %s", err))
return false, updateResp
}
req.Header.Set("Authorization", signature)
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
client := getClient(req)
slog.Debug("checking for available update", "requestURL", requestURL)
resp, err := client.Do(req)
if err != nil {
slog.Warn(fmt.Sprintf("failed to check for update: %s", err))
return false, updateResp
}
defer resp.Body.Close()
if resp.StatusCode == 204 {
slog.Debug("check update response 204 (current version is up to date)")
return false, updateResp
}
body, err := io.ReadAll(resp.Body)
if err != nil {
slog.Warn(fmt.Sprintf("failed to read body response: %s", err))
}
err = json.Unmarshal(body, &updateResp)
if err != nil {
slog.Warn(fmt.Sprintf("malformed response checking for update: %s", err))
return false, updateResp
}
// Extract the version string from the URL in the github release artifact path
updateResp.UpdateVersion = path.Base(path.Dir(updateResp.UpdateURL))
slog.Info("New update available at " + updateResp.UpdateURL)
return true, updateResp
}
func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
// Do a head first to check etag info
req, err := http.NewRequestWithContext(ctx, http.MethodHead, updateResp.UpdateURL, nil)
if err != nil {
return err
}
client := getClient(req)
resp, err := client.Do(req)
if err != nil {
return fmt.Errorf("error checking update: %w", err)
}
if resp.StatusCode != 200 {
return fmt.Errorf("unexpected status attempting to download update %d", resp.StatusCode)
}
resp.Body.Close()
etag := strings.Trim(resp.Header.Get("etag"), "\"")
if etag == "" {
slog.Debug("no etag detected, falling back to filename based dedup")
etag = "_"
}
filename := Installer
_, params, err := mime.ParseMediaType(resp.Header.Get("content-disposition"))
if err == nil {
filename = params["filename"]
}
stageFilename := filepath.Join(UpdateStageDir, etag, filename)
// Check to see if we already have it downloaded
_, err = os.Stat(stageFilename)
if err == nil {
slog.Info("update already downloaded")
return nil
}
cleanupOldDownloads()
req.Method = http.MethodGet
resp, err = client.Do(req)
if err != nil {
return fmt.Errorf("error checking update: %w", err)
}
defer resp.Body.Close()
etag = strings.Trim(resp.Header.Get("etag"), "\"")
if etag == "" {
slog.Debug("no etag detected, falling back to filename based dedup") // TODO probably can get rid of this redundant log
etag = "_"
}
stageFilename = filepath.Join(UpdateStageDir, etag, filename)
_, err = os.Stat(filepath.Dir(stageFilename))
if errors.Is(err, os.ErrNotExist) {
if err := os.MkdirAll(filepath.Dir(stageFilename), 0o755); err != nil {
return fmt.Errorf("create ollama dir %s: %v", filepath.Dir(stageFilename), err)
}
}
payload, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read body response: %w", err)
}
fp, err := os.OpenFile(stageFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %w", stageFilename, err)
}
defer fp.Close()
if n, err := fp.Write(payload); err != nil || n != len(payload) {
return fmt.Errorf("write payload %s: %d vs %d -- %w", stageFilename, n, len(payload), err)
}
slog.Info("new update downloaded " + stageFilename)
UpdateDownloaded = true
return nil
}
func cleanupOldDownloads() {
files, err := os.ReadDir(UpdateStageDir)
if err != nil && errors.Is(err, os.ErrNotExist) {
// Expected behavior on first run
return
} else if err != nil {
slog.Warn(fmt.Sprintf("failed to list stage dir: %s", err))
return
}
for _, file := range files {
fullname := filepath.Join(UpdateStageDir, file.Name())
slog.Debug("cleaning up old download: " + fullname)
err = os.RemoveAll(fullname)
if err != nil {
slog.Warn(fmt.Sprintf("failed to cleanup stale update download %s", err))
}
}
}
func StartBackgroundUpdaterChecker(ctx context.Context, cb func(string) error) {
go func() {
// Don't blast an update message immediately after startup
// time.Sleep(30 * time.Second)
time.Sleep(3 * time.Second)
for {
available, resp := IsNewReleaseAvailable(ctx)
if available {
err := DownloadNewRelease(ctx, resp)
if err != nil {
slog.Error(fmt.Sprintf("failed to download new release: %s", err))
}
err = cb(resp.UpdateVersion)
if err != nil {
slog.Warn(fmt.Sprintf("failed to register update available with tray: %s", err))
}
}
select {
case <-ctx.Done():
slog.Debug("stopping background update checker")
return
default:
time.Sleep(UpdateCheckInterval)
}
}
}()
}

View File

@@ -0,0 +1,12 @@
//go:build !windows
package lifecycle
import (
"context"
"fmt"
)
func DoUpgrade(cancel context.CancelFunc, done chan int) error {
return fmt.Errorf("DoUpgrade not yet implemented")
}

View File

@@ -0,0 +1,80 @@
package lifecycle
import (
"context"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
)
func DoUpgrade(cancel context.CancelFunc, done chan int) error {
files, err := filepath.Glob(filepath.Join(UpdateStageDir, "*", "*.exe")) // TODO generalize for multiplatform
if err != nil {
return fmt.Errorf("failed to lookup downloads: %s", err)
}
if len(files) == 0 {
return fmt.Errorf("no update downloads found")
} else if len(files) > 1 {
// Shouldn't happen
slog.Warn(fmt.Sprintf("multiple downloads found, using first one %v", files))
}
installerExe := files[0]
slog.Info("starting upgrade with " + installerExe)
slog.Info("upgrade log file " + UpgradeLogFile)
// When running in debug mode, we'll be "verbose" and let the installer pop up and prompt
installArgs := []string{
"/CLOSEAPPLICATIONS", // Quit the tray app if it's still running
"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
"/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
}
// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
// TODO - temporarily disable since we're pinning in debug mode for the preview
// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
installArgs = append(installArgs,
"/SP", // Skip the "This will install... Do you wish to continue" prompt
"/SUPPRESSMSGBOXES",
"/SILENT",
"/VERYSILENT",
)
// }
// Safeguard in case we have requests in flight that need to drain...
slog.Info("Waiting for server to shutdown")
cancel()
if done != nil {
<-done
} else {
// Shouldn't happen
slog.Warn("done chan was nil, not actually waiting")
}
slog.Debug(fmt.Sprintf("starting installer: %s %v", installerExe, installArgs))
os.Chdir(filepath.Dir(UpgradeLogFile)) //nolint:errcheck
cmd := exec.Command(installerExe, installArgs...)
if err := cmd.Start(); err != nil {
return fmt.Errorf("unable to start ollama app %w", err)
}
if cmd.Process != nil {
err = cmd.Process.Release()
if err != nil {
slog.Error(fmt.Sprintf("failed to release server process: %s", err))
}
} else {
// TODO - some details about why it didn't start, or is this a pedantic error case?
return fmt.Errorf("installer process did not start")
}
// TODO should we linger for a moment and check to make sure it's actually running by checking the pid?
slog.Info("Installer started in background, exiting")
os.Exit(0)
// Not reached
return nil
}

12
app/main.go Normal file
View File

@@ -0,0 +1,12 @@
package main
// Compile with the following to get rid of the cmd pop up on windows
// go build -ldflags="-H windowsgui" .
import (
"github.com/jmorganca/ollama/app/lifecycle"
)
func main() {
lifecycle.Run()
}

153
app/ollama.iss Normal file
View File

@@ -0,0 +1,153 @@
; Inno Setup Installer for Ollama
;
; To build the installer use the build script invoked from the top of the source tree
;
; powershell -ExecutionPolicy Bypass -File .\scripts\build_windows.ps
#define MyAppName "Ollama"
#if GetEnv("PKG_VERSION") != ""
#define MyAppVersion GetEnv("PKG_VERSION")
#else
#define MyAppVersion "0.0.0"
#endif
#define MyAppPublisher "Ollama"
#define MyAppURL "https://ollama.com/"
#define MyAppExeName "ollama app.exe"
#define MyIcon ".\assets\app.ico"
[Setup]
; NOTE: The value of AppId uniquely identifies this application. Do not use the same AppId value in installers for other applications.
; (To generate a new GUID, click Tools | Generate GUID inside the IDE.)
AppId={{44E83376-CE68-45EB-8FC1-393500EB558C}
AppName={#MyAppName}
AppVersion={#MyAppVersion}
VersionInfoVersion={#MyAppVersion}
;AppVerName={#MyAppName} {#MyAppVersion}
AppPublisher={#MyAppPublisher}
AppPublisherURL={#MyAppURL}
AppSupportURL={#MyAppURL}
AppUpdatesURL={#MyAppURL}
ArchitecturesAllowed=x64
ArchitecturesInstallIn64BitMode=x64
DefaultDirName={localappdata}\Programs\{#MyAppName}
DefaultGroupName={#MyAppName}
DisableProgramGroupPage=yes
PrivilegesRequired=lowest
OutputBaseFilename="OllamaSetup"
SetupIconFile={#MyIcon}
UninstallDisplayIcon={uninstallexe}
Compression=lzma2
SolidCompression=no
WizardStyle=modern
ChangesEnvironment=yes
OutputDir=..\dist\
; Disable logging once everything's battle tested
; Filename will be %TEMP%\Setup Log*.txt
SetupLogging=yes
CloseApplications=yes
RestartApplications=no
; Make sure they can at least download llama2 as a minimum
ExtraDiskSpaceRequired=3826806784
; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
WizardSmallImageFile=.\assets\setup.bmp
; TODO verifty actual min windows version...
; OG Win 10
MinVersion=10.0.10240
; First release that supports WinRT UI Composition for win32 apps
; MinVersion=10.0.17134
; First release with XAML Islands - possible UI path forward
; MinVersion=10.0.18362
; quiet...
DisableDirPage=yes
DisableFinishedPage=yes
DisableReadyMemo=yes
DisableReadyPage=yes
DisableStartupPrompt=yes
DisableWelcomePage=yes
; TODO - percentage can't be set less than 100, so how to make it shorter?
; WizardSizePercent=100,80
#if GetEnv("KEY_CONTAINER")
SignTool=MySignTool
SignedUninstaller=yes
#endif
SetupMutex=OllamaSetupMutex
[Languages]
Name: "english"; MessagesFile: "compiler:Default.isl"
[LangOptions]
DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windeps\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
[Icons]
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
[Run]
Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
[UninstallRun]
; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ollama.exe /f /t"; Flags: runhidden
Filename: "taskkill"; Parameters: "/im ""{#MyAppExeName}"" /f /t"; Flags: runhidden
Filename: "taskkill"; Parameters: "/im ""ollama.exe"" /f /t"; Flags: runhidden
; HACK! need to give the server and app enough time to exit
; TODO - convert this to a Pascal code script so it waits until they're no longer running, then completes
Filename: "{cmd}"; Parameters: "/c timeout 5"; Flags: runhidden
[UninstallDelete]
Type: filesandordirs; Name: "{%TEMP}\ollama*"
Type: filesandordirs; Name: "{%LOCALAPPDATA}\Ollama"
Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"
Type: filesandordirs; Name: "{%USERPROFILE}\.ollama"
; NOTE: if the user has a custom OLLAMA_MODELS it will be preserved
[Messages]
WizardReady=Ollama Windows Preview
ReadyLabel1=%nLet's get you up and running with your own large language models.
SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.
;FinishedHeadingLabel=Run your first model
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama2
;ClickFinish=%n
[Registry]
Root: HKCU; Subkey: "Environment"; \
ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
Check: NeedsAddPath('{app}')
[Code]
function NeedsAddPath(Param: string): boolean;
var
OrigPath: string;
begin
if not RegQueryStringValue(HKEY_CURRENT_USER,
'Environment',
'Path', OrigPath)
then begin
Result := True;
exit;
end;
{ look for the path with leading and trailing semicolon }
{ Pos() returns 0 if not found }
Result := Pos(';' + ExpandConstant(Param) + ';', ';' + OrigPath + ';') = 0;
end;

29
app/ollama.rc Normal file
View File

@@ -0,0 +1,29 @@
#include <winver.h>
VS_VERSION_INFO VERSIONINFO
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x1L
#else
FILEFLAGS 0x0L
#endif
FILEOS 0x40004L
FILETYPE 0x1L
FILESUBTYPE 0x0L
BEGIN
BLOCK "StringFileInfo"
BEGIN
BLOCK "040904b0"
BEGIN
VALUE "FileDescription", "Ollama"
VALUE "InternalName", "Ollama"
VALUE "OriginalFilename", "ollama app.exe"
VALUE "ProductName", "Ollama"
END
END
BLOCK "VarFileInfo"
BEGIN
VALUE "Translation", 0x409, 1200
END
END

8
app/ollama_welcome.ps1 Normal file
View File

@@ -0,0 +1,8 @@
# TODO - consider ANSI colors and maybe ASCII art...
write-host ""
write-host "Welcome to Ollama!"
write-host ""
write-host "Run your first model:"
write-host ""
write-host "`tollama run llama2"
write-host ""

98
app/store/store.go Normal file
View File

@@ -0,0 +1,98 @@
package store
import (
"encoding/json"
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"sync"
"github.com/google/uuid"
)
type Store struct {
ID string `json:"id"`
FirstTimeRun bool `json:"first-time-run"`
}
var (
lock sync.Mutex
store Store
)
func GetID() string {
lock.Lock()
defer lock.Unlock()
if store.ID == "" {
initStore()
}
return store.ID
}
func GetFirstTimeRun() bool {
lock.Lock()
defer lock.Unlock()
if store.ID == "" {
initStore()
}
return store.FirstTimeRun
}
func SetFirstTimeRun(val bool) {
lock.Lock()
defer lock.Unlock()
if store.FirstTimeRun == val {
return
}
store.FirstTimeRun = val
writeStore(getStorePath())
}
// lock must be held
func initStore() {
storeFile, err := os.Open(getStorePath())
if err == nil {
defer storeFile.Close()
err = json.NewDecoder(storeFile).Decode(&store)
if err == nil {
slog.Debug(fmt.Sprintf("loaded existing store %s - ID: %s", getStorePath(), store.ID))
return
}
} else if !errors.Is(err, os.ErrNotExist) {
slog.Debug(fmt.Sprintf("unexpected error searching for store: %s", err))
}
slog.Debug("initializing new store")
store.ID = uuid.New().String()
writeStore(getStorePath())
}
func writeStore(storeFilename string) {
ollamaDir := filepath.Dir(storeFilename)
_, err := os.Stat(ollamaDir)
if errors.Is(err, os.ErrNotExist) {
if err := os.MkdirAll(ollamaDir, 0o755); err != nil {
slog.Error(fmt.Sprintf("create ollama dir %s: %v", ollamaDir, err))
return
}
}
payload, err := json.Marshal(store)
if err != nil {
slog.Error(fmt.Sprintf("failed to marshal store: %s", err))
return
}
fp, err := os.OpenFile(storeFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
slog.Error(fmt.Sprintf("write store payload %s: %v", storeFilename, err))
return
}
defer fp.Close()
if n, err := fp.Write(payload); err != nil || n != len(payload) {
slog.Error(fmt.Sprintf("write store payload %s: %d vs %d -- %v", storeFilename, n, len(payload), err))
return
}
slog.Debug("Store contents: " + string(payload))
slog.Info(fmt.Sprintf("wrote store: %s", storeFilename))
}

13
app/store/store_darwin.go Normal file
View File

@@ -0,0 +1,13 @@
package store
import (
"os"
"path/filepath"
)
func getStorePath() string {
// TODO - system wide location?
home := os.Getenv("HOME")
return filepath.Join(home, "Library", "Application Support", "Ollama", "config.json")
}

16
app/store/store_linux.go Normal file
View File

@@ -0,0 +1,16 @@
package store
import (
"os"
"path/filepath"
)
func getStorePath() string {
if os.Geteuid() == 0 {
// TODO where should we store this on linux for system-wide operation?
return "/etc/ollama/config.json"
}
home := os.Getenv("HOME")
return filepath.Join(home, ".ollama", "config.json")
}

View File

@@ -0,0 +1,11 @@
package store
import (
"os"
"path/filepath"
)
func getStorePath() string {
localAppData := os.Getenv("LOCALAPPDATA")
return filepath.Join(localAppData, "Ollama", "config.json")
}

View File

@@ -0,0 +1,24 @@
package commontray
var (
Title = "Ollama"
ToolTip = "Ollama"
UpdateIconName = "tray_upgrade"
IconName = "tray"
)
type Callbacks struct {
Quit chan struct{}
Update chan struct{}
DoFirstUse chan struct{}
ShowLogs chan struct{}
}
type OllamaTray interface {
GetCallbacks() Callbacks
Run()
UpdateAvailable(ver string) error
DisplayFirstUseNotification() error
Quit()
}

33
app/tray/tray.go Normal file
View File

@@ -0,0 +1,33 @@
package tray
import (
"fmt"
"runtime"
"github.com/jmorganca/ollama/app/assets"
"github.com/jmorganca/ollama/app/tray/commontray"
)
func NewTray() (commontray.OllamaTray, error) {
extension := ".png"
if runtime.GOOS == "windows" {
extension = ".ico"
}
iconName := commontray.UpdateIconName + extension
updateIcon, err := assets.GetIcon(iconName)
if err != nil {
return nil, fmt.Errorf("failed to load icon %s: %w", iconName, err)
}
iconName = commontray.IconName + extension
icon, err := assets.GetIcon(iconName)
if err != nil {
return nil, fmt.Errorf("failed to load icon %s: %w", iconName, err)
}
tray, err := InitPlatformTray(icon, updateIcon)
if err != nil {
return nil, err
}
return tray, nil
}

View File

@@ -0,0 +1,13 @@
//go:build !windows
package tray
import (
"fmt"
"github.com/jmorganca/ollama/app/tray/commontray"
)
func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {
return nil, fmt.Errorf("NOT IMPLEMENTED YET")
}

10
app/tray/tray_windows.go Normal file
View File

@@ -0,0 +1,10 @@
package tray
import (
"github.com/jmorganca/ollama/app/tray/commontray"
"github.com/jmorganca/ollama/app/tray/wintray"
)
func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {
return wintray.InitTray(icon, updateIcon)
}

View File

@@ -0,0 +1,184 @@
//go:build windows
package wintray
import (
"fmt"
"log/slog"
"sync"
"unsafe"
"golang.org/x/sys/windows"
)
var (
quitOnce sync.Once
)
func (t *winTray) Run() {
nativeLoop()
}
func nativeLoop() {
// Main message pump.
slog.Debug("starting event handling loop")
m := &struct {
WindowHandle windows.Handle
Message uint32
Wparam uintptr
Lparam uintptr
Time uint32
Pt point
LPrivate uint32
}{}
for {
ret, _, err := pGetMessage.Call(uintptr(unsafe.Pointer(m)), 0, 0, 0)
// If the function retrieves a message other than WM_QUIT, the return value is nonzero.
// If the function retrieves the WM_QUIT message, the return value is zero.
// If there is an error, the return value is -1
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms644936(v=vs.85).aspx
switch int32(ret) {
case -1:
slog.Error(fmt.Sprintf("get message failure: %v", err))
return
case 0:
return
default:
pTranslateMessage.Call(uintptr(unsafe.Pointer(m))) //nolint:errcheck
pDispatchMessage.Call(uintptr(unsafe.Pointer(m))) //nolint:errcheck
}
}
}
// WindowProc callback function that processes messages sent to a window.
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms633573(v=vs.85).aspx
func (t *winTray) wndProc(hWnd windows.Handle, message uint32, wParam, lParam uintptr) (lResult uintptr) {
const (
WM_RBUTTONUP = 0x0205
WM_LBUTTONUP = 0x0202
WM_COMMAND = 0x0111
WM_ENDSESSION = 0x0016
WM_CLOSE = 0x0010
WM_DESTROY = 0x0002
WM_MOUSEMOVE = 0x0200
WM_LBUTTONDOWN = 0x0201
)
switch message {
case WM_COMMAND:
menuItemId := int32(wParam)
// https://docs.microsoft.com/en-us/windows/win32/menurc/wm-command#menus
switch menuItemId {
case quitMenuID:
select {
case t.callbacks.Quit <- struct{}{}:
// should not happen but in case not listening
default:
slog.Error("no listener on Quit")
}
case updateMenuID:
select {
case t.callbacks.Update <- struct{}{}:
// should not happen but in case not listening
default:
slog.Error("no listener on Update")
}
case diagLogsMenuID:
select {
case t.callbacks.ShowLogs <- struct{}{}:
// should not happen but in case not listening
default:
slog.Error("no listener on ShowLogs")
}
default:
slog.Debug(fmt.Sprintf("Unexpected menu item id: %d", menuItemId))
}
case WM_CLOSE:
boolRet, _, err := pDestroyWindow.Call(uintptr(t.window))
if boolRet == 0 {
slog.Error(fmt.Sprintf("failed to destroy window: %s", err))
}
err = t.wcex.unregister()
if err != nil {
slog.Error(fmt.Sprintf("failed to uregister windo %s", err))
}
case WM_DESTROY:
// same as WM_ENDSESSION, but throws 0 exit code after all
defer pPostQuitMessage.Call(uintptr(int32(0))) //nolint:errcheck
fallthrough
case WM_ENDSESSION:
t.muNID.Lock()
if t.nid != nil {
err := t.nid.delete()
if err != nil {
slog.Error(fmt.Sprintf("failed to delete nid: %s", err))
}
}
t.muNID.Unlock()
case t.wmSystrayMessage:
switch lParam {
case WM_MOUSEMOVE, WM_LBUTTONDOWN:
// Ignore these...
case WM_RBUTTONUP, WM_LBUTTONUP:
err := t.showMenu()
if err != nil {
slog.Error(fmt.Sprintf("failed to show menu: %s", err))
}
case 0x405: // TODO - how is this magic value derived for the notification left click
if t.pendingUpdate {
select {
case t.callbacks.Update <- struct{}{}:
// should not happen but in case not listening
default:
slog.Error("no listener on Update")
}
} else {
select {
case t.callbacks.DoFirstUse <- struct{}{}:
// should not happen but in case not listening
default:
slog.Error("no listener on DoFirstUse")
}
}
case 0x404: // Middle click or close notification
// slog.Debug("doing nothing on close of first time notification")
default:
// 0x402 also seems common - what is it?
slog.Debug(fmt.Sprintf("unmanaged app message, lParm: 0x%x", lParam))
}
case t.wmTaskbarCreated: // on explorer.exe restarts
t.muNID.Lock()
err := t.nid.add()
if err != nil {
slog.Error(fmt.Sprintf("failed to refresh the taskbar on explorer restart: %s", err))
}
t.muNID.Unlock()
default:
// Calls the default window procedure to provide default processing for any window messages that an application does not process.
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms633572(v=vs.85).aspx
lResult, _, _ = pDefWindowProc.Call(
uintptr(hWnd),
uintptr(message),
uintptr(wParam),
uintptr(lParam),
)
}
return
}
func (t *winTray) Quit() {
quitOnce.Do(quit)
}
func quit() {
boolRet, _, err := pPostMessage.Call(
uintptr(wt.window),
WM_CLOSE,
0,
0,
)
if boolRet == 0 {
slog.Error(fmt.Sprintf("failed to post close message on shutdown %s", err))
}
}

71
app/tray/wintray/menus.go Normal file
View File

@@ -0,0 +1,71 @@
//go:build windows
package wintray
import (
"fmt"
"log/slog"
"unsafe"
"golang.org/x/sys/windows"
)
const (
updatAvailableMenuID = 1
updateMenuID = updatAvailableMenuID + 1
separatorMenuID = updateMenuID + 1
diagLogsMenuID = separatorMenuID + 1
diagSeparatorMenuID = diagLogsMenuID + 1
quitMenuID = diagSeparatorMenuID + 1
)
func (t *winTray) initMenus() error {
if err := t.addOrUpdateMenuItem(diagLogsMenuID, 0, diagLogsMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w\n", err)
}
if err := t.addSeparatorMenuItem(diagSeparatorMenuID, 0); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(quitMenuID, 0, quitMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w\n", err)
}
return nil
}
func (t *winTray) UpdateAvailable(ver string) error {
if !t.updateNotified {
slog.Debug("updating menu and sending notification for new update")
if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
iconFilePath, err := iconBytesToFilePath(wt.updateIcon)
if err != nil {
return fmt.Errorf("unable to write icon data to temp file: %w", err)
}
if err := wt.setIcon(iconFilePath); err != nil {
return fmt.Errorf("unable to set icon: %w", err)
}
t.updateNotified = true
t.pendingUpdate = true
// Now pop up the notification
t.muNID.Lock()
defer t.muNID.Unlock()
copy(t.nid.InfoTitle[:], windows.StringToUTF16(updateTitle))
copy(t.nid.Info[:], windows.StringToUTF16(fmt.Sprintf(updateMessage, ver)))
t.nid.Flags |= NIF_INFO
t.nid.Timeout = 10
t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
err = t.nid.modify()
if err != nil {
return err
}
}
return nil
}

View File

@@ -0,0 +1,15 @@
//go:build windows
package wintray
const (
firstTimeTitle = "Ollama is running"
firstTimeMessage = "Click here to get started"
updateTitle = "Update available"
updateMessage = "Ollama version %s is ready to install"
quitMenuTitle = "Quit Ollama"
updateAvailableMenuTitle = "An update is available"
updateMenutTitle = "Restart to update"
diagLogsMenuTitle = "View logs"
)

View File

@@ -0,0 +1,66 @@
//go:build windows
package wintray
import (
"unsafe"
"golang.org/x/sys/windows"
)
// Contains information that the system needs to display notifications in the notification area.
// Used by Shell_NotifyIcon.
// https://msdn.microsoft.com/en-us/library/windows/desktop/bb773352(v=vs.85).aspx
// https://msdn.microsoft.com/en-us/library/windows/desktop/bb762159
type notifyIconData struct {
Size uint32
Wnd windows.Handle
ID, Flags, CallbackMessage uint32
Icon windows.Handle
Tip [128]uint16
State, StateMask uint32
Info [256]uint16
// Timeout, Version uint32
Timeout uint32
InfoTitle [64]uint16
InfoFlags uint32
GuidItem windows.GUID
BalloonIcon windows.Handle
}
func (nid *notifyIconData) add() error {
const NIM_ADD = 0x00000000
res, _, err := pShellNotifyIcon.Call(
uintptr(NIM_ADD),
uintptr(unsafe.Pointer(nid)),
)
if res == 0 {
return err
}
return nil
}
func (nid *notifyIconData) modify() error {
const NIM_MODIFY = 0x00000001
res, _, err := pShellNotifyIcon.Call(
uintptr(NIM_MODIFY),
uintptr(unsafe.Pointer(nid)),
)
if res == 0 {
return err
}
return nil
}
func (nid *notifyIconData) delete() error {
const NIM_DELETE = 0x00000002
res, _, err := pShellNotifyIcon.Call(
uintptr(NIM_DELETE),
uintptr(unsafe.Pointer(nid)),
)
if res == 0 {
return err
}
return nil
}

485
app/tray/wintray/tray.go Normal file
View File

@@ -0,0 +1,485 @@
//go:build windows
package wintray
import (
"crypto/md5"
"encoding/hex"
"fmt"
"log/slog"
"os"
"path/filepath"
"sort"
"sync"
"unsafe"
"github.com/jmorganca/ollama/app/tray/commontray"
"golang.org/x/sys/windows"
)
// Helpful sources: https://github.com/golang/exp/blob/master/shiny/driver/internal/win32
// Contains information about loaded resources
type winTray struct {
instance,
icon,
cursor,
window windows.Handle
loadedImages map[string]windows.Handle
muLoadedImages sync.RWMutex
// menus keeps track of the submenus keyed by the menu item ID, plus 0
// which corresponds to the main popup menu.
menus map[uint32]windows.Handle
muMenus sync.RWMutex
menuOf map[uint32]windows.Handle
muMenuOf sync.RWMutex
// menuItemIcons maintains the bitmap of each menu item (if applies). It's
// needed to show the icon correctly when showing a previously hidden menu
// item again.
// menuItemIcons map[uint32]windows.Handle
// muMenuItemIcons sync.RWMutex
visibleItems map[uint32][]uint32
muVisibleItems sync.RWMutex
nid *notifyIconData
muNID sync.RWMutex
wcex *wndClassEx
wmSystrayMessage,
wmTaskbarCreated uint32
pendingUpdate bool
updateNotified bool // Only pop up the notification once - TODO consider daily nag?
// Callbacks
callbacks commontray.Callbacks
normalIcon []byte
updateIcon []byte
}
var wt winTray
func (t *winTray) GetCallbacks() commontray.Callbacks {
return t.callbacks
}
func InitTray(icon, updateIcon []byte) (*winTray, error) {
wt.callbacks.Quit = make(chan struct{})
wt.callbacks.Update = make(chan struct{})
wt.callbacks.ShowLogs = make(chan struct{})
wt.callbacks.DoFirstUse = make(chan struct{})
wt.normalIcon = icon
wt.updateIcon = updateIcon
if err := wt.initInstance(); err != nil {
return nil, fmt.Errorf("Unable to init instance: %w\n", err)
}
if err := wt.createMenu(); err != nil {
return nil, fmt.Errorf("Unable to create menu: %w\n", err)
}
iconFilePath, err := iconBytesToFilePath(wt.normalIcon)
if err != nil {
return nil, fmt.Errorf("Unable to write icon data to temp file: %w", err)
}
if err := wt.setIcon(iconFilePath); err != nil {
return nil, fmt.Errorf("Unable to set icon: %w", err)
}
return &wt, wt.initMenus()
}
func (t *winTray) initInstance() error {
const (
className = "OllamaClass"
windowName = ""
)
t.wmSystrayMessage = WM_USER + 1
t.visibleItems = make(map[uint32][]uint32)
t.menus = make(map[uint32]windows.Handle)
t.menuOf = make(map[uint32]windows.Handle)
t.loadedImages = make(map[string]windows.Handle)
taskbarEventNamePtr, _ := windows.UTF16PtrFromString("TaskbarCreated")
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms644947
res, _, err := pRegisterWindowMessage.Call(
uintptr(unsafe.Pointer(taskbarEventNamePtr)),
)
if res == 0 { // success 0xc000-0xfff
return fmt.Errorf("failed to register window: %w", err)
}
t.wmTaskbarCreated = uint32(res)
instanceHandle, _, err := pGetModuleHandle.Call(0)
if instanceHandle == 0 {
return err
}
t.instance = windows.Handle(instanceHandle)
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms648072(v=vs.85).aspx
iconHandle, _, err := pLoadIcon.Call(0, uintptr(IDI_APPLICATION))
if iconHandle == 0 {
return err
}
t.icon = windows.Handle(iconHandle)
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms648391(v=vs.85).aspx
cursorHandle, _, err := pLoadCursor.Call(0, uintptr(IDC_ARROW))
if cursorHandle == 0 {
return err
}
t.cursor = windows.Handle(cursorHandle)
classNamePtr, err := windows.UTF16PtrFromString(className)
if err != nil {
return err
}
windowNamePtr, err := windows.UTF16PtrFromString(windowName)
if err != nil {
return err
}
t.wcex = &wndClassEx{
Style: CS_HREDRAW | CS_VREDRAW,
WndProc: windows.NewCallback(t.wndProc),
Instance: t.instance,
Icon: t.icon,
Cursor: t.cursor,
Background: windows.Handle(6), // (COLOR_WINDOW + 1)
ClassName: classNamePtr,
IconSm: t.icon,
}
if err := t.wcex.register(); err != nil {
return err
}
windowHandle, _, err := pCreateWindowEx.Call(
uintptr(0),
uintptr(unsafe.Pointer(classNamePtr)),
uintptr(unsafe.Pointer(windowNamePtr)),
uintptr(WS_OVERLAPPEDWINDOW),
uintptr(CW_USEDEFAULT),
uintptr(CW_USEDEFAULT),
uintptr(CW_USEDEFAULT),
uintptr(CW_USEDEFAULT),
uintptr(0),
uintptr(0),
uintptr(t.instance),
uintptr(0),
)
if windowHandle == 0 {
return err
}
t.window = windows.Handle(windowHandle)
pShowWindow.Call(uintptr(t.window), uintptr(SW_HIDE)) //nolint:errcheck
boolRet, _, err := pUpdateWindow.Call(uintptr(t.window))
if boolRet == 0 {
slog.Error(fmt.Sprintf("failed to update window: %s", err))
}
t.muNID.Lock()
defer t.muNID.Unlock()
t.nid = &notifyIconData{
Wnd: windows.Handle(t.window),
ID: 100,
Flags: NIF_MESSAGE,
CallbackMessage: t.wmSystrayMessage,
}
t.nid.Size = uint32(unsafe.Sizeof(*t.nid))
return t.nid.add()
}
func (t *winTray) createMenu() error {
menuHandle, _, err := pCreatePopupMenu.Call()
if menuHandle == 0 {
return err
}
t.menus[0] = windows.Handle(menuHandle)
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms647575(v=vs.85).aspx
mi := struct {
Size, Mask, Style, Max uint32
Background windows.Handle
ContextHelpID uint32
MenuData uintptr
}{
Mask: MIM_APPLYTOSUBMENUS,
}
mi.Size = uint32(unsafe.Sizeof(mi))
res, _, err := pSetMenuInfo.Call(
uintptr(t.menus[0]),
uintptr(unsafe.Pointer(&mi)),
)
if res == 0 {
return err
}
return nil
}
// Contains information about a menu item.
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms647578(v=vs.85).aspx
type menuItemInfo struct {
Size, Mask, Type, State uint32
ID uint32
SubMenu, Checked, Unchecked windows.Handle
ItemData uintptr
TypeData *uint16
Cch uint32
BMPItem windows.Handle
}
func (t *winTray) addOrUpdateMenuItem(menuItemId uint32, parentId uint32, title string, disabled bool) error {
titlePtr, err := windows.UTF16PtrFromString(title)
if err != nil {
return err
}
mi := menuItemInfo{
Mask: MIIM_FTYPE | MIIM_STRING | MIIM_ID | MIIM_STATE,
Type: MFT_STRING,
ID: uint32(menuItemId),
TypeData: titlePtr,
Cch: uint32(len(title)),
}
mi.Size = uint32(unsafe.Sizeof(mi))
if disabled {
mi.State |= MFS_DISABLED
}
var res uintptr
t.muMenus.RLock()
menu := t.menus[parentId]
t.muMenus.RUnlock()
if t.getVisibleItemIndex(parentId, menuItemId) != -1 {
// We set the menu item info based on the menuID
boolRet, _, err := pSetMenuItemInfo.Call(
uintptr(menu),
uintptr(menuItemId),
0,
uintptr(unsafe.Pointer(&mi)),
)
if boolRet == 0 {
return fmt.Errorf("failed to set menu item: %w", err)
}
}
if res == 0 {
// Menu item does not already exist, create it
t.muMenus.RLock()
submenu, exists := t.menus[menuItemId]
t.muMenus.RUnlock()
if exists {
mi.Mask |= MIIM_SUBMENU
mi.SubMenu = submenu
}
t.addToVisibleItems(parentId, menuItemId)
position := t.getVisibleItemIndex(parentId, menuItemId)
res, _, err = pInsertMenuItem.Call(
uintptr(menu),
uintptr(position),
1,
uintptr(unsafe.Pointer(&mi)),
)
if res == 0 {
t.delFromVisibleItems(parentId, menuItemId)
return err
}
t.muMenuOf.Lock()
t.menuOf[menuItemId] = menu
t.muMenuOf.Unlock()
}
return nil
}
func (t *winTray) addSeparatorMenuItem(menuItemId, parentId uint32) error {
mi := menuItemInfo{
Mask: MIIM_FTYPE | MIIM_ID | MIIM_STATE,
Type: MFT_SEPARATOR,
ID: uint32(menuItemId),
}
mi.Size = uint32(unsafe.Sizeof(mi))
t.addToVisibleItems(parentId, menuItemId)
position := t.getVisibleItemIndex(parentId, menuItemId)
t.muMenus.RLock()
menu := uintptr(t.menus[parentId])
t.muMenus.RUnlock()
res, _, err := pInsertMenuItem.Call(
menu,
uintptr(position),
1,
uintptr(unsafe.Pointer(&mi)),
)
if res == 0 {
return err
}
return nil
}
// func (t *winTray) hideMenuItem(menuItemId, parentId uint32) error {
// const ERROR_SUCCESS syscall.Errno = 0
// t.muMenus.RLock()
// menu := uintptr(t.menus[parentId])
// t.muMenus.RUnlock()
// res, _, err := pRemoveMenu.Call(
// menu,
// uintptr(menuItemId),
// MF_BYCOMMAND,
// )
// if res == 0 && err.(syscall.Errno) != ERROR_SUCCESS {
// return err
// }
// t.delFromVisibleItems(parentId, menuItemId)
// return nil
// }
func (t *winTray) showMenu() error {
p := point{}
boolRet, _, err := pGetCursorPos.Call(uintptr(unsafe.Pointer(&p)))
if boolRet == 0 {
return err
}
boolRet, _, err = pSetForegroundWindow.Call(uintptr(t.window))
if boolRet == 0 {
slog.Warn(fmt.Sprintf("failed to bring menu to foreground: %s", err))
}
boolRet, _, err = pTrackPopupMenu.Call(
uintptr(t.menus[0]),
TPM_BOTTOMALIGN|TPM_LEFTALIGN,
uintptr(p.X),
uintptr(p.Y),
0,
uintptr(t.window),
0,
)
if boolRet == 0 {
return err
}
return nil
}
func (t *winTray) delFromVisibleItems(parent, val uint32) {
t.muVisibleItems.Lock()
defer t.muVisibleItems.Unlock()
visibleItems := t.visibleItems[parent]
for i, itemval := range visibleItems {
if val == itemval {
t.visibleItems[parent] = append(visibleItems[:i], visibleItems[i+1:]...)
break
}
}
}
func (t *winTray) addToVisibleItems(parent, val uint32) {
t.muVisibleItems.Lock()
defer t.muVisibleItems.Unlock()
if visibleItems, exists := t.visibleItems[parent]; !exists {
t.visibleItems[parent] = []uint32{val}
} else {
newvisible := append(visibleItems, val)
sort.Slice(newvisible, func(i, j int) bool { return newvisible[i] < newvisible[j] })
t.visibleItems[parent] = newvisible
}
}
func (t *winTray) getVisibleItemIndex(parent, val uint32) int {
t.muVisibleItems.RLock()
defer t.muVisibleItems.RUnlock()
for i, itemval := range t.visibleItems[parent] {
if val == itemval {
return i
}
}
return -1
}
func iconBytesToFilePath(iconBytes []byte) (string, error) {
bh := md5.Sum(iconBytes)
dataHash := hex.EncodeToString(bh[:])
iconFilePath := filepath.Join(os.TempDir(), "ollama_temp_icon_"+dataHash)
if _, err := os.Stat(iconFilePath); os.IsNotExist(err) {
if err := os.WriteFile(iconFilePath, iconBytes, 0644); err != nil {
return "", err
}
}
return iconFilePath, nil
}
// Loads an image from file and shows it in tray.
// Shell_NotifyIcon: https://msdn.microsoft.com/en-us/library/windows/desktop/bb762159(v=vs.85).aspx
func (t *winTray) setIcon(src string) error {
h, err := t.loadIconFrom(src)
if err != nil {
return err
}
t.muNID.Lock()
defer t.muNID.Unlock()
t.nid.Icon = h
t.nid.Flags |= NIF_ICON
t.nid.Size = uint32(unsafe.Sizeof(*t.nid))
return t.nid.modify()
}
// Loads an image from file to be shown in tray or menu item.
// LoadImage: https://msdn.microsoft.com/en-us/library/windows/desktop/ms648045(v=vs.85).aspx
func (t *winTray) loadIconFrom(src string) (windows.Handle, error) {
// Save and reuse handles of loaded images
t.muLoadedImages.RLock()
h, ok := t.loadedImages[src]
t.muLoadedImages.RUnlock()
if !ok {
srcPtr, err := windows.UTF16PtrFromString(src)
if err != nil {
return 0, err
}
res, _, err := pLoadImage.Call(
0,
uintptr(unsafe.Pointer(srcPtr)),
IMAGE_ICON,
0,
0,
LR_LOADFROMFILE|LR_DEFAULTSIZE,
)
if res == 0 {
return 0, err
}
h = windows.Handle(res)
t.muLoadedImages.Lock()
t.loadedImages[src] = h
t.muLoadedImages.Unlock()
}
return h, nil
}
func (t *winTray) DisplayFirstUseNotification() error {
t.muNID.Lock()
defer t.muNID.Unlock()
copy(t.nid.InfoTitle[:], windows.StringToUTF16(firstTimeTitle))
copy(t.nid.Info[:], windows.StringToUTF16(firstTimeMessage))
t.nid.Flags |= NIF_INFO
t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
return t.nid.modify()
}

View File

@@ -0,0 +1,89 @@
//go:build windows
package wintray
import (
"runtime"
"golang.org/x/sys/windows"
)
var (
k32 = windows.NewLazySystemDLL("Kernel32.dll")
u32 = windows.NewLazySystemDLL("User32.dll")
s32 = windows.NewLazySystemDLL("Shell32.dll")
pCreatePopupMenu = u32.NewProc("CreatePopupMenu")
pCreateWindowEx = u32.NewProc("CreateWindowExW")
pDefWindowProc = u32.NewProc("DefWindowProcW")
pDestroyWindow = u32.NewProc("DestroyWindow")
pDispatchMessage = u32.NewProc("DispatchMessageW")
pGetCursorPos = u32.NewProc("GetCursorPos")
pGetMessage = u32.NewProc("GetMessageW")
pGetModuleHandle = k32.NewProc("GetModuleHandleW")
pInsertMenuItem = u32.NewProc("InsertMenuItemW")
pLoadCursor = u32.NewProc("LoadCursorW")
pLoadIcon = u32.NewProc("LoadIconW")
pLoadImage = u32.NewProc("LoadImageW")
pPostMessage = u32.NewProc("PostMessageW")
pPostQuitMessage = u32.NewProc("PostQuitMessage")
pRegisterClass = u32.NewProc("RegisterClassExW")
pRegisterWindowMessage = u32.NewProc("RegisterWindowMessageW")
pSetForegroundWindow = u32.NewProc("SetForegroundWindow")
pSetMenuInfo = u32.NewProc("SetMenuInfo")
pSetMenuItemInfo = u32.NewProc("SetMenuItemInfoW")
pShellNotifyIcon = s32.NewProc("Shell_NotifyIconW")
pShowWindow = u32.NewProc("ShowWindow")
pTrackPopupMenu = u32.NewProc("TrackPopupMenu")
pTranslateMessage = u32.NewProc("TranslateMessage")
pUnregisterClass = u32.NewProc("UnregisterClassW")
pUpdateWindow = u32.NewProc("UpdateWindow")
)
const (
CS_HREDRAW = 0x0002
CS_VREDRAW = 0x0001
CW_USEDEFAULT = 0x80000000
IDC_ARROW = 32512 // Standard arrow
IDI_APPLICATION = 32512
IMAGE_ICON = 1 // Loads an icon
LR_DEFAULTSIZE = 0x00000040 // Loads default-size icon for windows(SM_CXICON x SM_CYICON) if cx, cy are set to zero
LR_LOADFROMFILE = 0x00000010 // Loads the stand-alone image from the file
MF_BYCOMMAND = 0x00000000
MFS_DISABLED = 0x00000003
MFT_SEPARATOR = 0x00000800
MFT_STRING = 0x00000000
MIIM_BITMAP = 0x00000080
MIIM_FTYPE = 0x00000100
MIIM_ID = 0x00000002
MIIM_STATE = 0x00000001
MIIM_STRING = 0x00000040
MIIM_SUBMENU = 0x00000004
MIM_APPLYTOSUBMENUS = 0x80000000
NIF_ICON = 0x00000002
NIF_INFO = 0x00000010
NIF_MESSAGE = 0x00000001
SW_HIDE = 0
TPM_BOTTOMALIGN = 0x0020
TPM_LEFTALIGN = 0x0000
WM_CLOSE = 0x0010
WM_USER = 0x0400
WS_CAPTION = 0x00C00000
WS_MAXIMIZEBOX = 0x00010000
WS_MINIMIZEBOX = 0x00020000
WS_OVERLAPPED = 0x00000000
WS_OVERLAPPEDWINDOW = WS_OVERLAPPED | WS_CAPTION | WS_SYSMENU | WS_THICKFRAME | WS_MINIMIZEBOX | WS_MAXIMIZEBOX
WS_SYSMENU = 0x00080000
WS_THICKFRAME = 0x00040000
)
// Not sure if this is actually needed on windows
func init() {
runtime.LockOSThread()
}
// The POINT structure defines the x- and y- coordinates of a point.
// https://msdn.microsoft.com/en-us/library/windows/desktop/dd162805(v=vs.85).aspx
type point struct {
X, Y int32
}

View File

@@ -0,0 +1,45 @@
//go:build windows
package wintray
import (
"unsafe"
"golang.org/x/sys/windows"
)
// Contains window class information.
// It is used with the RegisterClassEx and GetClassInfoEx functions.
// https://msdn.microsoft.com/en-us/library/ms633577.aspx
type wndClassEx struct {
Size, Style uint32
WndProc uintptr
ClsExtra, WndExtra int32
Instance, Icon, Cursor, Background windows.Handle
MenuName, ClassName *uint16
IconSm windows.Handle
}
// Registers a window class for subsequent use in calls to the CreateWindow or CreateWindowEx function.
// https://msdn.microsoft.com/en-us/library/ms633587.aspx
func (w *wndClassEx) register() error {
w.Size = uint32(unsafe.Sizeof(*w))
res, _, err := pRegisterClass.Call(uintptr(unsafe.Pointer(w)))
if res == 0 {
return err
}
return nil
}
// Unregisters a window class, freeing the memory required for the class.
// https://msdn.microsoft.com/en-us/library/ms644899.aspx
func (w *wndClassEx) unregister() error {
res, _, err := pUnregisterClass.Call(
uintptr(unsafe.Pointer(w.ClassName)),
uintptr(w.Instance),
)
if res == 0 {
return err
}
return nil
}

61
auth/auth.go Normal file
View File

@@ -0,0 +1,61 @@
package auth
import (
"bytes"
"context"
"crypto/rand"
"encoding/base64"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"golang.org/x/crypto/ssh"
)
const defaultPrivateKey = "id_ed25519"
func NewNonce(r io.Reader, length int) (string, error) {
nonce := make([]byte, length)
if _, err := io.ReadFull(r, nonce); err != nil {
return "", err
}
return base64.RawURLEncoding.EncodeToString(nonce), nil
}
func Sign(ctx context.Context, bts []byte) (string, error) {
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
privateKeyFile, err := os.ReadFile(keyPath)
if err != nil {
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
return "", err
}
privateKey, err := ssh.ParsePrivateKey(privateKeyFile)
if err != nil {
return "", err
}
// get the pubkey, but remove the type
publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())
parts := bytes.Split(publicKey, []byte(" "))
if len(parts) < 2 {
return "", fmt.Errorf("malformed public key")
}
signedData, err := privateKey.Sign(rand.Reader, bts)
if err != nil {
return "", err
}
// signature is <pubkey>:<signature>
return fmt.Sprintf("%s:%s", bytes.TrimSpace(parts[1]), base64.StdEncoding.EncodeToString(signedData.Blob)), nil
}

View File

@@ -14,15 +14,15 @@ import (
"net"
"net/http"
"os"
"os/exec"
"os/signal"
"path/filepath"
"regexp"
"runtime"
"strings"
"syscall"
"time"
"github.com/containerd/console"
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
"golang.org/x/crypto/ssh"
@@ -33,13 +33,10 @@ import (
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/readline"
"github.com/jmorganca/ollama/server"
"github.com/jmorganca/ollama/version"
)
type ImageData []byte
func CreateHandler(cmd *cobra.Command, args []string) error {
filename, _ := cmd.Flags().GetString("file")
filename, err := filepath.Abs(filename)
@@ -151,19 +148,68 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
name := args[0]
// check if the model exists on the server
_, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, args); err != nil {
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
return RunGenerate(cmd, args)
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
}
format, err := cmd.Flags().GetString("format")
if err != nil {
return err
}
opts.Format = format
prompts := args[1:]
// prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) {
in, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
prompts = append([]string{string(in)}, prompts...)
opts.WordWrap = false
interactive = false
}
opts.Prompt = strings.Join(prompts, " ")
if len(prompts) > 0 {
interactive = false
}
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
}
return generateInteractive(cmd, opts)
}
func PushHandler(cmd *cobra.Command, args []string) error {
@@ -415,66 +461,139 @@ func PullHandler(cmd *cobra.Command, args []string) error {
return nil
}
func RunGenerate(cmd *cobra.Command, args []string) error {
interactive := true
opts := generateOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
Images: []ImageData{},
}
format, err := cmd.Flags().GetString("format")
if err != nil {
return err
}
opts.Format = format
prompts := args[1:]
// prepend stdin to the prompt if provided
if !term.IsTerminal(int(os.Stdin.Fd())) {
in, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
prompts = append([]string{string(in)}, prompts...)
opts.WordWrap = false
interactive = false
}
opts.Prompt = strings.Join(prompts, " ")
if len(prompts) > 0 {
interactive = false
}
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
}
return generateInteractive(cmd, opts)
}
type generateContextKey string
type generateOptions struct {
Model string
Prompt string
WordWrap bool
Format string
System string
Template string
Images []ImageData
Options map[string]interface{}
type runOptions struct {
Model string
ParentModel string
Prompt string
Messages []api.Message
WordWrap bool
Format string
System string
Template string
Images []api.ImageData
Options map[string]interface{}
MultiModal bool
}
func generate(cmd *cobra.Command, opts generateOptions) error {
type displayResponseState struct {
lineLength int
wordBuffer string
}
func displayResponse(content string, wordWrap bool, state *displayResponseState) {
termWidth, _, _ := term.GetSize(int(os.Stdout.Fd()))
if wordWrap && termWidth >= 10 {
for _, ch := range content {
if state.lineLength+1 > termWidth-5 {
if len(state.wordBuffer) > termWidth-10 {
fmt.Printf("%s%c", state.wordBuffer, ch)
state.wordBuffer = ""
state.lineLength = 0
continue
}
// backtrack the length of the last word and clear to the end of the line
fmt.Printf("\x1b[%dD\x1b[K\n", len(state.wordBuffer))
fmt.Printf("%s%c", state.wordBuffer, ch)
state.lineLength = len(state.wordBuffer) + 1
} else {
fmt.Print(string(ch))
state.lineLength += 1
switch ch {
case ' ':
state.wordBuffer = ""
case '\n':
state.lineLength = 0
default:
state.wordBuffer += string(ch)
}
}
}
} else {
fmt.Printf("%s%s", state.wordBuffer, content)
if len(state.wordBuffer) > 0 {
state.wordBuffer = ""
}
}
}
func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
client, err := api.ClientFromEnvironment()
if err != nil {
return nil, err
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
cancelCtx, cancel := context.WithCancel(cmd.Context())
defer cancel()
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT)
go func() {
<-sigChan
cancel()
}()
var state *displayResponseState = &displayResponseState{}
var latest api.ChatResponse
var fullResponse strings.Builder
var role string
fn := func(response api.ChatResponse) error {
p.StopAndClear()
latest = response
role = response.Message.Role
content := response.Message.Content
fullResponse.WriteString(content)
displayResponse(content, opts.WordWrap, state)
return nil
}
req := &api.ChatRequest{
Model: opts.Model,
Messages: opts.Messages,
Format: opts.Format,
Options: opts.Options,
}
if err := client.Chat(cancelCtx, req, fn); err != nil {
if errors.Is(err, context.Canceled) {
return nil, nil
}
return nil, err
}
if len(opts.Messages) > 0 {
fmt.Println()
fmt.Println()
}
verbose, err := cmd.Flags().GetBool("verbose")
if err != nil {
return nil, err
}
if verbose {
latest.Summary()
}
return &api.Message{Role: role, Content: fullResponse.String()}, nil
}
func generate(cmd *cobra.Command, opts runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
@@ -493,11 +612,6 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
generateContext = []int{}
}
termWidth, _, err := term.GetSize(int(os.Stdout.Fd()))
if err != nil {
opts.WordWrap = false
}
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()
@@ -509,94 +623,44 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
cancel()
}()
var currentLineLength int
var wordBuffer string
var state *displayResponseState = &displayResponseState{}
fn := func(response api.GenerateResponse) error {
p.StopAndClear()
latest = response
content := response.Response
termWidth, _, _ = term.GetSize(int(os.Stdout.Fd()))
if opts.WordWrap && termWidth >= 10 {
for _, ch := range response.Response {
if currentLineLength+1 > termWidth-5 {
if len(wordBuffer) > termWidth-10 {
fmt.Printf("%s%c", wordBuffer, ch)
wordBuffer = ""
currentLineLength = 0
continue
}
// backtrack the length of the last word and clear to the end of the line
fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
fmt.Printf("%s%c", wordBuffer, ch)
currentLineLength = len(wordBuffer) + 1
} else {
fmt.Print(string(ch))
currentLineLength += 1
switch ch {
case ' ':
wordBuffer = ""
case '\n':
currentLineLength = 0
default:
wordBuffer += string(ch)
}
}
}
} else {
fmt.Printf("%s%s", wordBuffer, response.Response)
if len(wordBuffer) > 0 {
wordBuffer = ""
}
}
displayResponse(content, opts.WordWrap, state)
return nil
}
images := make([]api.ImageData, 0)
for _, i := range opts.Images {
images = append(images, api.ImageData(i))
if opts.MultiModal {
opts.Prompt, opts.Images, err = extractFileData(opts.Prompt)
if err != nil {
return err
}
}
request := api.GenerateRequest{
Model: opts.Model,
Prompt: opts.Prompt,
Context: generateContext,
Images: opts.Images,
Format: opts.Format,
System: opts.System,
Template: opts.Template,
Options: opts.Options,
Images: images,
}
if err := client.Generate(ctx, &request, fn); err != nil {
switch {
case errors.Is(err, context.Canceled):
if errors.Is(err, context.Canceled) {
return nil
case strings.Contains(err.Error(), "unsupported model format"):
// pull and retry to see if the model has been updated
parts := strings.Split(opts.Model, string(os.PathSeparator))
if len(parts) == 1 {
// this is a library model, log some info
fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
}
if err := PullHandler(cmd, []string{opts.Model}); err != nil {
fmt.Printf("Error: %s\n", err)
return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
}
// retry
if err := client.Generate(ctx, &request, fn); err != nil {
if errors.Is(err, context.Canceled) {
return nil
}
return err
}
default:
return err
}
return err
}
if opts.Prompt != "" {
fmt.Println()
fmt.Println()
@@ -621,461 +685,8 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
return nil
}
type MultilineState int
const (
MultilineNone MultilineState = iota
MultilinePrompt
MultilineSystem
MultilineTemplate
)
func modelIsMultiModal(cmd *cobra.Command, name string) bool {
// get model details
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return false
}
req := api.ShowRequest{Name: name}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return false
}
return slices.Contains(resp.Details.Families, "clip")
}
func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
multiModal := modelIsMultiModal(cmd, opts.Model)
// load the model
loadOpts := generateOptions{
Model: opts.Model,
Prompt: "",
Images: []ImageData{},
}
if err := generate(cmd, loadOpts); err != nil {
return err
}
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
fmt.Fprintln(os.Stderr, "")
}
usageSet := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
fmt.Fprintln(os.Stderr, " /set history Enable history")
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
fmt.Fprintln(os.Stderr, "")
}
usageShortcuts := func() {
fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
fmt.Fprintln(os.Stderr, " Ctrl + a Move to the beginning of the line (Home)")
fmt.Fprintln(os.Stderr, " Ctrl + e Move to the end of the line (End)")
fmt.Fprintln(os.Stderr, " Alt + b Move back (left) one word")
fmt.Fprintln(os.Stderr, " Alt + f Move forward (right) one word")
fmt.Fprintln(os.Stderr, " Ctrl + k Delete the sentence after the cursor")
fmt.Fprintln(os.Stderr, " Ctrl + u Delete the sentence before the cursor")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, " Ctrl + l Clear the screen")
fmt.Fprintln(os.Stderr, " Ctrl + c Stop the model from responding")
fmt.Fprintln(os.Stderr, " Ctrl + d Exit ollama (/bye)")
fmt.Fprintln(os.Stderr, "")
}
usageShow := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /show license Show model license")
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
fmt.Fprintln(os.Stderr, " /show system Show system message")
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
fmt.Fprintln(os.Stderr, "")
}
// only list out the most common parameters
usageParameters := func() {
fmt.Fprintln(os.Stderr, "Available Parameters:")
fmt.Fprintln(os.Stderr, " /set parameter seed <int> Random number seed")
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
fmt.Fprintln(os.Stderr, "")
}
scanner, err := readline.New(readline.Prompt{
Prompt: ">>> ",
AltPrompt: "... ",
Placeholder: "Send a message (/? for help)",
AltPlaceholder: `Use """ to end multi-line input`,
})
if err != nil {
return err
}
fmt.Print(readline.StartBracketedPaste)
defer fmt.Printf(readline.EndBracketedPaste)
var multiline MultilineState
var prompt string
for {
line, err := scanner.Readline()
switch {
case errors.Is(err, io.EOF):
fmt.Println()
return nil
case errors.Is(err, readline.ErrInterrupt):
if line == "" {
fmt.Println("\nUse Ctrl + d or /bye to exit.")
}
scanner.Prompt.UseAlt = false
prompt = ""
continue
case err != nil:
return err
}
switch {
case strings.HasPrefix(prompt, `"""`):
// if the prompt so far starts with """ then we're in multiline mode
// and we need to keep reading until we find a line that ends with """
cut, found := strings.CutSuffix(line, `"""`)
prompt += cut
if !found {
prompt += "\n"
continue
}
prompt = strings.TrimPrefix(prompt, `"""`)
scanner.Prompt.UseAlt = false
switch multiline {
case MultilineSystem:
opts.System = prompt
prompt = ""
fmt.Println("Set system message.")
case MultilineTemplate:
opts.Template = prompt
prompt = ""
fmt.Println("Set prompt template.")
}
multiline = MultilineNone
case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
scanner.Prompt.UseAlt = true
multiline = MultilinePrompt
prompt += line + "\n"
continue
case scanner.Pasting:
prompt += line + "\n"
continue
case strings.HasPrefix(line, "/list"):
args := strings.Fields(line)
if err := ListHandler(cmd, args[1:]); err != nil {
return err
}
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "history":
scanner.HistoryEnable()
case "nohistory":
scanner.HistoryDisable()
case "wordwrap":
opts.WordWrap = true
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
opts.WordWrap = false
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
cmd.Flags().Set("verbose", "true")
fmt.Println("Set 'verbose' mode.")
case "quiet":
cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
} else {
opts.Format = args[2]
fmt.Printf("Set format to '%s' mode.\n", args[2])
}
case "noformat":
opts.Format = ""
fmt.Println("Disabled format.")
case "parameter":
if len(args) < 4 {
usageParameters()
continue
}
var params []string
for _, p := range args[3:] {
params = append(params, p)
}
fp, err := api.FormatParams(map[string][]string{args[2]: params})
if err != nil {
fmt.Printf("Couldn't set parameter: %q\n\n", err)
continue
}
fmt.Printf("Set parameter '%s' to '%s'\n\n", args[2], strings.Join(params, ", "))
opts.Options[args[2]] = fp[args[2]]
case "system", "template":
if len(args) < 3 {
usageSet()
continue
}
line := strings.Join(args[2:], " ")
line = strings.TrimPrefix(line, `"""`)
if strings.HasPrefix(args[2], `"""`) {
cut, found := strings.CutSuffix(line, `"""`)
prompt += cut
if found {
if args[1] == "system" {
opts.System = prompt
fmt.Println("Set system message.")
} else {
opts.Template = prompt
fmt.Println("Set prompt template.")
}
prompt = ""
} else {
prompt = `"""` + prompt + "\n"
if args[1] == "system" {
multiline = MultilineSystem
} else {
multiline = MultilineTemplate
}
scanner.Prompt.UseAlt = true
}
} else {
opts.System = line
fmt.Println("Set system message.")
}
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
}
} else {
usageSet()
}
case strings.HasPrefix(line, "/show"):
args := strings.Fields(line)
if len(args) > 1 {
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
resp, err := client.Show(cmd.Context(), &api.ShowRequest{Name: opts.Model})
if err != nil {
fmt.Println("error: couldn't get model")
return err
}
switch args[1] {
case "license":
if resp.License == "" {
fmt.Print("No license was specified for this model.\n\n")
} else {
fmt.Println(resp.License)
}
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
if resp.Parameters == "" {
fmt.Print("No parameters were specified for this model.\n\n")
} else {
if len(opts.Options) > 0 {
fmt.Println("User defined parameters:")
for k, v := range opts.Options {
fmt.Printf("%-*s %v\n", 30, k, v)
}
fmt.Println()
}
fmt.Println("Model defined parameters:")
fmt.Println(resp.Parameters)
}
case "system":
switch {
case opts.System != "":
fmt.Println(opts.System + "\n")
case resp.System != "":
fmt.Println(resp.System + "\n")
default:
fmt.Print("No system message was specified for this model.\n\n")
}
case "template":
switch {
case opts.Template != "":
fmt.Println(opts.Template + "\n")
case resp.Template != "":
fmt.Println(resp.Template)
default:
fmt.Print("No prompt template was specified for this model.\n\n")
}
default:
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
}
} else {
usageShow()
}
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "set", "/set":
usageSet()
case "show", "/show":
usageShow()
case "shortcut", "shortcuts":
usageShortcuts()
}
} else {
usage()
}
case line == "/exit", line == "/bye":
return nil
case strings.HasPrefix(line, "/"):
args := strings.Fields(line)
isFile := false
if multiModal {
for _, f := range extractFileNames(line) {
if strings.HasPrefix(f, args[0]) {
isFile = true
break
}
}
}
if isFile {
prompt += line
} else {
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
continue
}
default:
prompt += line
}
if len(prompt) > 0 && multiline == MultilineNone {
opts.Prompt = prompt
if multiModal {
newPrompt, images, err := extractFileData(prompt)
if err != nil {
return err
}
opts.Prompt = newPrompt
// reset the context if we find another image
if len(images) > 0 {
opts.Images = images
ctx := cmd.Context()
ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
cmd.SetContext(ctx)
}
if len(opts.Images) == 0 {
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
fmt.Println()
prompt = ""
continue
}
}
if err := generate(cmd, opts); err != nil {
return err
}
prompt = ""
}
}
}
func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements
replacements := map[string]string{
"\\ ": " ", // Escaped space
"\\(": "(", // Escaped left parenthesis
"\\)": ")", // Escaped right parenthesis
"\\[": "[", // Escaped left square bracket
"\\]": "]", // Escaped right square bracket
"\\{": "{", // Escaped left curly brace
"\\}": "}", // Escaped right curly brace
"\\$": "$", // Escaped dollar sign
"\\&": "&", // Escaped ampersand
"\\;": ";", // Escaped semicolon
"\\'": "'", // Escaped single quote
"\\\\": "\\", // Escaped backslash
"\\*": "*", // Escaped asterisk
"\\?": "?", // Escaped question mark
}
for escaped, actual := range replacements {
fp = strings.ReplaceAll(fp, escaped, actual)
}
return fp
}
func extractFileNames(input string) []string {
// Regex to match file paths starting with / or ./ and include escaped spaces (\ or %20)
// and followed by more characters and a file extension
regexPattern := `(?:\./|/)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
re := regexp.MustCompile(regexPattern)
return re.FindAllString(input, -1)
}
func extractFileData(input string) (string, []ImageData, error) {
filePaths := extractFileNames(input)
var imgs []ImageData
for _, fp := range filePaths {
nfp := normalizeFilePath(fp)
data, err := getImageData(nfp)
if err != nil {
if os.IsNotExist(err) {
continue
}
fmt.Printf("Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Printf("Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}
return input, imgs, nil
}
func RunServer(cmd *cobra.Command, _ []string) error {
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
host, port, err := net.SplitHostPort(strings.Trim(os.Getenv("OLLAMA_HOST"), "\"'"))
if err != nil {
host, port = "127.0.0.1", "11434"
if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
@@ -1095,50 +706,6 @@ func RunServer(cmd *cobra.Command, _ []string) error {
return server.Serve(ln)
}
func getImageData(filePath string) ([]byte, error) {
file, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer file.Close()
buf := make([]byte, 512)
_, err = file.Read(buf)
if err != nil {
return nil, err
}
contentType := http.DetectContentType(buf)
allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
if !slices.Contains(allowedTypes, contentType) {
return nil, fmt.Errorf("invalid image type: %s", contentType)
}
info, err := file.Stat()
if err != nil {
return nil, err
}
// Check if the file size exceeds 100MB
var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
if info.Size() > maxSize {
return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
}
buf = make([]byte, info.Size())
_, err = file.Seek(0, 0)
if err != nil {
return nil, err
}
_, err = io.ReadFull(file, buf)
if err != nil {
return nil, err
}
return buf, nil
}
func initializeKeypair() error {
home, err := os.UserHomeDir()
if err != nil {
@@ -1188,22 +755,8 @@ func initializeKeypair() error {
return nil
}
func startMacApp(ctx context.Context, client *api.Client) error {
exe, err := os.Executable()
if err != nil {
return err
}
link, err := os.Readlink(exe)
if err != nil {
return err
}
if !strings.Contains(link, "Ollama.app") {
return fmt.Errorf("could not find ollama app")
}
path := strings.Split(link, "Ollama.app")
if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
return err
}
//nolint:unused
func waitForServer(ctx context.Context, client *api.Client) error {
// wait for the server to start
timeout := time.After(5 * time.Second)
tick := time.Tick(500 * time.Millisecond)
@@ -1217,6 +770,7 @@ func startMacApp(ctx context.Context, client *api.Client) error {
}
}
}
}
func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
@@ -1225,15 +779,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
return err
}
if err := client.Heartbeat(cmd.Context()); err != nil {
if !strings.Contains(err.Error(), "connection refused") {
if !strings.Contains(err.Error(), " refused") {
return err
}
if runtime.GOOS == "darwin" {
if err := startMacApp(cmd.Context(), client); err != nil {
return fmt.Errorf("could not connect to ollama app, is it running?")
}
} else {
return fmt.Errorf("could not connect to ollama server, run 'ollama serve' to start it")
if err := startApp(cmd.Context(), client); err != nil {
return fmt.Errorf("could not connect to ollama app, is it running?")
}
}
return nil
@@ -1263,6 +813,11 @@ func NewCLI() *cobra.Command {
log.SetFlags(log.LstdFlags | log.Lshortfile)
cobra.EnableCommandSorting = false
if runtime.GOOS == "windows" {
// Enable colorful ANSI escape code in Windows terminal (disabled by default)
console.ConsoleFromFile(os.Stdout) //nolint:errcheck
}
rootCmd := &cobra.Command{
Use: "ollama",
Short: "Large language model runner",

663
cmd/interactive.go Normal file
View File

@@ -0,0 +1,663 @@
package cmd
import (
"errors"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"github.com/spf13/cobra"
"golang.org/x/exp/slices"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/readline"
)
type MultilineState int
const (
MultilineNone MultilineState = iota
MultilinePrompt
MultilineSystem
MultilineTemplate
)
func loadModel(cmd *cobra.Command, opts *runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
showReq := api.ShowRequest{Name: opts.Model}
showResp, err := client.Show(cmd.Context(), &showReq)
if err != nil {
return err
}
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
}
chatReq := &api.ChatRequest{
Model: opts.Model,
Messages: []api.Message{},
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
if len(opts.Messages) > 0 {
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
}
return nil
})
if err != nil {
return err
}
return nil
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = make([]api.Message, 0)
err := loadModel(cmd, &opts)
if err != nil {
return err
}
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
if opts.MultiModal {
fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
}
fmt.Fprintln(os.Stderr, "")
}
usageSet := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
fmt.Fprintln(os.Stderr, " /set history Enable history")
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
fmt.Fprintln(os.Stderr, "")
}
usageShortcuts := func() {
fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
fmt.Fprintln(os.Stderr, " Ctrl + a Move to the beginning of the line (Home)")
fmt.Fprintln(os.Stderr, " Ctrl + e Move to the end of the line (End)")
fmt.Fprintln(os.Stderr, " Alt + b Move back (left) one word")
fmt.Fprintln(os.Stderr, " Alt + f Move forward (right) one word")
fmt.Fprintln(os.Stderr, " Ctrl + k Delete the sentence after the cursor")
fmt.Fprintln(os.Stderr, " Ctrl + u Delete the sentence before the cursor")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, " Ctrl + l Clear the screen")
fmt.Fprintln(os.Stderr, " Ctrl + c Stop the model from responding")
fmt.Fprintln(os.Stderr, " Ctrl + d Exit ollama (/bye)")
fmt.Fprintln(os.Stderr, "")
}
usageShow := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /show info Show details for this model")
fmt.Fprintln(os.Stderr, " /show license Show model license")
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
fmt.Fprintln(os.Stderr, " /show system Show system message")
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
fmt.Fprintln(os.Stderr, "")
}
// only list out the most common parameters
usageParameters := func() {
fmt.Fprintln(os.Stderr, "Available Parameters:")
fmt.Fprintln(os.Stderr, " /set parameter seed <int> Random number seed")
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
fmt.Fprintln(os.Stderr, "")
}
scanner, err := readline.New(readline.Prompt{
Prompt: ">>> ",
AltPrompt: "... ",
Placeholder: "Send a message (/? for help)",
AltPlaceholder: `Use """ to end multi-line input`,
})
if err != nil {
return err
}
fmt.Print(readline.StartBracketedPaste)
defer fmt.Printf(readline.EndBracketedPaste)
var sb strings.Builder
var multiline MultilineState
for {
line, err := scanner.Readline()
switch {
case errors.Is(err, io.EOF):
fmt.Println()
return nil
case errors.Is(err, readline.ErrInterrupt):
if line == "" {
fmt.Println("\nUse Ctrl + d or /bye to exit.")
}
scanner.Prompt.UseAlt = false
sb.Reset()
continue
case err != nil:
return err
}
switch {
case multiline != MultilineNone:
// check if there's a multiline terminating string
before, ok := strings.CutSuffix(line, `"""`)
sb.WriteString(before)
if !ok {
fmt.Fprintln(&sb)
continue
}
switch multiline {
case MultilineSystem:
opts.System = sb.String()
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
fmt.Println("Set system message.")
sb.Reset()
case MultilineTemplate:
opts.Template = sb.String()
fmt.Println("Set prompt template.")
sb.Reset()
}
multiline = MultilineNone
scanner.Prompt.UseAlt = false
case strings.HasPrefix(line, `"""`):
line := strings.TrimPrefix(line, `"""`)
line, ok := strings.CutSuffix(line, `"""`)
sb.WriteString(line)
if !ok {
// no multiline terminating string; need more input
fmt.Fprintln(&sb)
multiline = MultilinePrompt
scanner.Prompt.UseAlt = true
}
case scanner.Pasting:
fmt.Fprintln(&sb, line)
continue
case strings.HasPrefix(line, "/list"):
args := strings.Fields(line)
if err := ListHandler(cmd, args[1:]); err != nil {
return err
}
case strings.HasPrefix(line, "/load"):
args := strings.Fields(line)
if len(args) != 2 {
fmt.Println("Usage:\n /load <modelname>")
continue
}
opts.Model = args[1]
opts.Messages = []api.Message{}
fmt.Printf("Loading model '%s'\n", opts.Model)
if err := loadModel(cmd, &opts); err != nil {
return err
}
continue
case strings.HasPrefix(line, "/save"):
args := strings.Fields(line)
if len(args) != 2 {
fmt.Println("Usage:\n /save <modelname>")
continue
}
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
req := &api.CreateRequest{
Name: args[1],
Modelfile: buildModelfile(opts),
}
fn := func(resp api.ProgressResponse) error { return nil }
err = client.Create(cmd.Context(), req, fn)
if err != nil {
fmt.Println("error: couldn't save model")
return err
}
fmt.Printf("Created new model '%s'\n", args[1])
continue
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "history":
scanner.HistoryEnable()
case "nohistory":
scanner.HistoryDisable()
case "wordwrap":
opts.WordWrap = true
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
opts.WordWrap = false
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
cmd.Flags().Set("verbose", "true")
fmt.Println("Set 'verbose' mode.")
case "quiet":
cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
} else {
opts.Format = args[2]
fmt.Printf("Set format to '%s' mode.\n", args[2])
}
case "noformat":
opts.Format = ""
fmt.Println("Disabled format.")
case "parameter":
if len(args) < 4 {
usageParameters()
continue
}
params := args[3:]
fp, err := api.FormatParams(map[string][]string{args[2]: params})
if err != nil {
fmt.Printf("Couldn't set parameter: %q\n", err)
continue
}
fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", "))
opts.Options[args[2]] = fp[args[2]]
case "system", "template":
if len(args) < 3 {
usageSet()
continue
}
if args[1] == "system" {
multiline = MultilineSystem
} else if args[1] == "template" {
multiline = MultilineTemplate
}
line := strings.Join(args[2:], " ")
line, ok := strings.CutPrefix(line, `"""`)
if !ok {
multiline = MultilineNone
} else {
// only cut suffix if the line is multiline
line, ok = strings.CutSuffix(line, `"""`)
if ok {
multiline = MultilineNone
}
}
sb.WriteString(line)
if multiline != MultilineNone {
scanner.Prompt.UseAlt = true
continue
}
if args[1] == "system" {
opts.System = sb.String() // for display in modelfile
newMessage := api.Message{Role: "system", Content: sb.String()}
// Check if the slice is not empty and the last message is from 'system'
if len(opts.Messages) > 0 && opts.Messages[len(opts.Messages)-1].Role == "system" {
// Replace the last message
opts.Messages[len(opts.Messages)-1] = newMessage
} else {
opts.Messages = append(opts.Messages, newMessage)
}
fmt.Println("Set system message.")
sb.Reset()
} else if args[1] == "template" {
opts.Template = sb.String()
fmt.Println("Set prompt template.")
sb.Reset()
}
sb.Reset()
continue
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
}
} else {
usageSet()
}
case strings.HasPrefix(line, "/show"):
args := strings.Fields(line)
if len(args) > 1 {
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
req := &api.ShowRequest{
Name: opts.Model,
System: opts.System,
Template: opts.Template,
Options: opts.Options,
}
resp, err := client.Show(cmd.Context(), req)
if err != nil {
fmt.Println("error: couldn't get model")
return err
}
switch args[1] {
case "info":
fmt.Println("Model details:")
if len(resp.Details.Families) > 0 {
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
} else if resp.Details.Family != "" {
fmt.Printf("Family %s\n", resp.Details.Family)
}
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
fmt.Println("")
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.")
} else {
fmt.Println(resp.License)
}
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
if resp.Parameters == "" {
fmt.Println("No parameters were specified for this model.")
} else {
if len(opts.Options) > 0 {
fmt.Println("User defined parameters:")
for k, v := range opts.Options {
fmt.Printf("%-*s %v\n", 30, k, v)
}
fmt.Println()
}
fmt.Println("Model defined parameters:")
fmt.Println(resp.Parameters)
}
case "system":
switch {
case opts.System != "":
fmt.Println(opts.System + "\n")
case resp.System != "":
fmt.Println(resp.System + "\n")
default:
fmt.Println("No system message was specified for this model.")
}
case "template":
switch {
case opts.Template != "":
fmt.Println(opts.Template + "\n")
case resp.Template != "":
fmt.Println(resp.Template)
default:
fmt.Println("No prompt template was specified for this model.")
}
default:
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
}
} else {
usageShow()
}
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "set", "/set":
usageSet()
case "show", "/show":
usageShow()
case "shortcut", "shortcuts":
usageShortcuts()
}
} else {
usage()
}
case strings.HasPrefix(line, "/exit"), strings.HasPrefix(line, "/bye"):
return nil
case strings.HasPrefix(line, "/"):
args := strings.Fields(line)
isFile := false
if opts.MultiModal {
for _, f := range extractFileNames(line) {
if strings.HasPrefix(f, args[0]) {
isFile = true
break
}
}
}
if !isFile {
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
continue
}
sb.WriteString(line)
default:
sb.WriteString(line)
}
if sb.Len() > 0 && multiline == MultilineNone {
newMessage := api.Message{Role: "user", Content: sb.String()}
if opts.MultiModal {
msg, images, err := extractFileData(sb.String())
if err != nil {
return err
}
// clear all previous images for better responses
if len(images) > 0 {
for i := range opts.Messages {
opts.Messages[i].Images = nil
}
}
newMessage.Content = msg
newMessage.Images = images
}
opts.Messages = append(opts.Messages, newMessage)
assistant, err := chat(cmd, opts)
if err != nil {
return err
}
if assistant != nil {
opts.Messages = append(opts.Messages, *assistant)
}
sb.Reset()
}
}
}
func buildModelfile(opts runOptions) string {
var mf strings.Builder
model := opts.ParentModel
if model == "" {
model = opts.Model
}
fmt.Fprintf(&mf, "FROM %s\n", model)
if opts.System != "" {
fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
}
if opts.Template != "" {
fmt.Fprintf(&mf, "TEMPLATE \"\"\"%s\"\"\"\n", opts.Template)
}
keys := make([]string, 0)
for k := range opts.Options {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k])
}
fmt.Fprintln(&mf)
for _, msg := range opts.Messages {
fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content)
}
return mf.String()
}
func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements
replacements := map[string]string{
"\\ ": " ", // Escaped space
"\\(": "(", // Escaped left parenthesis
"\\)": ")", // Escaped right parenthesis
"\\[": "[", // Escaped left square bracket
"\\]": "]", // Escaped right square bracket
"\\{": "{", // Escaped left curly brace
"\\}": "}", // Escaped right curly brace
"\\$": "$", // Escaped dollar sign
"\\&": "&", // Escaped ampersand
"\\;": ";", // Escaped semicolon
"\\'": "'", // Escaped single quote
"\\\\": "\\", // Escaped backslash
"\\*": "*", // Escaped asterisk
"\\?": "?", // Escaped question mark
}
for escaped, actual := range replacements {
fp = strings.ReplaceAll(fp, escaped, actual)
}
return fp
}
func extractFileNames(input string) []string {
// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
// and followed by more characters and a file extension
// This will capture non filename strings, but we'll check for file existence to remove mismatches
regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
re := regexp.MustCompile(regexPattern)
return re.FindAllString(input, -1)
}
func extractFileData(input string) (string, []api.ImageData, error) {
filePaths := extractFileNames(input)
var imgs []api.ImageData
for _, fp := range filePaths {
nfp := normalizeFilePath(fp)
data, err := getImageData(nfp)
if err != nil {
if os.IsNotExist(err) {
continue
}
fmt.Fprintf(os.Stderr, "Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}
return input, imgs, nil
}
func getImageData(filePath string) ([]byte, error) {
file, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer file.Close()
buf := make([]byte, 512)
_, err = file.Read(buf)
if err != nil {
return nil, err
}
contentType := http.DetectContentType(buf)
allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
if !slices.Contains(allowedTypes, contentType) {
return nil, fmt.Errorf("invalid image type: %s", contentType)
}
info, err := file.Stat()
if err != nil {
return nil, err
}
// Check if the file size exceeds 100MB
var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
if info.Size() > maxSize {
return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
}
buf = make([]byte, info.Size())
_, err = file.Seek(0, 0)
if err != nil {
return nil, err
}
_, err = io.ReadFull(file, buf)
if err != nil {
return nil, err
}
return buf, nil
}

116
cmd/interactive_test.go Normal file
View File

@@ -0,0 +1,116 @@
package cmd
import (
"bytes"
"testing"
"text/template"
"github.com/stretchr/testify/assert"
"github.com/jmorganca/ollama/api"
)
func TestExtractFilenames(t *testing.T) {
// Unix style paths
input := ` some preamble
./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2
/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg`
res := extractFileNames(input)
assert.Len(t, res, 5)
assert.Contains(t, res[0], "one.png")
assert.Contains(t, res[1], "two.jpg")
assert.Contains(t, res[2], "three.jpeg")
assert.Contains(t, res[3], "four.png")
assert.Contains(t, res[4], "five.svg")
assert.NotContains(t, res[4], '"')
assert.NotContains(t, res, "inbtween")
// Windows style paths
input = ` some preamble
c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2
/absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6
d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending
`
res = extractFileNames(input)
assert.Len(t, res, 10)
assert.NotContains(t, res, "inbtween")
assert.Contains(t, res[0], "one.png")
assert.Contains(t, res[0], "c:")
assert.Contains(t, res[1], "two.jpg")
assert.Contains(t, res[1], "c:")
assert.Contains(t, res[2], "three.jpeg")
assert.Contains(t, res[3], "four.png")
assert.Contains(t, res[4], "five.svg")
assert.Contains(t, res[5], "six.png")
assert.Contains(t, res[6], "seven.svg")
assert.Contains(t, res[6], "d:")
assert.Contains(t, res[7], "eight.png")
assert.Contains(t, res[7], "c:")
assert.Contains(t, res[8], "nine.png")
assert.Contains(t, res[8], "d:")
assert.Contains(t, res[9], "ten.svg")
assert.Contains(t, res[9], "E:")
}
func TestModelfileBuilder(t *testing.T) {
opts := runOptions{
Model: "hork",
System: "You are part horse and part shark, but all hork. Do horklike things",
Template: "This is a template.",
Messages: []api.Message{
{Role: "user", Content: "Hey there hork!"},
{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
},
Options: map[string]interface{}{},
}
opts.Options["temperature"] = 0.9
opts.Options["seed"] = 42
opts.Options["penalize_newline"] = false
opts.Options["stop"] = []string{"hi", "there"}
mf := buildModelfile(opts)
expectedModelfile := `FROM {{.Model}}
SYSTEM """{{.System}}"""
TEMPLATE """{{.Template}}"""
PARAMETER penalize_newline false
PARAMETER seed 42
PARAMETER stop [hi there]
PARAMETER temperature 0.9
MESSAGE user """Hey there hork!"""
MESSAGE assistant """Yes it is true, I am half horse, half shark."""
`
tmpl, err := template.New("").Parse(expectedModelfile)
assert.Nil(t, err)
var buf bytes.Buffer
err = tmpl.Execute(&buf, opts)
assert.Nil(t, err)
assert.Equal(t, buf.String(), mf)
opts.ParentModel = "horseshark"
mf = buildModelfile(opts)
expectedModelfile = `FROM {{.ParentModel}}
SYSTEM """{{.System}}"""
TEMPLATE """{{.Template}}"""
PARAMETER penalize_newline false
PARAMETER seed 42
PARAMETER stop [hi there]
PARAMETER temperature 0.9
MESSAGE user """Hey there hork!"""
MESSAGE assistant """Yes it is true, I am half horse, half shark."""
`
tmpl, err = template.New("").Parse(expectedModelfile)
assert.Nil(t, err)
var parentBuf bytes.Buffer
err = tmpl.Execute(&parentBuf, opts)
assert.Nil(t, err)
assert.Equal(t, parentBuf.String(), mf)
}

30
cmd/start_darwin.go Normal file
View File

@@ -0,0 +1,30 @@
package cmd
import (
"context"
"fmt"
"os"
"os/exec"
"strings"
"github.com/jmorganca/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {
exe, err := os.Executable()
if err != nil {
return err
}
link, err := os.Readlink(exe)
if err != nil {
return err
}
if !strings.Contains(link, "Ollama.app") {
return fmt.Errorf("could not find ollama app")
}
path := strings.Split(link, "Ollama.app")
if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
return err
}
return waitForServer(ctx, client)
}

14
cmd/start_default.go Normal file
View File

@@ -0,0 +1,14 @@
//go:build !windows && !darwin
package cmd
import (
"context"
"fmt"
"github.com/jmorganca/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {
return fmt.Errorf("could not connect to ollama server, run 'ollama serve' to start it")
}

58
cmd/start_windows.go Normal file
View File

@@ -0,0 +1,58 @@
package cmd
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"syscall"
"github.com/jmorganca/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {
// log.Printf("XXX Attempting to find and start ollama app")
AppName := "ollama app.exe"
exe, err := os.Executable()
if err != nil {
return err
}
appExe := filepath.Join(filepath.Dir(exe), AppName)
_, err = os.Stat(appExe)
if errors.Is(err, os.ErrNotExist) {
// Try the standard install location
localAppData := os.Getenv("LOCALAPPDATA")
appExe = filepath.Join(localAppData, "Ollama", AppName)
_, err := os.Stat(appExe)
if errors.Is(err, os.ErrNotExist) {
// Finally look in the path
appExe, err = exec.LookPath(AppName)
if err != nil {
return fmt.Errorf("could not locate ollama app")
}
}
}
// log.Printf("XXX attempting to start app %s", appExe)
cmd_path := "c:\\Windows\\system32\\cmd.exe"
cmd := exec.Command(cmd_path, "/c", appExe)
// TODO - these hide flags aren't working - still pops up a command window for some reason
cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}
// TODO this didn't help either...
cmd.Stdin = strings.NewReader("")
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Start(); err != nil {
return fmt.Errorf("unable to start ollama app %w", err)
}
if cmd.Process != nil {
defer cmd.Process.Release() //nolint:errcheck
}
return waitForServer(ctx, client)
}

View File

@@ -10,9 +10,9 @@ Create new models or modify models already in the library using the Modelfile. L
Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.
Installing on Linux in most cases is easy using the script on Ollama.ai. To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
Installing on Linux in most cases is easy using the script on [ollama.com/download](ollama.com/download). To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](./docker.md)**.
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.

View File

@@ -49,7 +49,8 @@ Advanced parameters (optional):
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.
- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
#### JSON mode
@@ -246,6 +247,23 @@ curl http://localhost:11434/api/generate -d '{
}'
```
#### Request (Reproducible outputs)
For reproducible outputs, set `temperature` to 0 and `seed` to a number:
##### Request
```shell
curl http://localhost:11434/api/generate -d '{
"model": "mistral",
"prompt": "[INST] why is the sky blue? [/INST]",
"options": {
"seed": 101,
"temperature": 0
}
}'
```
##### Response
```json
@@ -379,6 +397,7 @@ Advanced parameters (optional):
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
### Examples
@@ -409,7 +428,7 @@ A stream of JSON objects is returned:
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assisant",
"role": "assistant",
"content": "The",
"images": null
},
@@ -505,7 +524,7 @@ A stream of JSON objects is returned:
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assisant",
"role": "assistant",
"content": "The"
},
"done": false
@@ -542,7 +561,7 @@ curl http://localhost:11434/api/chat -d '{
"role": "user",
"content": "what is in this image?",
"images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
},
}
]
}'
```
@@ -568,6 +587,46 @@ curl http://localhost:11434/api/chat -d '{
}
```
#### Chat request (Reproducible outputs)
##### Request
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama2",
"messages": [
{
"role": "user",
"content": "Hello!"
}
],
"options": {
"seed": 101,
"temperature": 0
}
}'
```
##### Response
```json
{
"model": "registry.ollama.ai/library/llama2:latest",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
"content": "Hello! How are you today?"
},
"done": true,
"total_duration": 5191566416,
"load_duration": 2154458,
"prompt_eval_count": 26,
"prompt_eval_duration": 383809000,
"eval_count": 298,
"eval_duration": 4799921000
}
```
## Create a Model
```shell
@@ -958,6 +1017,7 @@ Generate embeddings from a model
Advanced parameters:
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
### Examples

View File

@@ -1,13 +1,9 @@
# Development
- Install cmake or (optionally, required tools for GPUs)
- run `go generate ./...`
- run `go build .`
Install required tools:
- cmake version 3.24 or higher
- go version 1.20 or higher
- go version 1.21 or higher
- gcc version 11.4.0 or higher
```bash
@@ -17,7 +13,11 @@ brew install go cmake gcc
Optionally enable debugging and more verbose logging:
```bash
# At build time
export CGO_CFLAGS="-g"
# At runtime
export OLLAMA_DEBUG=1
```
Get the required libraries and build the native LLM code:
@@ -38,37 +38,101 @@ Now you can run `ollama`:
./ollama
```
## Building on Linux with GPU support
### Linux
#### Linux CUDA (NVIDIA)
### Linux/Windows CUDA (NVIDIA)
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
development and runtime packages.
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```
### Linux ROCm (AMD)
#### Linux ROCm (AMD)
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) developement packages first, as well as `cmake` and `golang`.
Adjust the paths below (correct for Arch) as appropriate for your distributions install locations and generate dependencies:
Typically the build scripts will auto-detect ROCm, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `ROCM_PATH` to the location of the ROCm
install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
```
CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
go generate ./...
```
Then build the binary:
```
go build .
```
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
## Containerized Build
#### Advanced CPU Settings
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.
By default, running `go generate ./...` will compile a few different variations
of the LLM library based on common CPU families and vector math capabilities,
including a lowest-common-denominator which should run on almost any 64 bit CPU
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
load. If you would like to build a CPU-based build customized for your
processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
go build .
```
#### Containerized Linux Build
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
### Windows
Note: The windows build for Ollama is still under development.
Install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- go version 1.21 or higher
- MinGW (pick one variant) with GCC.
- <https://www.mingw-w64.org/>
- <https://www.msys2.org/>
```powershell
$env:CGO_ENABLED="1"
go generate ./...
go build .
```
#### Windows CUDA (NVIDIA)
In addition to the common Windows development tools described above, install:
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)

View File

@@ -2,69 +2,116 @@
## How can I upgrade Ollama?
To upgrade Ollama, run the installation process again. On the Mac, click the Ollama icon in the menubar and choose the restart option if an update is available.
Ollama on macOS and Windows will automatically download updates. Click on the taskbar or menubar item and then click "Restart to update" to apply the update. Updates can also be installed by downloading the latest version [manually](https://ollama.com/download/).
On Linux, re-run the install script:
```
curl -fsSL https://ollama.com/install.sh | sh
```
## How can I view the logs?
Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
## How do I use Ollama server environment variables on Mac
## How can I specify the context window size?
On macOS, Ollama runs in the background and is managed by the menubar app. If adding environment variables, Ollama will need to be run manually.
By default, Ollama uses a context window size of 2048 tokens.
1. Click the menubar icon for Ollama and choose **Quit Ollama**.
2. Open a new terminal window and run the following command (this example uses `OLLAMA_HOST` with an IP address of `123.1.1.1`):
To change this when using `ollama run`, use `/set parameter`:
```bash
OLLAMA_HOST=123.1.1.1 ollama serve
```
```
/set parameter num_ctx 4096
```
## How do I use Ollama server environment variables on Linux?
When using the API, specify the `num_ctx` parameter:
If Ollama is installed with the install script, a systemd service was created, running as the Ollama user. To add an environment variable, such as OLLAMA_HOST, follow these steps:
```
curl http://localhost:11434/api/generate -d '{
"model": "llama2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
}
}'
```
1. Create a `systemd` drop-in directory and add a config file. This is only needed once.
## How do I configure Ollama server?
```bash
mkdir -p /etc/systemd/system/ollama.service.d
echo '[Service]' >>/etc/systemd/system/ollama.service.d/environment.conf
```
Ollama server can be configured with environment variables.
2. For each environment variable, add it to the config file:
### Setting environment variables on Mac
```bash
echo 'Environment="OLLAMA_HOST=0.0.0.0:11434"' >>/etc/systemd/system/ollama.service.d/environment.conf
```
If Ollama is run as a macOS application, environment variables should be set using `launchctl`:
3. Reload `systemd` and restart Ollama:
1. For each environment variable, call `launchctl setenv`.
```bash
launchctl setenv OLLAMA_HOST "0.0.0.0"
```
2. Restart Ollama application.
### Setting environment variables on Linux
If Ollama is run as a systemd service, environment variables should be set using `systemctl`:
1. Edit the systemd service by calling `systemctl edit ollama.service`. This will open an editor.
2. For each environment variable, add a line `Environment` under section `[Service]`:
```ini
[Service]
Environment="OLLAMA_HOST=0.0.0.0"
```
3. Save and exit.
4. Reload `systemd` and restart Ollama:
```bash
systemctl daemon-reload
systemctl restart ollama
```
### Setting environment variables on Windows
On windows, Ollama inherits your user and system environment variables.
1. First Quit Ollama by clicking on it in the task bar
2. Edit system environment variables from the control panel
3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
4. Click OK/Apply to save
5. Run `ollama` from a new terminal window
## How can I expose Ollama on my network?
Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable. Refer to the section above for how to use environment variables on your platform.
Ollama binds 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable.
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## How can I allow additional web origins to access Ollama?
Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Add additional origins with the `OLLAMA_ORIGINS` environment variable. For example, to add all ports on 192.168.1.1 and https://example.com, use:
Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.
```shell
OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com
```
Refer to the section above for how to use environment variables on your platform.
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## Where are models stored?
- macOS: `~/.ollama/models`.
- macOS: `~/.ollama/models`
- Linux: `/usr/share/ollama/.ollama/models`
- Windows: `C:\Users\<username>\.ollama\models`
## How do I set them to a different location?
### How do I set them to a different location?
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory. Refer to the section above for how to use environment variables on your platform.
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
@@ -112,3 +159,37 @@ This can impact both installing Ollama, as well as downloading models.
Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernel (WSL)` adapter, right click and select `Properties`.
Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these
properties.
## How can I pre-load a model to get faster response times?
If you are using the API you can preload a model by sending the Ollama server an empty request. This works with both the `/api/generate` and `/api/chat` API endpoints.
To preload the mistral model using the generate endpoint, use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "mistral"}'
```
To use the chat completions endpoint, use:
```shell
curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
```
## How do I keep a model loaded in memory or make it unload immediately?
By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you are making numerous requests to the LLM. You may, however, want to free up the memory before the 5 minutes have elapsed or keep the model loaded indefinitely. Use the `keep_alive` parameter with either the `/api/generate` and `/api/chat` API endpoints to control how long the model is left in memory.
The `keep_alive` parameter can be set to:
* a duration string (such as "10m" or "24h")
* a number in seconds (such as 3600)
* any negative number which will keep the model loaded in memory (e.g. -1 or "-1m")
* '0' which will unload the model immediately after generating a response
For example, to preload a model and leave it in memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
```
To unload the model and free up memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
```

View File

@@ -15,7 +15,7 @@ FROM ./mistral-7b-v0.1.Q4_0.gguf
(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
```
FROM ./q4_0.bin
FROM ./mistral-7b-v0.1.Q4_0.gguf
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
```
@@ -37,55 +37,69 @@ ollama run example "What is your favourite condiment?"
## Importing (PyTorch & Safetensors)
### Supported models
> Importing from PyTorch and Safetensors is a longer process than importing from GGUF. Improvements that make it easier are a work in progress.
Ollama supports a set of model architectures, with support for more coming soon:
### Setup
- Llama & Mistral
- Falcon & RW
- BigCode
First, clone the `ollama/ollama` repo:
To view a model's architecture, check the `config.json` file in its HuggingFace repo. You should see an entry under `architectures` (e.g. `LlamaForCausalLM`).
```
git clone git@github.com:ollama/ollama.git ollama
cd ollama
```
### Step 1: Clone the HuggingFace repository (optional)
and then fetch its `llama.cpp` submodule:
```shell
git submodule init
git submodule update llm/llama.cpp
```
Next, install the Python dependencies:
```
python3 -m venv llm/llama.cpp/.venv
source llm/llama.cpp/.venv/bin/activate
pip install -r llm/llama.cpp/requirements.txt
```
Then build the `quantize` tool:
```
make -C llm/llama.cpp quantize
```
### Clone the HuggingFace repository (optional)
If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model.
Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage), verify it's installed, and then clone the model's repository:
```
git lfs install
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
cd Mistral-7B-Instruct-v0.1
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 model
```
### Step 2: Convert and quantize to a `.bin` file (optional, for PyTorch and Safetensors)
### Convert the model
If the model is in PyTorch or Safetensors format, a [Docker image](https://hub.docker.com/r/ollama/quantize) with the tooling required to convert and quantize models is available.
First, Install [Docker](https://www.docker.com/get-started/).
Next, to convert and quantize your model, run:
> Note: some model architectures require using specific convert scripts. For example, Qwen models require running `convert-hf-to-gguf.py` instead of `convert.py`
```
docker run --rm -v .:/model ollama/quantize -q q4_0 /model
python llm/llama.cpp/convert.py ./model --outtype f16 --outfile converted.bin
```
This will output two files into the directory:
### Quantize the model
- `f16.bin`: the model converted to GGUF
- `q4_0.bin` the model quantized to a 4-bit quantization (Ollama will use this file to create the Ollama model)
```
llm/llama.cpp/quantize converted.bin quantized.bin q4_0
```
### Step 3: Write a `Modelfile`
Next, create a `Modelfile` for your model:
```
FROM ./q4_0.bin
```
(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
```
FROM ./q4_0.bin
FROM quantized.bin
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
```
@@ -109,9 +123,9 @@ ollama run example "What is your favourite condiment?"
Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps:
1. Create [an account](https://ollama.ai/signup)
2. Run `cat ~/.ollama/id_ed25519.pub` to view your Ollama public key. Copy this to the clipboard.
3. Add your public key to your [Ollama account](https://ollama.ai/settings/keys)
1. Create [an account](https://ollama.com/signup)
2. Run `cat ~/.ollama/id_ed25519.pub` (or `type %USERPROFILE%\.ollama\id_ed25519.pub` on Windows) to view your Ollama public key. Copy this to the clipboard.
3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)
Next, copy your model to your username's namespace:
@@ -125,7 +139,7 @@ Then push the model:
ollama push <your username>/example
```
After publishing, your model will be available at `https://ollama.ai/<your username>/example`.
After publishing, your model will be available at `https://ollama.com/<your username>/example`.
## Quantization reference
@@ -149,47 +163,3 @@ The quantization options are as follow (from highest highest to lowest levels of
- `q6_K`
- `q8_0`
- `f16`
## Manually converting & quantizing models
### Prerequisites
Start by cloning the `llama.cpp` repo to your machine in another directory:
```
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
```
Next, install the Python dependencies:
```
pip install -r requirements.txt
```
Finally, build the `quantize` tool:
```
make quantize
```
### Convert the model
Run the correct conversion script for your model architecture:
```shell
# LlamaForCausalLM or MistralForCausalLM
python convert.py <path to model directory>
# FalconForCausalLM
python convert-falcon-hf-to-gguf.py <path to model directory>
# GPTBigCodeForCausalLM
python convert-starcoder-hf-to-gguf.py <path to model directory>
```
### Quantize the model
```
quantize <path to model dir>/ggml-model-f32.bin <path to model dir>/q4_0.bin q4_0
```

View File

@@ -3,9 +3,11 @@
## Install
Install Ollama running this one-liner:
>
```bash
curl https://ollama.ai/install.sh | sh
curl -fsSL https://ollama.com/install.sh | sh
```
## Manual install
@@ -15,7 +17,7 @@ curl https://ollama.ai/install.sh | sh
Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
```bash
sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
```
@@ -75,13 +77,13 @@ sudo systemctl start ollama
Update ollama by running the install script again:
```bash
curl https://ollama.ai/install.sh | sh
curl -fsSL https://ollama.com/install.sh | sh
```
Or by downloading the ollama binary:
```bash
sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
```
@@ -109,8 +111,10 @@ Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr
sudo rm $(which ollama)
```
Remove the downloaded models and Ollama service user:
Remove the downloaded models and Ollama service user and group:
```bash
sudo rm -r /usr/share/ollama
sudo userdel ollama
sudo groupdel ollama
```

View File

@@ -19,6 +19,7 @@ A model file is the blueprint to create and share models with Ollama.
- [SYSTEM](#system)
- [ADAPTER](#adapter)
- [LICENSE](#license)
- [MESSAGE](#message)
- [Notes](#notes)
## Format
@@ -38,6 +39,7 @@ INSTRUCTION arguments
| [`SYSTEM`](#system) | Specifies the system message that will be set in the template. |
| [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. |
| [`LICENSE`](#license) | Specifies the legal license. |
| [`MESSAGE`](#message) | Specify message history. |
## Examples
@@ -65,13 +67,13 @@ To use this:
More examples are available in the [examples directory](../examples).
### `Modelfile`s in [ollama.ai/library][1]
### `Modelfile`s in [ollama.com/library][1]
There are two ways to view `Modelfile`s underlying the models in [ollama.ai/library][1]:
There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
- Option 1: view a details page from a model's tags page:
1. Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
3. Scroll down to "Layers"
- Note: if the [`FROM` instruction](#from-required) is not present,
it means the model was created from a local file
@@ -84,7 +86,7 @@ There are two ways to view `Modelfile`s underlying the models in [ollama.ai/libr
# FROM llama2:13b
FROM /root/.ollama/models/blobs/sha256:123abc
TEMPLATE """[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>
TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>
{{ end }}{{ .Prompt }} [/INST] """
SYSTEM """"""
@@ -152,30 +154,23 @@ PARAMETER <parameter> <parametervalue>
### TEMPLATE
`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system message and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system message, a user's message and the response from the model. Note: syntax may be model specific. Templates use Go [template syntax](https://pkg.go.dev/text/template).
#### Template Variables
| Variable | Description |
| --------------- | ------------------------------------------------------------------------------------------------------------- |
| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
| Variable | Description |
| ----------------- | --------------------------------------------------------------------------------------------- |
| `{{ .System }}` | The system message used to specify custom behavior. |
| `{{ .Prompt }}` | The user prompt message. |
| `{{ .Response }}` | The response from the model. When generating a response, text after this variable is omitted. |
```modelfile
TEMPLATE """
{{- if .First }}
### System:
{{ .System }}
{{- end }}
### User:
{{ .Prompt }}
### Response:
```
TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
"""
SYSTEM """<system message>"""
```
### SYSTEM
@@ -204,9 +199,22 @@ LICENSE """
"""
```
### MESSAGE
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
```modelfile
MESSAGE user Is Toronto in Canada?
MESSAGE assistant yes
MESSAGE user Is Sacramento in Canada?
MESSAGE assistant no
MESSAGE user Is Ontario in Canada?
MESSAGE assistant yes
```
## Notes
- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.
- Instructions can be in any order. In the examples, the `FROM` instruction is first to keep it easily readable.
[1]: https://ollama.ai/library
[1]: https://ollama.com/library

141
docs/openai.md Normal file
View File

@@ -0,0 +1,141 @@
# OpenAI compatibility
> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/jmorganca/ollama/blob/main/docs/api.md).
Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.
## Usage
### OpenAI Python library
```python
from openai import OpenAI
client = OpenAI(
base_url='http://localhost:11434/v1/',
# required but ignored
api_key='ollama',
)
chat_completion = client.chat.completions.create(
messages=[
{
'role': 'user',
'content': 'Say this is a test',
}
],
model='llama2',
)
```
### OpenAI JavaScript library
```javascript
import OpenAI from 'openai'
const openai = new OpenAI({
baseURL: 'http://localhost:11434/v1/',
// required but ignored
apiKey: 'ollama',
})
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama2',
})
```
### `curl`
```
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama2",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Hello!"
}
]
}'
```
## Endpoints
### `/v1/chat/completions`
#### Supported features
- [x] Chat completions
- [x] Streaming
- [x] JSON mode
- [x] Reproducible outputs
- [ ] Vision
- [ ] Function calling
- [ ] Logprobs
#### Supported request fields
- [x] `model`
- [x] `messages`
- [x] Text `content`
- [ ] Array of `content` parts
- [x] `frequency_penalty`
- [x] `presence_penalty`
- [x] `response_format`
- [x] `seed`
- [x] `stop`
- [x] `stream`
- [x] `temperature`
- [x] `top_p`
- [x] `max_tokens`
- [ ] `logit_bias`
- [ ] `tools`
- [ ] `tool_choice`
- [ ] `user`
- [ ] `n`
#### Notes
- Setting `seed` will always set `temperature` to `0`
- `finish_reason` will always be `stop`
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
## Models
Before using a model, pull it locally `ollama pull`:
```shell
ollama pull llama2
```
### Default model names
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
```
ollama cp llama2 gpt-3.5-turbo
```
Afterwards, this new model name can be specified the `model` field:
```shell
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "Hello!"
}
]
}'
```

View File

@@ -1,22 +1,72 @@
# How to troubleshoot issues
Sometimes Ollama may not perform as expected. One of the best ways to figure out what happened is to take a look at the logs. Find the logs on Mac by running the command:
```shell
cat ~/.ollama/logs/server.log
```
On Linux systems with systemd, the logs can be found with this command:
```shell
journalctl -u ollama
```
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
## Known issues
* `signal: illegal instruction (core dumped)`: Ollama requires AVX support from the CPU. This was introduced in 2011 and CPUs started offering it in 2012. CPUs from before that and some lower end CPUs after that may not have AVX support and thus are not supported by Ollama. Some users have had luck with building Ollama on their machines disabling the need for AVX.
# How to troubleshoot issues
Sometimes Ollama may not perform as expected. One of the best ways to figure out what happened is to take a look at the logs. Find the logs on **Mac** by running the command:
```shell
cat ~/.ollama/logs/server.log
```
On **Linux** systems with systemd, the logs can be found with this command:
```shell
journalctl -u ollama
```
When you run Ollama in a **container**, the logs go to stdout/stderr in the container:
```shell
docker logs <container-name>
```
(Use `docker ps` to find the container name)
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` to view logs
- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
To enable additional debug logging to help troubleshoot problems, first **Quit the running app from the tray menu** then in a powershell terminal
```powershell
$env:OLLAMA_DEBUG="1"
& "ollama app.exe"
```
Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
## LLM libraries
Ollama includes multiple LLM libraries compiled for different GPUs and CPU
vector features. Ollama tries to pick the best one based on the capabilities of
your system. If this autodetection has problems, or you run into other problems
(e.g. crashes in your GPU) you can workaround this by forcing a specific LLM
library. `cpu_avx2` will perform the best, followed by `cpu_avx` an the slowest
but most compatible is `cpu`. Rosetta emulation under MacOS will work with the
`cpu` library.
In the server log, you will see a message that looks something like this (varies
from release to release):
```
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
```
**Experimental LLM Library Override**
You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to bypass
autodetection, so for example, if you have a CUDA card, but want to force the
CPU LLM library with AVX2 vector support, use:
```
OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
```
You can see what features your CPU has with the following.
```
cat /proc/cpuinfo| grep flags | head -1
```
## Known issues
* N/A

View File

@@ -17,7 +17,7 @@ Prerequisites:
Here are the steps:
- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.ai/install.sh | sh`
- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
- Stop the Ollama service: `sudo systemctl stop ollama`
- Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson
'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`

46
docs/windows.md Normal file
View File

@@ -0,0 +1,46 @@
# Ollama Windows Preview
Welcome to the Ollama Windows preview.
No more WSL required!
Ollama now runs as a native Windows application, including NVIDIA GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
`http://localhost:11434`.
As this is a preview release, you should expect a few bugs here and there. If
you run into a problem you can reach out on
[Discord](https://discord.gg/ollama), or file an
[issue](https://github.com/ollama/ollama/issues).
Logs will often be helpful in dianosing the problem (see
[Troubleshooting](#troubleshooting) below)
## System Requirements
* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
## API Access
Here's a quick example showing API access from `powershell`
```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```
## Troubleshooting
While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
a "view logs" menu item to the app, and increses logging for the GUI app and
server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories

View File

@@ -8,7 +8,7 @@
"outputs": [],
"source": [
"# Download and run the Ollama Linux install script\n",
"!curl https://ollama.ai/install.sh | sh\n",
"!curl -fsSL https://ollama.com/install.sh | sh\n",
"!command -v systemctl >/dev/null && sudo systemctl stop ollama"
]
},

View File

@@ -2,28 +2,28 @@
## Prerequisites
- Ollama: https://ollama.ai/download
- Ollama: https://ollama.com/download
- Kubernetes cluster. This example will use Google Kubernetes Engine.
## Steps
1. Create the Ollama namespace, daemon set, and service
```bash
kubectl apply -f cpu.yaml
```
```bash
kubectl apply -f cpu.yaml
```
1. Port forward the Ollama service to connect and use it locally
```bash
kubectl -n ollama port-forward service/ollama 11434:80
```
```bash
kubectl -n ollama port-forward service/ollama 11434:80
```
1. Pull and run a model, for example `orca-mini:3b`
```bash
ollama run orca-mini:3b
```
```bash
ollama run orca-mini:3b
```
## (Optional) Hardware Acceleration

View File

@@ -1,6 +1,6 @@
# LangChain Web Summarization
This example summarizes the website, [https://ollama.ai/blog/run-llama2-uncensored-locally](https://ollama.ai/blog/run-llama2-uncensored-locally)
This example summarizes the website, [https://ollama.com/blog/run-llama2-uncensored-locally](https://ollama.com/blog/run-llama2-uncensored-locally)
## Running the Example

View File

@@ -2,7 +2,7 @@ from langchain.llms import Ollama
from langchain.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain
loader = WebBaseLoader("https://ollama.ai/blog/run-llama2-uncensored-locally")
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
docs = loader.load()
llm = Ollama(model="llama2")

View File

@@ -40,13 +40,13 @@ You are a log file analyzer. You will receive a set of lines from a log file for
"""
```
This model is available at https://ollama.ai/mattw/loganalyzer. You can customize it and add to your own namespace using the command `ollama create <namespace/modelname> -f <path-to-modelfile>` then `ollama push <namespace/modelname>`.
This model is available at https://ollama.com/mattw/loganalyzer. You can customize it and add to your own namespace using the command `ollama create <namespace/modelname> -f <path-to-modelfile>` then `ollama push <namespace/modelname>`.
Then loganalysis.py scans all the lines in the given log file and searches for the word 'error'. When the word is found, the 10 lines before and after are set as the prompt for a call to the Generate API.
```python
data = {
"prompt": "\n".join(error_logs),
"prompt": "\n".join(error_logs),
"model": "mattw/loganalyzer"
}
```

View File

@@ -29,9 +29,9 @@ You can also add your own character to be chosen at random when you ask a questi
```bash
ollama pull stablebeluga2:70b-q4_K_M
```
2. Create a new character:
```bash
npm run charactergen "Lorne Greene"
```
@@ -41,15 +41,15 @@ You can also add your own character to be chosen at random when you ask a questi
3. Now you can create a model with this command:
```bash
ollama create <YourNamespace>/lornegreene -f lornegreene/Modelfile
ollama create <username>/lornegreene -f lornegreene/Modelfile
```
`YourNamespace` is whatever name you set up when you signed up at [https://ollama.ai/signup](https://ollama.ai/signup).
`username` is whatever name you set up when you signed up at [https://ollama.com/signup](https://ollama.com/signup).
4. To add this to your mentors, you will have to update the code as follows. On line 8 of `mentors.ts`, add an object to the array, replacing `<YourNamespace>` with the namespace you used above.
4. To add this to your mentors, you will have to update the code as follows. On line 8 of `mentors.ts`, add an object to the array, replacing `<username>` with the username you used above.
```bash
{ns: "<YourNamespace>", char: "Lorne Greene"}
{ns: "<username>", char: "Lorne Greene"}
```
## Review the Code

18
go.mod
View File

@@ -1,6 +1,6 @@
module github.com/jmorganca/ollama
go 1.20
go 1.21
require (
github.com/emirpasic/gods v1.18.1
@@ -12,12 +12,26 @@ require (
)
require (
github.com/containerd/console v1.0.3 // indirect
github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 // indirect
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 // indirect
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 // indirect
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 // indirect
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 // indirect
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f // indirect
github.com/getlantern/systray v1.2.2 // indirect
github.com/go-stack/stack v1.8.0 // indirect
github.com/google/uuid v1.0.0 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect
github.com/pborman/uuid v1.2.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
)
require (
github.com/bytedance/sonic v1.9.1 // indirect
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
@@ -45,7 +59,7 @@ require (
golang.org/x/crypto v0.14.0
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
golang.org/x/net v0.17.0 // indirect
golang.org/x/sys v0.13.0 // indirect
golang.org/x/sys v0.13.0
golang.org/x/term v0.13.0
golang.org/x/text v0.13.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect

33
go.sum
View File

@@ -4,7 +4,11 @@ github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZX
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa h1:Wg+722vs7a2zQH5lR9QWYsVbplKeffaQFIs5FTdfNNo=
github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa/go.mod h1:6Arca19mRx58CA7OWEd7Wu1NpC1rd3uDnNs6s1pj/DI=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
@@ -13,6 +17,20 @@ github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 h1:NRUJuo3v3WGC/g5YiyF790gut6oQr5f3FBI88Wv0dx4=
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520/go.mod h1:L+mq6/vvYHKjCX2oez0CgEAJmbq1fbb/oNJIWQkBybY=
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 h1:6uJ+sZ/e03gkbqZ0kUG6mfKoqDb4XMAzMIwlajq19So=
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7/go.mod h1:l+xpFBrCtDLpK9qNjxs+cHU6+BAdlBaxHqikB6Lku3A=
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 h1:guBYzEaLz0Vfc/jv0czrr2z7qyzTOGC9hiQ0VC+hKjk=
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7/go.mod h1:zx/1xUUeYPy3Pcmet8OSXLbF47l+3y6hIPpyLWoR9oc=
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 h1:micT5vkcr9tOVk1FiH8SWKID8ultN44Z+yzd2y/Vyb0=
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7/go.mod h1:dD3CgOrwlzca8ed61CsZouQS5h5jIzkK9ZWrTcf0s+o=
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 h1:XYzSdCbkzOC0FDNrgJqGRo8PCMFOBFL9py72DRs7bmc=
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55/go.mod h1:6mmzY2kW1TOOrVy+r41Za2MxXM+hhqTtY3oBKd2AgFA=
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f h1:wrYrQttPS8FHIRSlsrcuKazukx/xqO/PpLZzZXsF+EA=
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f/go.mod h1:D5ao98qkA6pxftxoqzibIBBrLSUli+kYnJqrgBf9cIA=
github.com/getlantern/systray v1.2.2 h1:dCEHtfmvkJG7HZ8lS/sLklTH4RKUcIsKrAD9sThoEBE=
github.com/getlantern/systray v1.2.2/go.mod h1:pXFOI1wwqwYXEhLPm9ZGjS2u/vVELeIgNMY5HvhHhcE=
github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g=
github.com/gin-contrib/cors v1.4.0/go.mod h1:bs9pNM0x/UsmHPBWT2xZz9ROh8xYjYkiURUfmBoMlcs=
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
@@ -31,6 +49,8 @@ github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91
github.com/go-playground/validator/v10 v10.10.0/go.mod h1:74x4gJWsvQexRdW8Pn3dXSGrTK4nAUsbPlLADvpJkos=
github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
@@ -39,6 +59,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/uuid v1.0.0 h1:b4Gk+7WdP/d3HZH8EJsZpvV7EtDOgaZLtnaNGIu1adA=
github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
@@ -57,6 +79,8 @@ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY=
github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q=
github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4=
github.com/lxn/walk v0.0.0-20210112085537-c389da54e794/go.mod h1:E23UucZGqpuUANJooIbHWCufXvOcT6E7Stq81gU+CSQ=
github.com/lxn/win v0.0.0-20210218163916-a377121e959e/go.mod h1:KxxjdtRkfNoYDCUP5ryK7XJJNTnpC8atvtmTheChOtk=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
@@ -70,8 +94,12 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c h1:rp5dCmg/yLR3mgFuSOe4oEnDDmGLROTvMragMUXpTQw=
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwUbLaax7L0S3Tw4hpejzu63ZrrQiUe6W0hcy0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
github.com/pborman/uuid v1.2.1 h1:+ZZIw58t/ozdjRaXh/3awHfmWRbzYxJoAdNJxe/3pvw=
github.com/pborman/uuid v1.2.1/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
@@ -84,6 +112,7 @@ github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTE
github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8=
github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog=
github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
@@ -120,11 +149,14 @@ golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -141,6 +173,7 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
gopkg.in/Knetic/govaluate.v3 v3.0.0/go.mod h1:csKLBORsPbafmSCGTEh3U7Ozmsuq8ZSIlKk1bcqph0E=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=

101
gpu/amd.go Normal file
View File

@@ -0,0 +1,101 @@
package gpu
import (
"bufio"
"errors"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
)
// TODO - windows vs. non-windows vs darwin
// Discovery logic for AMD/ROCm GPUs
const (
DriverVersionFile = "/sys/module/amdgpu/version"
GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
// TODO probably break these down per GPU to make the logic simpler
GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
)
func AMDDetected() bool {
// Some driver versions (older?) don't have a version file, so just lookup the parent dir
sysfsDir := filepath.Dir(DriverVersionFile)
_, err := os.Stat(sysfsDir)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("amd driver not detected " + sysfsDir)
return false
} else if err != nil {
slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
return false
}
return true
}
func AMDDriverVersion() (string, error) {
_, err := os.Stat(DriverVersionFile)
if err != nil {
return "", fmt.Errorf("amdgpu file stat error: %s %w", DriverVersionFile, err)
}
fp, err := os.Open(DriverVersionFile)
if err != nil {
return "", err
}
defer fp.Close()
verString, err := io.ReadAll(fp)
if err != nil {
return "", err
}
return strings.TrimSpace(string(verString)), nil
}
func AMDGFXVersions() []Version {
res := []Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches {
fp, err := os.Open(match)
if err != nil {
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
continue
}
defer fp.Close()
scanner := bufio.NewScanner(fp)
// optionally, resize scanner's capacity for lines over 64K, see next example
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 {
slog.Debug("malformed " + line)
continue
}
l := len(ver[1])
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
if err1 != nil || err2 != nil || err3 != nil {
slog.Debug("malformed int " + line)
continue
}
res = append(res, Version{
Major: uint(major),
Minor: uint(minor),
Patch: uint(patch),
})
}
}
}
return res
}
func (v Version) ToGFXString() string {
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
}

21
gpu/cpu_common.go Normal file
View File

@@ -0,0 +1,21 @@
package gpu
import (
"log/slog"
"golang.org/x/sys/cpu"
)
func GetCPUVariant() string {
if cpu.X86.HasAVX2 {
slog.Info("CPU has AVX2")
return "avx2"
}
if cpu.X86.HasAVX {
slog.Info("CPU has AVX")
return "avx"
}
slog.Info("CPU does not have vector extensions")
// else LCD
return ""
}

View File

@@ -12,12 +12,14 @@ package gpu
import "C"
import (
"fmt"
"log"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"unsafe"
"github.com/jmorganca/ollama/api"
)
type handles struct {
@@ -28,31 +30,86 @@ type handles struct {
var gpuMutex sync.Mutex
var gpuHandles *handles = nil
// With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0}
// Possible locations for the nvidia-ml library
var CudaLinuxGlobs = []string{
"/usr/local/cuda/lib64/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
"/usr/lib/wsl/lib/libnvidia-ml.so*",
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
// TODO: are these stubs ever valid?
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
}
var CudaWindowsGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var RocmLinuxGlobs = []string{
"/opt/rocm*/lib*/librocm_smi64.so*",
}
var RocmWindowsGlobs = []string{
"c:\\Windows\\System32\\rocm_smi64.dll",
}
// Note: gpuMutex must already be held
func initGPUHandles() {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
log.Printf("Detecting GPU type")
gpuHandles = &handles{nil, nil}
var resp C.cuda_init_resp_t
C.cuda_init(&resp)
if resp.err != nil {
log.Printf("CUDA not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
var resp C.rocm_init_resp_t
C.rocm_init(&resp)
if resp.err != nil {
log.Printf("ROCm not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
log.Printf("Radeon GPU detected")
rocm := resp.rh
gpuHandles.rocm = &rocm
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
gpuHandles = &handles{nil, nil}
var cudaMgmtName string
var cudaMgmtPatterns []string
var rocmMgmtName string
var rocmMgmtPatterns []string
switch runtime.GOOS {
case "windows":
cudaMgmtName = "nvml.dll"
cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
copy(cudaMgmtPatterns, CudaWindowsGlobs)
rocmMgmtName = "rocm_smi64.dll"
rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
copy(rocmMgmtPatterns, RocmWindowsGlobs)
case "linux":
cudaMgmtName = "libnvidia-ml.so"
cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
copy(cudaMgmtPatterns, CudaLinuxGlobs)
rocmMgmtName = "librocm_smi64.so"
rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
copy(rocmMgmtPatterns, RocmLinuxGlobs)
default:
return
}
slog.Info("Detecting GPU type")
cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
if len(cudaLibPaths) > 0 {
cuda := LoadCUDAMgmt(cudaLibPaths)
if cuda != nil {
slog.Info("Nvidia GPU detected")
gpuHandles.cuda = cuda
return
}
}
rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
if len(rocmLibPaths) > 0 {
rocm := LoadROCMMgmt(rocmLibPaths)
if rocm != nil {
slog.Info("Radeon GPU detected")
gpuHandles.rocm = rocm
return
}
} else {
log.Printf("Nvidia GPU detected")
cuda := resp.ch
gpuHandles.cuda = &cuda
}
}
@@ -65,39 +122,107 @@ func GetGPUInfo() GpuInfo {
initGPUHandles()
}
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
cpuVariant := GetCPUVariant()
if cpuVariant == "" && runtime.GOARCH == "amd64" {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
var memInfo C.mem_info_t
resp := GpuInfo{}
if gpuHandles.cuda != nil {
if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil {
log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else {
resp.Library = "cuda"
} else if memInfo.count > 0 {
// Verify minimum compute capability
var cc C.cuda_compute_capability_t
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else if gpuHandles.rocm != nil {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
ver, err := AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
resp.Library = "rocm"
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug("error looking up amd driver version: %s", err)
}
gfx := AMDGFXVersions()
tooOld := false
for _, v := range gfx {
if v.Major < 9 {
slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
tooOld = true
break
}
// TODO - remap gfx strings for unsupporetd minor/patch versions to supported for the same major
// e.g. gfx1034 works if we map it to gfx1030 at runtime
}
if !tooOld {
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else if memInfo.count > 0 {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" {
devices := []string{}
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) {
continue
}
devices = append(devices, strconv.Itoa(i))
}
val = strings.Join(devices, ",")
os.Setenv("ROCR_VISIBLE_DEVICES", val)
}
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
}
resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
verString := C.GoString(version.str)
if version.status == 0 {
resp.Variant = "v" + verString
} else {
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
}
C.free(unsafe.Pointer(version.str))
}
}
}
if resp.Library == "" {
C.cpu_check_ram(&memInfo)
// In the future we may offer multiple CPU variants to tune CPU features
if runtime.GOOS == "windows" {
resp.Library = "cpu"
} else {
resp.Library = "default"
}
resp.Library = "cpu"
resp.Variant = cpuVariant
}
if memInfo.err != nil {
log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
return resp
}
resp.DeviceCount = uint32(memInfo.count)
resp.FreeMemory = uint64(memInfo.free)
resp.TotalMemory = uint64(memInfo.total)
return resp
@@ -119,31 +244,111 @@ func getCPUMem() (memInfo, error) {
func CheckVRAM() (int64, error) {
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
return int64(gpuInfo.FreeMemory), nil
// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
overhead := gpuInfo.FreeMemory / 10
gpus := uint64(gpuInfo.DeviceCount)
if overhead < gpus*1024*1024*1024 {
overhead = gpus * 1024 * 1024 * 1024
}
avail := int64(gpuInfo.FreeMemory - overhead)
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
return avail, nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
func FindGPULibs(baseLibName string, patterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
gpuLibPaths := []string{}
slog.Info(fmt.Sprintf("Searching for GPU management library %s", baseLibName))
switch runtime.GOOS {
case "windows":
ldPaths = strings.Split(os.Getenv("PATH"), ";")
case "linux":
ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
default:
return gpuLibPaths
}
info := GetGPUInfo()
if info.Library == "cpu" || info.Library == "default" {
return 0
// Start with whatever we find in the PATH/LD_LIBRARY_PATH
for _, ldPath := range ldPaths {
d, err := filepath.Abs(ldPath)
if err != nil {
continue
}
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
}
/*
Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
We can store the model weights and the kv cache in vram,
to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file.
*/
bytesPerLayer := uint64(fileSizeBytes / numLayer)
// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Library, numLayer)
return layers
slog.Debug(fmt.Sprintf("gpu management search paths: %v", patterns))
for _, pattern := range patterns {
// Ignore glob discovery errors
matches, _ := filepath.Glob(pattern)
for _, match := range matches {
// Resolve any links so we don't try the same lib multiple times
// and weed out any dups across globs
libPath := match
tmp := match
var err error
for ; err == nil; tmp, err = os.Readlink(libPath) {
if !filepath.IsAbs(tmp) {
tmp = filepath.Join(filepath.Dir(libPath), tmp)
}
libPath = tmp
}
new := true
for _, cmp := range gpuLibPaths {
if cmp == libPath {
new = false
break
}
}
if new {
gpuLibPaths = append(gpuLibPaths, libPath)
}
}
}
slog.Info(fmt.Sprintf("Discovered GPU libraries: %v", gpuLibPaths))
return gpuLibPaths
}
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
var resp C.cuda_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range cudaLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.cuda_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.ch
}
}
return nil
}
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
var resp C.rocm_init_resp_t
resp.rh.verbose = getVerboseState()
for _, libPath := range rocmLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.rocm_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.rh
}
}
return nil
}
func getVerboseState() C.uint16_t {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
return C.uint16_t(1)
}
return C.uint16_t(0)
}

View File

@@ -6,21 +6,41 @@ import "C"
import (
"runtime"
"github.com/jmorganca/ollama/api"
"github.com/pbnjay/memory"
)
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
// TODO - assume metal, and return free memory?
return 0, nil
if runtime.GOARCH == "amd64" {
// gpu not supported, this may not be metal
return 0, nil
}
// on macOS, there's already buffer for available vram (see below) so just return the total
systemMemory := int64(memory.TotalMemory())
// macOS limits how much memory is available to the GPU based on the amount of system memory
// TODO: handle case where iogpu.wired_limit_mb is set to a higher value
if systemMemory <= 36*1024*1024*1024 {
systemMemory = systemMemory * 2 / 3
} else {
systemMemory = systemMemory * 3 / 4
}
return systemMemory, nil
}
func GetGPUInfo() GpuInfo {
// TODO - Metal vs. x86 macs...
mem, _ := getCPUMem()
if runtime.GOARCH == "amd64" {
return GpuInfo{
Library: "cpu",
Variant: GetCPUVariant(),
memInfo: mem,
}
}
return GpuInfo{
Library: "default",
Library: "metal",
memInfo: mem,
}
}
@@ -29,22 +49,6 @@ func getCPUMem() (memInfo, error) {
return memInfo{
TotalMemory: 0,
FreeMemory: 0,
DeviceCount: 0,
}, nil
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
// metal only supported on arm64
if runtime.GOARCH == "arm64" {
return 1
}
return 0
}
func nativeInit() error {
return nil
}

View File

@@ -27,6 +27,13 @@
#endif
#define LOG(verbose, ...) \
do { \
if (verbose) { \
fprintf(stderr, __VA_ARGS__); \
} \
} while (0)
#ifdef __cplusplus
extern "C" {
#endif
@@ -34,6 +41,8 @@ extern "C" {
typedef struct mem_info {
uint64_t total;
uint64_t free;
unsigned int count;
int igpu_index; // If >= 0, we detected an integrated GPU to ignore
char *err; // If non-nill, caller responsible for freeing
} mem_info_t;

View File

@@ -8,6 +8,7 @@ void cpu_check_ram(mem_info_t *resp) {
MEMORYSTATUSEX info;
info.dwLength = sizeof(info);
if (GlobalMemoryStatusEx(&info) != 0) {
resp->count = 1;
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
} else {
@@ -26,6 +27,7 @@ void cpu_check_ram(mem_info_t *resp) {
if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno));
} else {
resp->count = 1;
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
}

View File

@@ -4,23 +4,7 @@
#include <string.h>
#ifndef _WIN32
const char *cuda_lib_paths[] = {
"libnvidia-ml.so",
"/usr/local/cuda/lib64/libnvidia-ml.so",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
"/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob?
NULL,
};
#else
const char *cuda_lib_paths[] = {
"nvml.dll",
"",
NULL,
};
#endif
void cuda_init(cuda_init_resp_t *resp) {
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
@@ -30,34 +14,47 @@ void cuda_init(cuda_init_resp_t *resp) {
struct lookup {
char *s;
void **p;
} l[4] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
} l[] = {
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
{NULL, NULL},
};
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
}
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
// TODO improve error message, as the LOAD_ERR will have typically have the
// final path that was checked which might be confusing.
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_paths[0], msg);
cuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
@@ -66,13 +63,23 @@ void cuda_init(cuda_init_resp_t *resp) {
}
}
ret = (*resp->ch.initFn)();
ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
return;
// Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
} else {
LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
}
}
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
@@ -89,22 +96,119 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return;
}
// TODO - handle multiple GPUs
ret = (*h.getHandle)(0, &device);
ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle: %d", ret);
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
resp->total = 0;
resp->free = 0;
for (i = 0; i < resp->count; i++) {
ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
if (h.verbose) {
nvmlBrandType_t brand = 0;
// When in verbose mode, report more information about
// the card we discover, but don't fail on error
ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetBrand)(device, &brand);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
}
}
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
resp->total += memInfo.total;
resp->free += memInfo.free;
}
}
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
resp->err = NULL;
resp->major = 0;
resp->minor = 0;
nvmlDevice_t device;
int major = 0;
int minor = 0;
nvmlReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle not initialized");
return;
}
unsigned int devices;
ret = (*h.nvmlDeviceGetCount_v2)(&devices);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
return;
for (i = 0; i < devices; i++) {
ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
// Report the lowest major.minor we detect as that limits our compatibility
if (resp->major == 0 || resp->major > major ) {
resp->major = major;
resp->minor = minor;
} else if ( resp->major == major && resp->minor > minor ) {
resp->minor = minor;
}
}
}
#endif // __APPLE__

View File

@@ -15,12 +15,26 @@ typedef struct nvmlMemory_st {
unsigned long long used;
} nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct cuda_handle {
void *handle;
nvmlReturn_t (*initFn)(void);
nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
} cuda_handle_t;
typedef struct cuda_init_resp {
@@ -28,8 +42,15 @@ typedef struct cuda_init_resp {
cuda_handle_t ch;
} cuda_init_resp_t;
void cuda_init(cuda_init_resp_t *resp);
typedef struct cuda_compute_capability {
char *err;
int major;
int minor;
} cuda_compute_capability_t;
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
#endif // __GPU_INFO_CUDA_H__
#endif // __APPLE__

View File

@@ -4,22 +4,7 @@
#include <string.h>
#ifndef _WIN32
const char *rocm_lib_paths[] = {
"librocm_smi64.so",
"/opt/rocm/lib/librocm_smi64.so",
NULL,
};
#else
// TODO untested
const char *rocm_lib_paths[] = {
"rocm_smi64.dll",
"/opt/rocm/lib/rocm_smi64.dll",
NULL,
};
#endif
void rocm_init(rocm_init_resp_t *resp) {
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
rsmi_status_t ret;
resp->err = NULL;
const int buflen = 256;
@@ -28,32 +13,48 @@ void rocm_init(rocm_init_resp_t *resp) {
struct lookup {
char *s;
void **p;
} l[4] = {
{"rsmi_init", (void *)&resp->rh.initFn},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
} l[] = {
{"rsmi_init", (void *)&resp->rh.rsmi_init},
{"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
{"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
{"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
{"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
{"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
{"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
{"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
{"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
{"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
{"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
{"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
{NULL, NULL},
};
for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
}
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
if (!resp->rh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_paths[0], msg);
rocm_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->rh.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
@@ -62,8 +63,11 @@ void rocm_init(rocm_init_resp_t *resp) {
}
}
ret = (*resp->rh.initFn)(0);
ret = (*resp->rh.rsmi_init)(0);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL;
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf);
}
@@ -73,8 +77,7 @@ void rocm_init(rocm_init_resp_t *resp) {
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
// uint32_t num_devices;
// uint16_t device;
resp->igpu_index = -1;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
@@ -83,36 +86,113 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle sn't initialized");
resp->err = strdup("rocm handle not initialized");
return;
}
// TODO - iterate through devices... ret =
// rsmi_num_monitor_devices(&num_devices);
// ret = (*h.getHandle)(0, &device);
// if (ret != RSMI_STATUS_SUCCESS) {
// printf("rocm vram device lookup failure: %d\n", ret);
// return -1;
// }
// Get total memory - used memory for available memory
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
ret = (*h.rsmi_num_monitor_devices)(&resp->count);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
resp->total = totalMem;
resp->free = totalMem - usedMem;
return;
resp->total = 0;
resp->free = 0;
for (i = 0; i < resp->count; i++) {
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover, but don't fail on error
ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
}
ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
}
ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
}
ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
}
}
// Get total memory - used memory for available memory
ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
if (totalMem < 1024 * 1024 * 1024) {
// Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
resp->igpu_index = i;
} else {
resp->total += totalMem;
resp->free += totalMem - usedMem;
}
}
}
#endif // __APPLE__
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("rocm handle not initialized");
resp->status = 1;
return;
}
rsmi_version_t ver;
rsmi_status_t ret;
ret = h.rsmi_version_get(&ver);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1;
} else {
snprintf(buf, buflen, "%d", ver.major);
resp->status = 0;
}
resp->str = strdup(buf);
}
#endif // __APPLE__

View File

@@ -15,13 +15,30 @@ typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct {
uint32_t major;
uint32_t minor;
uint32_t patch;
const char *build;
} rsmi_version_t;
typedef struct rocm_handle {
void *handle;
rsmi_status_t (*initFn)(uint64_t);
rsmi_status_t (*shutdownFn)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
uint16_t verbose;
rsmi_status_t (*rsmi_init)(uint64_t);
rsmi_status_t (*rsmi_shut_down)(void);
rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
} rocm_handle_t;
typedef struct rocm_init_resp {
@@ -29,8 +46,14 @@ typedef struct rocm_init_resp {
rocm_handle_t rh;
} rocm_init_resp_t;
void rocm_init(rocm_init_resp_t *resp);
typedef struct rocm_version_resp {
rsmi_status_t status;
char *str; // Contains version or error string if status != 0
} rocm_version_resp_t;
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__

View File

@@ -9,7 +9,7 @@ import (
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.Contains(t, "cuda rocm cpu default", info.Library)
assert.Contains(t, "cuda rocm cpu metal", info.Library)
switch runtime.GOOS {
case "darwin":
@@ -18,6 +18,7 @@ func TestBasicGetGPUInfo(t *testing.T) {
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
assert.Greater(t, info.DeviceCount, uint32(0))
default:
return
}
@@ -35,7 +36,6 @@ func TestCPUMemInfo(t *testing.T) {
default:
return
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected

View File

@@ -3,6 +3,7 @@ package gpu
type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
DeviceCount uint32 `json:"device_count,omitempty"`
}
// Beginning of an `ollama info` command
@@ -10,5 +11,14 @@ type GpuInfo struct {
memInfo
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`
// TODO add other useful attributes about the card here for discovery information
}
type Version struct {
Major uint
Minor uint
Patch uint
}

View File

@@ -1,11 +1,11 @@
#include "dynamic_shim.h"
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND)
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
@@ -33,7 +33,7 @@ inline char *LOAD_ERR() {
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
@@ -58,8 +58,8 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
{"", NULL},
};
printf("Lazy loading %s library\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
printf("loading library %s\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
if (!s->handle) {
err->id = -1;
char *msg = LOAD_ERR();
@@ -83,63 +83,63 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
}
}
inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void dynamic_shim_llama_server_completion_next_result(
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void dynamic_shim_llama_server_completion_cancel(
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void dynamic_shim_llama_server_release_task_result(
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void dynamic_shim_llama_server_release_json_resp(
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}

View File

@@ -3,61 +3,43 @@ package llm
/*
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
#include <stdlib.h>
#include "ext_server.h"
#include "dyn_ext_server.h"
*/
import "C"
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
"unsafe"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
)
type extServer interface {
LLM
llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
llama_server_start()
llama_server_stop()
llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
llama_server_release_task_result(result *C.ext_server_task_result_t)
llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_release_json_resp(json_resp **C.char)
type dynExtServer struct {
s C.struct_dynamic_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
@@ -82,25 +64,39 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
// Note: current implementation does not support concurrent instantiations
var llm *dynExtServer
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if !mutex.TryLock() {
log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
fileInfo, err := os.Stat(model)
if err != nil {
return nil, err
updatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(512)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dyn_init(libPath, &srv, &resp)
if resp.id < 0 {
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm = &dynExtServer{
s: srv,
options: opts,
}
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
var sparams C.ext_server_params_t
sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model))
numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)
sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx)
sparams.n_batch = C.uint(opts.NumBatch)
sparams.n_gpu_layers = C.int(numGPU)
sparams.n_gpu_layers = C.int(opts.NumGPU)
sparams.main_gpu = C.int(opts.MainGPU)
sparams.n_parallel = 1 // TODO - wire up concurrency
@@ -140,29 +136,35 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
sparams.n_threads = C.uint(opts.NumThread)
log.Printf("Initializing internal llama server")
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
server.llama_server_init(&sparams, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
sparams.verbose_logging = C.bool(true)
} else {
sparams.verbose_logging = C.bool(false)
}
log.Printf("Starting internal llama main loop")
server.llama_server_start()
return server, nil
slog.Info("Initializing llama server")
initResp := newExtServerResp(128)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
mutex.Unlock()
err := extServerResponseToErr(initResp)
slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
return nil, err
}
slog.Info("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
return llm, nil
}
func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error {
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var imageData []ImageData
if len(predict.Images) > 0 {
for cnt, i := range predict.Images {
imageData = append(imageData, ImageData{Data: i, ID: cnt})
}
slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
}
log.Printf("loaded %d images", len(imageData))
request := map[string]any{
"prompt": predict.Prompt,
@@ -184,7 +186,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
"penalize_nl": predict.Options.PenalizeNewline,
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": imageData,
"image_data": predict.Images,
"cache_prompt": true,
}
@@ -192,6 +194,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
request["grammar"] = jsonGrammar
}
var whitespace int
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
@@ -211,7 +214,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
req := C.CString(buffer.String())
defer C.free(unsafe.Pointer(req))
llm.llama_server_completion(req, &resp)
C.dyn_llama_server_completion(llm.s, req, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
}
@@ -222,7 +225,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
select {
case <-ctx.Done():
// This handles the request cancellation
llm.llama_server_completion_cancel(resp.id, &resp)
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
@@ -230,13 +233,13 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
}
default:
var result C.ext_server_task_result_t
llm.llama_server_completion_next_result(resp.id, &result)
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
json_resp := C.GoString(result.json_resp)
llm.llama_server_release_task_result(&result)
C.dyn_llama_server_release_task_result(llm.s, &result)
var p prediction
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
llm.llama_server_completion_cancel(resp.id, &resp)
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
} else {
@@ -250,13 +253,31 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
break out
}
// detect if p.Content is entirely whitespace
if predict.Format == "json" && strings.TrimSpace(p.Content) == "" {
whitespace++
// if we get 100 consecutive whitespace responses, cancel
if whitespace > 100 {
slog.Debug("cancelling due to excessive whitespace")
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
}
return nil
}
} else {
whitespace = 0
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
})
}
if p.Stop {
if p.Stop || bool(result.stop) {
fn(PredictResult{
Done: true,
PromptEvalCount: p.Timings.PromptN,
@@ -277,7 +298,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
return fmt.Errorf("max retries exceeded")
}
func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
@@ -287,11 +308,11 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
llm.llama_server_tokenize(req, &json_resp, &resp)
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer llm.llama_server_release_json_resp(&json_resp)
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
@@ -301,7 +322,7 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
return encoded.Tokens, err
}
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 {
return "", nil
}
@@ -315,11 +336,11 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
llm.llama_server_detokenize(req, &json_resp, &resp)
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return "", extServerResponseToErr(resp)
}
defer llm.llama_server_release_json_resp(&json_resp)
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
@@ -329,7 +350,7 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
return decoded.Content, err
}
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
@@ -340,11 +361,11 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
llm.llama_server_embedding(req, &json_resp, &resp)
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer llm.llama_server_release_json_resp(&json_resp)
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
@@ -354,7 +375,29 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
return embedding.Embedding, nil
}
func close(llm extServer) {
llm.llama_server_stop()
func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}
func updatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

View File

@@ -27,46 +27,46 @@ struct dynamic_llama_server {
void (*llama_server_release_json_resp)(char **json_resp);
};
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dynamic_shim_llama_server_completion_next_result(
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dynamic_shim_llama_server_release_task_result(
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus

View File

@@ -2,28 +2,24 @@
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp)
if (WIN32)
add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
else()
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
endif()
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
if (BUILD_SHARED_LIBS)
set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(ext_server_shared SHARED $<TARGET_OBJECTS:ext_server>)
target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS ext_server_shared LIBRARY)
endif()
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(ext_server_shared PRIVATE nvml)
target_link_libraries(${TARGET} PRIVATE nvml)
endif()
endif()

View File

@@ -1,4 +1,18 @@
# Extern C Server
This directory contains a thin facade we layer on top of the Llama.cpp server
to expose `extern C` interfaces to access the functionality through direct API calls in-process
This directory contains a thin facade we layer on top of the Llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected to support multiple GPU types as well as CPU
only support. The Ollama go build then embeds these different servers to support
different GPUs and settings at runtime.
If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```

View File

@@ -1,23 +1,63 @@
#include "ext_server.h"
#include <atomic>
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::atomic<bool> ext_server_running(false);
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;
// RAII wrapper for tracking in-flight recv calls
class atomicRecv {
public:
atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
++this->atomic;
}
~atomicRecv() {
--this->atomic;
}
private:
std::atomic<int> &atomic;
};
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
#if SERVER_VERBOSE != 1
log_disable();
#endif
recv_counter = 0;
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
LOG_TEE("system info: %s\n", llama_print_system_info());
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
log_set_target(stdout);
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
@@ -46,15 +86,31 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
params.model = sparams->model;
}
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
if (sparams->lora_adapters != NULL) {
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
}
if (sparams->mmproj != NULL) {
params.mmproj = std::string(sparams->mmproj);
}
#if defined(GGML_USE_CUBLAS)
// Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
LOG_TEE("Performing pre-initialization of GPU\n");
int id;
cudaError_t cudaErr = cudaGetDevice(&id);
if (cudaErr != cudaSuccess) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
return;
}
#endif
llama_backend_init(params.numa);
// load the model
@@ -83,18 +139,23 @@ void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
ext_server_running = true;
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
while (ext_server_running.load()) {
if (!llama->update_slots()) {
LOG_TEE(
"unexpected error in llama server update_slots - exiting main "
"loop\n");
break;
}
}
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_all_tasks_finished(std::bind(
&llama_server_context::run_on_all_tasks_finished, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
llama->queue_tasks.start_loop();
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
@@ -107,13 +168,22 @@ void llama_server_start() {
void llama_server_stop() {
assert(llama != NULL);
// TODO - too verbose, remove once things are solid
LOG_TEE("requesting llama server shutdown\n");
ext_server_running = false;
// Shutdown any in-flight requests and block incoming requests.
LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
shutting_down = true;
while (recv_counter.load() > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
// This may take a while for any pending tasks to drain
// TODO - consider a timeout to cancel tasks if it's taking too long
llama->queue_tasks.terminate();
ext_server_thread.join();
delete llama;
llama = NULL;
LOG_TEE("llama server shutdown complete\n");
shutting_down = false;
}
void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
@@ -121,8 +191,13 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
resp->id = -1;
resp->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
json data = json::parse(json_req);
resp->id = llama->request_completion(data, false, false, -1);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
@@ -140,16 +215,28 @@ void llama_server_completion_next_result(const int task_id,
resp->json_resp = NULL;
std::string result_json;
try {
task_result result = llama->next_result(task_id);
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (shutting_down) {
LOG_TEE("aborting completion due to shutdown %d\n", task_id);
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
resp->stop = true;
}
} catch (std::exception &e) {
resp->error = true;
@@ -180,6 +267,7 @@ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
@@ -197,6 +285,9 @@ void llama_server_tokenize(const char *json_req, char **json_resp,
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
@@ -230,6 +321,9 @@ void llama_server_detokenize(const char *json_req, char **json_resp,
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::string content;
if (body.count("tokens") != 0) {
@@ -257,6 +351,9 @@ void llama_server_embedding(const char *json_req, char **json_resp,
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
json prompt;
if (body.count("content") != 0) {
@@ -264,13 +361,16 @@ void llama_server_embedding(const char *json_req, char **json_resp,
} else {
prompt = "";
}
const int task_id = llama->request_completion(
{{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
task_result result = llama->next_result(task_id);
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());

View File

@@ -45,6 +45,7 @@ typedef struct ext_server_params {
bool embedding; // get only sentence embedding
ext_server_lora_adapter_t *lora_adapters;
char *mmproj;
bool verbose_logging; // Enable verbose logging of the server
} ext_server_params_t;
typedef struct ext_server_task_result {

View File

@@ -1,80 +0,0 @@
//go:build !windows
package llm
/*
#include <stdlib.h>
#include "ext_server.h"
*/
import "C"
import (
"context"
"github.com/jmorganca/ollama/api"
)
type llamaExtServer struct {
api.Options
}
func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.llama_server_init(sparams, err)
}
func (llm *llamaExtServer) llama_server_start() {
C.llama_server_start()
}
func (llm *llamaExtServer) llama_server_stop() {
C.llama_server_stop()
}
func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.llama_server_completion(json_req, resp)
}
func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.llama_server_completion_next_result(task_id, resp)
}
func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.llama_server_completion_cancel(task_id, err)
}
func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.llama_server_release_task_result(result)
}
func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_tokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_detokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_embedding(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.llama_server_release_json_resp(json_resp)
}
func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
server := &llamaExtServer{opts}
return newExtServer(server, model, adapters, projectors, numLayers, opts)
}
func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(ctx, llm, pred, fn)
}
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func (llm *llamaExtServer) Close() {
close(llm)
}

View File

@@ -1,12 +0,0 @@
package llm
import (
"github.com/jmorganca/ollama/api"
)
func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
// On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
// This ensures we can update the PATH at runtime to get everything loaded
return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, numLayers, opts)
}

View File

@@ -1,15 +1,46 @@
# common logic accross linux and darwin
init_vars() {
case "${GOARCH}" in
"amd64")
ARCH="x86_64"
;;
"arm64")
ARCH="arm64"
;;
*)
ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
esac
LLAMACPP_DIR=../llama.cpp
PATCHES="0001-Expose-callable-API-for-server.patch"
CMAKE_DEFS=""
CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
CMAKE_TARGETS="--target ext_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on"
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off"
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
fi
case $(uname -s) in
"Darwin")
LIB_EXT="dylib"
WHOLE_ARCHIVE="-Wl,-force_load"
NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}"
;;
"Linux")
LIB_EXT="so"
WHOLE_ARCHIVE="-Wl,--whole-archive"
NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
# Cross compiling not supported on linux - Use docker
GCC_ARCH=""
;;
*)
;;
esac
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
}
@@ -33,6 +64,19 @@ apply_patches() {
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
fi
if [ -n "$(ls -A ../patches/*.diff)" ]; then
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
fi
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
@@ -41,18 +85,41 @@ apply_patches() {
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
-lpthread -ldl -lm \
${EXTRA_LIBS}
}
install() {
rm -rf ${BUILD_DIR}/lib
mkdir -p ${BUILD_DIR}/lib
cp ${BUILD_DIR}/examples/server/libext_server.a ${BUILD_DIR}/lib
cp ${BUILD_DIR}/common/libcommon.a ${BUILD_DIR}/lib
cp ${BUILD_DIR}/libllama.a ${BUILD_DIR}/lib
cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib
compress_libs() {
echo "Compressing payloads to reduce overall binary size..."
pids=""
rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
gzip --best -f ${lib} &
pids+=" $!"
done
echo
for pid in ${pids}; do
wait $pid
done
echo "Finished compression"
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
if [ -n "$(ls -A ../patches/*.diff)" ]; then
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
fi
}

View File

@@ -9,14 +9,63 @@ set -o pipefail
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
git_module_setup
apply_patches
sign() {
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
case "${GOARCH}" in
"amd64")
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
compress_libs
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
compress_libs
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
compress_libs
;;
"arm64")
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
compress_libs
;;
*)
echo "GOARCH must be set"
@@ -25,8 +74,4 @@ case "${GOARCH}" in
;;
esac
git_module_setup
apply_patches
build
install
cleanup

View File

@@ -2,24 +2,25 @@
# This script is intended to run inside the go generate
# working directory must be llm/generate/
# First we build our default built-in library which will be linked into the CGO
# binary as a normal dependency. This default build is CPU based.
# First we build one or more CPU based LLM libraries
#
# Then we build a CUDA dynamic library (although statically linked with the CUDA
# library dependencies for maximum portability)
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. ROCm is particularly
# important to be a dynamic lib even if it's the only GPU library detected because
# we can't redistribute the objectfiles but must rely on dynamic libraries at
# runtime, which could lead the server not to start if not present.
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCM
# libraries are quite large, and also dynamically load data files at runtime
# which in turn are large, so we don't attempt to cary them as payload
set -ex
set -o pipefail
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
if [ -n "${AMDGPU_TARGETS}" ]; then
echo "${AMDGPU_TARGETS}"
return
fi
GPU_LIST=(
"gfx803"
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
@@ -39,8 +40,13 @@ amdGPUs() {
}
echo "Starting linux generate script"
if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
export CUDACXX=/usr/local/cuda/bin/nvcc
if [ -z "${CUDACXX}" ]; then
if [ -x /usr/local/cuda/bin/nvcc ]; then
export CUDACXX=/usr/local/cuda/bin/nvcc
else
# Try the default location in case it exists
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
@@ -48,38 +54,115 @@ init_vars
git_module_setup
apply_patches
#
# CPU first for the default library
#
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
compress_libs
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# Note: the following seem to yield slower results than AVX2 - ymmv
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
build
install
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
compress_libs
fi
# Placeholder to keep go embed happy until we start building dynamic CPU lib variants
touch ${BUILD_DIR}/lib/dummy.so
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
fi
if [ -d /usr/local/cuda/lib64/ ]; then
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
fi
fi
else
echo "Skipping CPU generation step as requested"
fi
# If needed, look for the default CUDA toolkit location
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
CUDA_LIB_DIR=/usr/local/cuda/lib64
fi
# If needed, look for CUDA on Arch Linux
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi
# Allow override in case libcudart is in the wrong place
if [ -z "${CUDART_LIB_DIR}" ]; then
CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi
if [ -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda"
CUDA_LIB_DIR=/usr/local/cuda/lib64
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
install
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/lib/libext_server.a \
${BUILD_DIR}/lib/libcommon.a \
${BUILD_DIR}/lib/libllama.a \
-Wl,--no-whole-archive \
${CUDA_LIB_DIR}/libcudart_static.a \
${CUDA_LIB_DIR}/libcublas_static.a \
${CUDA_LIB_DIR}/libcublasLt_static.a \
${CUDA_LIB_DIR}/libcudadevrt.a \
${CUDA_LIB_DIR}/libculibos.a \
-lrt -lpthread -ldl -lstdc++ -lm
# Cary the CUDA libs as payloads to help reduce dependency burden on users
#
# TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
fi
done
compress_libs
fi
if [ -z "${ROCM_PATH}" ]; then
@@ -96,21 +179,18 @@ fi
if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
install
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/lib/libext_server.a \
${BUILD_DIR}/lib/libcommon.a \
${BUILD_DIR}/lib/libllama.a \
-Wl,--no-whole-archive \
-lrt -lpthread -ldl -lstdc++ -lm \
-L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
-Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
-lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
# Note: the ROCM libs and runtime library files are too large to embed, so we depend on
# them being present at runtime on the host
compress_libs
fi
cleanup

View File

@@ -3,10 +3,11 @@
$ErrorActionPreference = "Stop"
function init_vars {
$script:SRC_DIR = $(resolve-path "..\..\")
$script:llamacppDir = "../llama.cpp"
$script:patches = @("0001-Expose-callable-API-for-server.patch")
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
$script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A", "x64")
$script:cmakeTargets = @("ext_server")
$script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
$script:config = "RelWithDebInfo"
@@ -14,6 +15,28 @@ function init_vars {
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
$script:config = "Release"
}
# Try to find the CUDA dir
if ($env:CUDA_LIB_DIR -eq $null) {
$d=(get-command -ea 'silentlycontinue' nvcc).path
if ($d -ne $null) {
$script:CUDA_LIB_DIR=($d| split-path -parent)
$script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
}
} else {
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
} else {
$script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
}
# Note: 10 Windows Kit signtool crashes with GCP's plugin
${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
if ("${env:KEY_CONTAINER}") {
${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
}
}
function git_module_setup {
@@ -29,6 +52,29 @@ function apply_patches {
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
}
# Apply temporary patches until fix is upstream
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) {
git checkout $file
}
}
# Apply each patch
foreach ($patch in $patches) {
Set-Location -Path ${script:llamacppDir}
git apply $patch.FullName
}
# Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main('
@@ -48,41 +94,113 @@ function build {
function install {
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
dumpbin /dependents "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" | select-string ".dll"
if ($script:DUMPBIN -ne $null) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
}
function sign {
if ("${env:KEY_CONTAINER}") {
write-host "Signing ${script:buildDir}/lib/*.dll"
foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function compress_libs {
if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files"
return
}
write-host "Compressing dlls..."
$libs = dir "${script:buildDir}/lib/*.dll"
foreach ($file in $libs) {
& "$script:GZIP" --best -f $file
}
}
function cleanup {
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) {
git checkout $file
}
}
Set-Location "${script:llamacppDir}/examples/server"
git checkout CMakeLists.txt server.cpp
}
init_vars
git_module_setup
apply_patches
# first build CPU based
$script:buildDir="${script:llamacppDir}/build/windows/cpu"
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
sign
compress_libs
# Then build cuda as a dynamically loaded library
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/cuda"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
sign
compress_libs
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
sign
compress_libs
if ($null -ne $script:CUDA_LIB_DIR) {
# Then build cuda as a dynamically loaded library
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
build
install
sign
compress_libs
}
# TODO - actually implement ROCm support on windows
$script:buildDir="${script:llamacppDir}/build/windows/rocm"
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
echo $null >> "${script:buildDir}/lib/.generated"
cleanup
write-host "`ngo generate completed"
write-host "`ngo generate completed"

View File

@@ -78,7 +78,12 @@ type model interface {
ModelFamily() string
ModelType() string
FileType() string
NumLayers() int64
NumLayers() uint32
NumGQA() uint32
NumEmbed() uint32
NumHead() uint32
NumHeadKv() uint32
NumCtx() uint32
}
type container interface {
@@ -94,9 +99,9 @@ func (c *containerLORA) Name() string {
return "ggla"
}
func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
var version uint32
binary.Read(ro, binary.LittleEndian, &version)
binary.Read(rso, binary.LittleEndian, &version)
switch version {
case 1:
@@ -107,7 +112,7 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
rso.Seek(0, io.SeekEnd)
return nil, nil
}

View File

@@ -69,12 +69,65 @@ type tensor struct {
name string
kind uint32
offset uint64
size uint64
// shape is the number of elements in each dimension
shape [4]uint64
}
func (t tensor) blockSize() uint64 {
switch {
case t.kind < 2:
return 1
case t.kind < 10:
return 32
default:
return 256
}
}
func (t tensor) typeSize() uint64 {
blockSize := t.blockSize()
switch t.kind {
case 0: // FP32
return 4
case 1: // FP16
return 2
case 2: // Q4_0
return 2 + blockSize/2
case 3: // Q4_1
return 2 + 2 + blockSize/2
case 6: // Q5_0
return 2 + 4 + blockSize/2
case 7: // Q5_1
return 2 + 2 + 4 + blockSize/2
case 8: // Q8_0
return 2 + blockSize
case 9: // Q8_1
return 4 + 4 + blockSize
case 10: // Q2_K
return blockSize/16 + blockSize/4 + 2 + 2
case 11: // Q3_K
return blockSize/8 + blockSize/4 + 12 + 2
case 12: // Q4_K
return 2 + 2 + 12 + blockSize/2
case 13: // Q5_K
return 2 + 2 + 12 + blockSize/8 + blockSize/2
case 14: // Q6_K
return blockSize/2 + blockSize/4 + blockSize/16 + 2
default:
return 0
}
}
func (t tensor) parameters() uint64 {
return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3]
}
func (t tensor) size() uint64 {
return t.parameters() * t.typeSize() / t.blockSize()
}
type ggufModel struct {
*containerGGUF
@@ -201,61 +254,15 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
shape[i] = llm.readU64(rso)
}
kind := llm.readU32(rso)
offset := llm.readU64(rso)
var blockSize uint64
switch {
case kind < 2:
blockSize = 1
case kind < 10:
blockSize = 32
default:
blockSize = 256
}
var typeSize uint64
switch kind {
case 0: // FP32
typeSize = 4
case 1: // FP16
typeSize = 2
case 2: // Q4_0
typeSize = 2 + blockSize/2
case 3: // Q4_1
typeSize = 2 + 2 + blockSize/2
case 6: // Q5_0
typeSize = 2 + 4 + blockSize/2
case 7: // Q5_1
typeSize = 2 + 2 + 4 + blockSize/2
case 8: // Q8_0
typeSize = 2 + blockSize
case 9: // Q8_1
typeSize = 4 + 4 + blockSize
case 10: // Q2_K
typeSize = blockSize/16 + blockSize/4 + 2 + 2
case 11: // Q3_K
typeSize = blockSize/8 + blockSize/4 + 12 + 2
case 12: // Q4_K
typeSize = 2 + 2 + 12 + blockSize/2
case 13: // Q5_K
typeSize = 2 + 2 + 12 + blockSize/8 + blockSize/2
case 14: // Q6_K
typeSize = blockSize/2 + blockSize/4 + blockSize/16 + 2
}
parameters := shape[0] * shape[1] * shape[2] * shape[3]
size := parameters * typeSize / blockSize
llm.tensors = append(llm.tensors, tensor{
tensor := tensor{
name: name,
kind: kind,
offset: offset,
size: size,
kind: llm.readU32(rso),
offset: llm.readU64(rso),
shape: shape,
})
}
llm.parameters += parameters
llm.tensors = append(llm.tensors, tensor)
llm.parameters += tensor.parameters()
}
alignment, ok := llm.kv["general.alignment"].(uint32)
@@ -265,21 +272,65 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent)
for _, tensor := range llm.tensors {
padded := (int64(tensor.size) + int64(alignment) - 1) & ^(int64(alignment) - 1)
padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
rso.Seek(padded, io.SeekCurrent)
}
return nil
}
func (llm *ggufModel) NumLayers() int64 {
func (llm *ggufModel) NumLayers() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
if !exists {
return 0
}
v := value.(uint32)
return int64(v)
return value.(uint32)
}
func (llm *ggufModel) NumHead() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumEmbed() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumHeadKv() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumCtx() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumGQA() uint32 {
numHeadKv := llm.NumHeadKv()
if numHeadKv == 0 {
return 0
}
return llm.NumHead() / numHeadKv
}
func (llm ggufModel) readU8(r io.Reader) uint8 {

Some files were not shown because too many files have changed in this diff Show More