Compare commits

...

160 Commits

Author SHA1 Message Date
hoyyeva
83f6db367d Update docs/modelfile.md
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-04-01 15:45:39 -07:00
hoyyeva
c7294265f1 Update docs/modelfile.md
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-04-01 15:45:34 -07:00
Eva Ho
af4ed1d401 update modelfile readme 2024-03-27 10:36:18 -04:00
Eva Ho
186a4a0b65 update modelfile readme 2024-03-27 10:28:45 -04:00
Jeffrey Morgan
913306f4fd Detect arrow keys on windows (#3363)
* detect arrow keys on windows
* add some helpful comments
2024-03-26 18:21:56 -04:00
Jeffrey Morgan
f5ca7f8c8e add license in file header for vendored llama.cpp code (#3351) 2024-03-26 16:23:23 -04:00
Jeffrey Morgan
856b8ec131 remove need for $VSINSTALLDIR since build will fail if ninja cannot be found (#3350) 2024-03-26 16:23:16 -04:00
Patrick Devine
1b272d5bcd change github.com/jmorganca/ollama to github.com/ollama/ollama (#3347) 2024-03-26 13:04:17 -07:00
Christophe Dervieux
29715dbca7 malformed markdown link (#3358) 2024-03-26 10:46:36 -04:00
Daniel Hiltgen
54a028d07f Merge pull request #3356 from dhiltgen/fix_arm_linux
Switch runner for final release job
2024-03-25 20:54:46 -07:00
Daniel Hiltgen
f83e4db365 Switch runner for final release job
The manifest and tagging step use a lot of disk space
2024-03-25 20:51:40 -07:00
Daniel Hiltgen
3b5866a233 Merge pull request #3353 from dhiltgen/fix_arm_linux
Use Rocky Linux Vault to get GCC 10.2 installed
2024-03-25 19:38:56 -07:00
Daniel Hiltgen
b8c2be6142 Use Rocky Linux Vault to get GCC 10.2 installed
This should hopefully only be a temporary workaround until Rocky 8
picks up GCC 10.4 which fixes the NVCC bug
2024-03-25 19:18:50 -07:00
Daniel Hiltgen
e0319bd78d Revert "Switch arm cuda base image to centos 7"
This reverts commit 5dacc1ebe8.
2024-03-25 19:01:11 -07:00
Daniel Hiltgen
b31ed7f031 Merge pull request #3352 from dhiltgen/fix_arm_linux
Switch arm cuda base image to centos 7
2024-03-25 16:13:10 -07:00
Daniel Hiltgen
5dacc1ebe8 Switch arm cuda base image to centos 7
We had started using rocky linux 8, but they've updated to GCC 10.3,
which breaks NVCC.  10.2 is compatible (or 10.4, but that's not
available from rocky linux 8 repos yet)
2024-03-25 15:57:08 -07:00
Daniel Hiltgen
c2712b5566 Merge pull request #3348 from dhiltgen/bump_llamacpp
Bump llama.cpp to b2527
2024-03-25 14:15:53 -07:00
Daniel Hiltgen
8091ef2eeb Bump llama.cpp to b2527 2024-03-25 13:47:44 -07:00
Jeffrey Morgan
f38b705dc7 Fix ROCm link in development.md 2024-03-25 16:32:44 -04:00
Daniel Hiltgen
560be5e0b6 Merge pull request #3308 from dhiltgen/bump_more
Bump llama.cpp to b2510
2024-03-25 12:56:12 -07:00
Daniel Hiltgen
4a1c76b3aa Merge pull request #3331 from dhiltgen/integration_testing
Integration tests conditionally pull
2024-03-25 12:48:51 -07:00
Daniel Hiltgen
28a64e23ca Merge pull request #2279 from remy415/main
Add support for libcudart.so for CUDA devices (Adds Jetson support)
2024-03-25 12:46:28 -07:00
Niclas Pahlfer
92d74e2f59 adds ooo to community integrations (#1623)
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 15:08:33 -04:00
Herval Freire
6f8f57dd1d Add cliobot to ollama supported list (#1873)
* Update README.md

* Update README.md

---------

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 15:07:19 -04:00
Chenhe Gu
b2fa68b0ea Add Dify.AI to community integrations (#1944)
Dify.AI is a model-agnostic LLMOps platform for building and managing LLM applications.

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 15:06:39 -04:00
Marco Antônio
3767d5ef0d enh: add ollero.nvim to community applications (#1905)
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 15:06:08 -04:00
Ani Betts
9fed85bc8b Add typechat-cli to Terminal apps (#2428)
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 15:05:04 -04:00
Miguel
4501bc0913 add new Web & Desktop link in readme for alpaca webui (#2881)
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 15:00:18 -04:00
Danny Avila
57ba519e63 Add LibreChat to Web & Desktop Apps (#2918) 2024-03-25 14:59:18 -04:00
enoch1118
d98d322d24 Add Community Integration: OllamaGUI (#2927)
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 14:58:28 -04:00
fly2tomato
0c3ec74cf1 Add Community Integration: OpenAOE (#2946)
* Update README.md

* Update README.md

---------

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 14:57:40 -04:00
tusharhero
42ae8359fa docs: Add AI telegram to Community Integrations. (#3033) 2024-03-25 14:56:42 -04:00
Timothy Carambat
e4b76dfb76 docs: Add AnythingLLM to README as integration option (#3145)
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 14:54:48 -04:00
Jikku Jose
2c56517494 Add Saddle (#3178) 2024-03-25 14:54:09 -04:00
Yusuf Can Bayrak
cfbc1b152b tlm added to README.md terminal section. (#3274) 2024-03-25 14:53:26 -04:00
RAPID ARCHITECT
9305ac1b2e Update README.md (#3288)
Added Ollama Basic chat based on hyperdiv

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-03-25 14:52:25 -04:00
drazdra
45d6292959 Update README.md (#3338)
adding drazdra/ollama-chats to the list of UI :)
2024-03-25 14:50:51 -04:00
Blake Mizerany
22921a3969 doc: specify ADAPTER is optional (#3333) 2024-03-25 09:43:19 -07:00
Daniel Hiltgen
7b6cbc10ec Integration tests conditionally pull
If images aren't present, pull them.
Also fixes the expected responses
2024-03-25 08:57:45 -07:00
Jeremy
dfc6721b20 add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 11:07:44 -04:00
Blake Mizerany
acfa2b9422 llm: prevent race appending to slice (#3320) 2024-03-24 11:35:54 -07:00
Daniel Hiltgen
2c390a73ac Merge pull request #3282 from dhiltgen/gpu_docs
Add docs for GPU selection and nvidia uvm workaround
2024-03-24 19:15:03 +01:00
Daniel Hiltgen
3e30c75f3e Bump llama.cpp to b2510 2024-03-23 19:55:56 +01:00
Eddú Meléndez Gonzales
7e430ff352 Add Testcontainers into Libraries section (#3291)
Testcontainers provides a module for Ollama.
2024-03-23 19:55:25 +01:00
Daniel Hiltgen
1784113ef5 Merge pull request #3309 from dhiltgen/integration_testing
Revamp go based integration tests
2024-03-23 19:08:49 +01:00
Daniel Hiltgen
949b6c01e0 Revamp go based integration tests
This uplevels the integration tests to run the server which can allow
testing an existing server, or a remote server.
2024-03-23 14:24:18 +01:00
jmorganca
38daf0a252 rename .gitattributes 2024-03-23 12:40:31 +01:00
Daniel Hiltgen
43799532c1 Bump llama.cpp to b2474
The release just before ggml-cuda.cu refactoring
2024-03-23 09:54:56 +01:00
Daniel Hiltgen
d8fdbfd8da Add docs for GPU selection and nvidia uvm workaround 2024-03-21 11:52:54 +01:00
Bruce MacDonald
a5ba0fcf78 doc: faq gpu compatibility (#3142) 2024-03-21 05:21:34 -04:00
Jeffrey Morgan
3a30bf56dc Update faq.md 2024-03-20 17:48:39 +01:00
Daniel Hiltgen
a1c0a48524 Merge pull request #3122 from dhiltgen/better_tmp_cleanup
Better tmpdir cleanup
2024-03-20 16:28:03 +01:00
Daniel Hiltgen
74788b487c Better tmpdir cleanup
If expanding the runners fails, don't leave a corrupt/incomplete payloads dir
We now write a pid file out to the tmpdir, which allows us to scan for stale tmpdirs
and remove this as long as there isn't still a process running.
2024-03-20 16:03:19 +01:00
Jeffrey Morgan
7ed3e94105 Update faq.md 2024-03-18 10:24:39 +01:00
jmorganca
2297ad39da update faq.md 2024-03-18 10:17:59 +01:00
Michael Yang
01cff6136d Merge pull request #3217 from ollama/mxyng/cleanup
remove global
2024-03-18 02:13:30 -07:00
Michael Yang
3c4ad0ecab dyn global 2024-03-18 09:45:45 +01:00
Michael Yang
22f326464e Merge pull request #3083 from ollama/mxyng/refactor-readseeker
refactor readseeker
2024-03-16 12:08:56 -07:00
Jeffrey Morgan
e95ffc7448 llama: remove server static assets (#3174) 2024-03-15 19:24:12 -07:00
Jeffrey Morgan
2dce1ab40b add llm/ext_server directory to linguist-vendored (#3173) 2024-03-15 17:46:46 -07:00
Daniel Hiltgen
f4b31c2d53 Merge pull request #3111 from alitrack/main
Update ollama.iss
2024-03-15 16:46:59 -07:00
Daniel Hiltgen
ab3456207b Merge pull request #3028 from ollama/ci_release
CI release process
2024-03-15 16:40:54 -07:00
Daniel Hiltgen
6ad414f31e Merge pull request #3086 from dhiltgen/import_server
Import server.cpp to retain llava support
2024-03-15 16:10:35 -07:00
Daniel Hiltgen
052b5a3b77 Merge pull request #3171 from dhiltgen/rocm_94x
Add Radeon gfx940-942 GPU support
2024-03-15 15:58:33 -07:00
Daniel Hiltgen
d4c10df2b0 Add Radeon gfx940-942 GPU support 2024-03-15 15:34:58 -07:00
Daniel Hiltgen
540f4af45f Wire up more complete CI for releases
Flesh out our github actions CI so we can build official releaes.
2024-03-15 12:37:36 -07:00
Blake Mizerany
6ce37e4d96 llm,readline: use errors.Is instead of simple == check (#3161)
This fixes some brittle, simple equality checks to use errors.Is. Since
go1.13, errors.Is is the idiomatic way to check for errors.

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-03-15 07:14:12 -07:00
Blake Mizerany
703684a82a server: replace blob prefix separator from ':' to '-' (#3146)
This fixes issues with blob file names that contain ':' characters to be rejected by file systems that do not support them.
2024-03-14 20:18:06 -07:00
Daniel Hiltgen
6459377ae0 Add ROCm support to linux install script (#2966) 2024-03-14 18:00:16 -07:00
Blake Mizerany
8546dd3d72 .github: fix model and feature request yml (#3155) 2024-03-14 15:26:06 -07:00
Blake Mizerany
87100be5e0 .github: add issue templates (#3143) 2024-03-14 15:19:10 -07:00
Michael Yang
e87c780ff9 Merge pull request #3149 from ollama/mxyng/fix-memory-leak
fix: clip memory leak
2024-03-14 13:34:15 -07:00
Michael Yang
291c663865 fix: clip memory leak 2024-03-14 13:12:42 -07:00
Daniel Hiltgen
da20786e3e Merge pull request #3068 from dhiltgen/win_pipe
Use stdin for term discovery on windows
2024-03-14 11:55:19 -07:00
Jeffrey Morgan
5ce997a7b9 Update README.md 2024-03-13 21:12:17 -07:00
Jeffrey Morgan
672ffe9b7d add OLLAMA_KEEP_ALIVE to environment variable docs for ollama serve (#3127) 2024-03-13 14:35:33 -07:00
Patrick Devine
47cfe58af5 Default Keep Alive environment variable (#3094)
---------

Co-authored-by: Chris-AS1 <8493773+Chris-AS1@users.noreply.github.com>
2024-03-13 13:29:40 -07:00
Daniel Hiltgen
c1a81c6fe3 Use stdin for term discovery on windows
When you feed input to the cmd via a pipe it no longer reports a warning
2024-03-13 10:37:31 -07:00
Steven Lee
152ab524c2 Update ollama.iss
add arm64 support
2024-03-13 20:15:45 +08:00
Jeffrey Morgan
e72c567cfd restore locale patch (#3091) 2024-03-12 22:08:13 -07:00
Bruce MacDonald
3e22611200 token repeat limit for prediction requests (#3080) 2024-03-12 22:08:25 -04:00
Daniel Hiltgen
a54d4a28dc Merge pull request #3088 from dhiltgen/rocm_igpu_linux
Fix iGPU detection for linux
2024-03-12 17:20:27 -07:00
Daniel Hiltgen
82b0c7c27e Fix iGPU detection for linux
This fixes a few bugs in the new sysfs discovery logic.  iGPUs are now
correctly identified by their <1G VRAM reported.  the sysfs IDs are off
by one compared to what HIP wants due to the CPU being reported
in amdgpu, but HIP only cares about GPUs.
2024-03-12 16:57:19 -07:00
Patrick Devine
ba7cf7fb66 add more docs on for the modelfile message command (#3087) 2024-03-12 16:41:41 -07:00
Bruce MacDonald
2f804068bd warn when json format is expected but not mentioned in prompt (#3081) 2024-03-12 19:07:11 -04:00
Daniel Hiltgen
85129d3a32 Adapt our build for imported server.cpp 2024-03-12 14:57:15 -07:00
Daniel Hiltgen
9ac6440da3 Import server.cpp as of b2356 2024-03-12 13:58:06 -07:00
Michael Yang
0085297928 refactor readseeker 2024-03-12 12:54:18 -07:00
Daniel Hiltgen
34d00f90b1 Merge pull request #3070 from dhiltgen/visible_devices
Add docs explaining GPU selection env vars
2024-03-12 11:36:46 -07:00
Daniel Hiltgen
b53229a2ed Add docs explaining GPU selection env vars 2024-03-12 11:33:06 -07:00
racerole
53c107e20e chore: fix typo (#3073)
Signed-off-by: racerole <jiangyifeng@outlook.com>
2024-03-12 14:09:22 -04:00
mofanke
51578d8573 fix gpu_info_cuda.c compile warning (#3077) 2024-03-12 14:08:40 -04:00
Jeffrey Morgan
b5fcd9d3aa use -trimpath when building releases (#3069) 2024-03-11 15:58:46 -07:00
Bruce MacDonald
b80661e8c7 relay load model errors to the client (#3065) 2024-03-11 16:48:27 -04:00
Jeffrey Morgan
6d3adfbea2 Update troubleshooting.md 2024-03-11 13:22:28 -07:00
Jeffrey Morgan
369eda65f5 update llama.cpp submodule to ceca1ae (#3064) 2024-03-11 12:57:48 -07:00
Michael Yang
f878e91070 Merge pull request #3044 from ollama/mxyng/fix-convert-shape
convert: fix shape
2024-03-11 09:56:57 -07:00
Daniel Hiltgen
0d651478e4 Merge pull request #3056 from dhiltgen/rocm_link_clash
Avoid rocm runner and dependency clash
2024-03-11 09:48:48 -07:00
Michael Yang
9ea492f1ce convert: fix shape 2024-03-11 09:41:01 -07:00
Daniel Hiltgen
bc13da2bfe Avoid rocm runner and dependency clash
Putting the rocm symlink next to the runners is risky.  This moves
the payloads into a subdir to avoid potential clashes.
2024-03-11 09:33:22 -07:00
Jeffrey Morgan
41b00b9856 fix 03-locale.diff 2024-03-10 16:21:05 -07:00
Daniel Hiltgen
c2a8ed48e7 Merge pull request #3048 from dhiltgen/harden_rocm_deps
Harden for deps file being empty (or short)
2024-03-10 15:17:22 -07:00
Daniel Hiltgen
3dc1bb6a35 Harden for deps file being empty (or short) 2024-03-10 14:45:38 -07:00
Daniel Hiltgen
7865a6996a Merge pull request #3046 from dhiltgen/rocm_search_paths
Add ollama executable peer dir for rocm
2024-03-10 12:30:56 -07:00
Daniel Hiltgen
00ec269321 Add ollama executable peer dir for rocm
This allows people who package up ollama on their own to place
the rocm dependencies in a peer directory to the ollama executable
much like our windows install flow.
2024-03-10 12:16:30 -07:00
Jeffrey Morgan
908005d90b patch: use default locale in wpm tokenizer (#3034) 2024-03-09 21:12:12 -08:00
Jeffrey Morgan
cdf65e793f only copy deps for amd64 in build_linux.sh 2024-03-09 17:55:22 -08:00
Daniel Hiltgen
82ca694d68 Rename ROCm deps file to avoid confusion (#3025) 2024-03-09 17:48:38 -08:00
Jeffrey Morgan
5017a15bcb add macapp to .dockerignore 2024-03-09 16:07:06 -08:00
Jeffrey Morgan
e11668aa07 add bundle_metal and cleanup_metal funtions to gen_darwin.sh 2024-03-09 16:04:57 -08:00
Jeffrey Morgan
0bd0f4a29c tidy cleanup logs 2024-03-09 15:56:48 -08:00
Jeffrey Morgan
1ffb1e2874 update llama.cpp submodule to 77d1ac7 (#3030) 2024-03-09 15:55:34 -08:00
Daniel Hiltgen
0a7844413c Merge pull request #3026 from dhiltgen/win_rocm_docs
Doc how to set up ROCm builds on windows
2024-03-09 14:17:19 -08:00
Jeffrey Morgan
f9cd55c70b disable gpu for certain model architectures and fix divide-by-zero on memory estimation 2024-03-09 12:51:38 -08:00
Daniel Hiltgen
0fdebb34a9 Doc how to set up ROCm builds on windows 2024-03-09 11:29:45 -08:00
Daniel Hiltgen
ac64cd4ef9 Merge pull request #3008 from dhiltgen/no_more_idempotent
Finish unwinding idempotent payload logic
2024-03-09 09:13:24 -08:00
Daniel Hiltgen
4a5c9b8035 Finish unwinding idempotent payload logic
The recent ROCm change partially removed idempotent
payloads, but the ggml-metal.metal file for mac was still
idempotent.  This finishes switching to always extract
the payloads, and now that idempotentcy is gone, the
version directory is no longer useful.
2024-03-09 08:34:39 -08:00
Jeffrey Morgan
efe5617b64 update llama.cpp submodule to c2101a2 (#3020) 2024-03-09 00:44:50 -08:00
Jeffrey Morgan
5b3fad9636 separate out isLocalIP 2024-03-09 00:22:08 -08:00
Jeffrey Morgan
bfec2c6e10 simplify host checks 2024-03-08 23:29:53 -08:00
Jeffrey Morgan
5c143af726 add additional allowed hosts 2024-03-08 23:23:59 -08:00
Jeffrey Morgan
6c0af2599e Update docs README.md and table of contents 2024-03-08 22:45:11 -08:00
Jeffrey Morgan
fc8c044584 add allowed host middleware and remove workDir middleware (#3018) 2024-03-08 22:23:47 -08:00
Michael Yang
ecc133d843 Merge pull request #3014 from ollama/mxyng/decode-ggla 2024-03-08 16:14:53 -08:00
Michael Yang
76bdebbadf decode ggla 2024-03-08 15:46:25 -08:00
Michael Yang
18979ad4a1 convert: fix default shape 2024-03-08 15:42:48 -08:00
Michael Yang
8e0ef931d8 Merge pull request #2990 from ollama/mxyng/default-term-size
fix: default terminal width, height
2024-03-08 15:20:54 -08:00
Daniel Hiltgen
280da44522 Merge pull request #2988 from dhiltgen/rocm_docs
Refined ROCm troubleshooting docs
2024-03-08 13:33:30 -08:00
Bruce MacDonald
0cebc79cba fix: allow importing a model from name reference (#3005) 2024-03-08 12:27:47 -05:00
Jeffrey Morgan
0e4669b04f update llama.cpp submodule to 6cdabe6 (#2999) 2024-03-08 00:26:20 -08:00
Jeffrey Morgan
b886bec3f9 Update api.md 2024-03-07 23:27:51 -08:00
Jeffrey Morgan
fc06205971 Revert "adjust download and upload concurrency based on available bandwidth" (#2995) 2024-03-07 18:10:16 -08:00
Blake Mizerany
2ada81e068 cmd: tighten up env var usage sections (#2962)
Also, document OLLAMA_HOST client semantics per command that honors it.
This looks nicer than having a general puprose environment variable
section in the root usage which was showing up after the "addition help
topics" section outputed by Cobra's default template.

It was decided this was easier to work with than using a custom template
for Cobra right now.
2024-03-07 13:57:07 -08:00
Michael Yang
b1e74d4fda default terminal width, height 2024-03-07 11:35:42 -08:00
Michael Yang
f678f5c5c3 Merge pull request #2991 from ollama/mxyng/fix-ci
fix ci
2024-03-07 11:35:06 -08:00
Michael Yang
2cb74e23fb fix ci 2024-03-07 11:33:49 -08:00
Daniel Hiltgen
69f0227813 Refined ROCm troubleshooting docs 2024-03-07 11:22:37 -08:00
Daniel Hiltgen
3c8df3808b Merge pull request #2885 from dhiltgen/rocm_v6_only
Revamp ROCm support
2024-03-07 10:51:00 -08:00
Michael Yang
7d564835c2 Merge pull request #2985 from ollama/rm-empty-examples
remove empty examples
2024-03-07 10:49:40 -08:00
Michael Yang
72431031d9 no ci test on docs, examples 2024-03-07 10:44:48 -08:00
Michael Yang
6041abb5b2 remove empty examples 2024-03-07 10:40:32 -08:00
Daniel Hiltgen
6c5ccb11f9 Revamp ROCm support
This refines where we extract the LLM libraries to by adding a new
OLLAMA_HOME env var, that defaults to `~/.ollama` The logic was already
idempotenent, so this should speed up startups after the first time a
new release is deployed.  It also cleans up after itself.

We now build only a single ROCm version (latest major) on both windows
and linux.  Given the large size of ROCms tensor files, we split the
dependency out.  It's bundled into the installer on windows, and a
separate download on windows.  The linux install script is now smart and
detects the presence of AMD GPUs and looks to see if rocm v6 is already
present, and if not, then downloads our dependency tar file.

For Linux discovery, we now use sysfs and check each GPU against what
ROCm supports so we can degrade to CPU gracefully instead of having
llama.cpp+rocm assert/crash on us.  For Windows, we now use go's windows
dynamic library loading logic to access the amdhip64.dll APIs to query
the GPU information.
2024-03-07 10:36:50 -08:00
Michael Yang
2e20110e50 Merge pull request #2221 from ollama/mxyng/up-down-ccy
adjust download and upload concurrency based on available bandwidth
2024-03-07 09:27:33 -08:00
Daniel Hiltgen
82ddc3e441 Merge pull request #2964 from dhiltgen/mem_limit_var
Allow setting max vram for workarounds
2024-03-07 09:25:44 -08:00
Jeffrey Morgan
d481fb3cc8 update go to 1.22 in other places (#2975) 2024-03-07 07:39:49 -08:00
DJ Johnson
23ee633252 docs: Add LLM-X to Web Integration section (#2759) 2024-03-07 10:11:53 -05:00
John
23ebe8fe11 fix some typos (#2973)
Signed-off-by: hishope <csqiye@126.com>
2024-03-06 22:50:11 -08:00
Patrick Devine
2c017ca441 Convert Safetensors to an Ollama model (#2824) 2024-03-06 21:01:51 -08:00
Daniel Hiltgen
be330174dd Allow setting max vram for workarounds
Until we get all the memory calculations correct, this can provide
and escape valve for users to workaround out of memory crashes.
2024-03-06 17:15:06 -08:00
Blake Mizerany
0ded7fdc4b cmd: document environment variables for serve command
Updates #2944
2024-03-06 13:48:46 -08:00
Leo
2103a5073c Add Odin Runes, a Feature-Rich Java UI for Ollama, to README (#2440)
* Add Odin Runes to README

Add Odin Runes to README

This commit adds Odin Runes to the "Community Integrations" section of the README. Odin Runes is a Java-based GPT client designed to provide seamless interaction with GPT models, enhancing productivity in prompt engineering and text generation tasks. This addition highlights the integration between Odin Runes and Ollama, offering users the flexibility to leverage large language models locally within their development workflow.

* Update README.md

this commit applies the comments of the reviewer.
2024-03-06 11:57:49 -08:00
Jeffrey Morgan
ce9f7c4674 Update api.md 2024-03-05 13:13:23 -08:00
Anders Rex
e5596c1944 Add NotesOllama to Community Integrations (#2909) 2024-03-04 01:18:10 -08:00
Timothy Graupmann
9bc3fee694 Added community link for Ollama Copilot (#2582)
* Added community link for Ollama Copilot

* Update README.md

---------

Co-authored-by: Michael <mchiang0610@users.noreply.github.com>
2024-03-04 00:40:36 -08:00
Michael Yang
084d846621 refactor 2024-02-21 13:42:48 -08:00
Michael Yang
6a4b994433 lint 2024-02-21 13:42:48 -08:00
Michael Yang
bea007deb7 use LimitGroup for uploads 2024-02-21 13:42:48 -08:00
Michael Yang
074934be03 adjust group limit based on download speed 2024-02-21 13:42:48 -08:00
Michael Yang
0de12368a0 add new LimitGroup for dynamic concurrency 2024-02-21 13:42:48 -08:00
Michael Yang
917bd61084 refactor download run 2024-02-21 13:42:46 -08:00
124 changed files with 44230 additions and 1746 deletions

View File

@@ -1,8 +1,9 @@
.vscode
ollama
app
macapp
dist
llm/llama.cpp
.env
.cache
test_data
test_data

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
llm/ext_server/* linguist-vendored

View File

@@ -0,0 +1,18 @@
name: Model request
description: Request a new model for the library
labels: [mr]
body:
- type: markdown
attributes:
value: |
Please check if your Model request is [already available](https://ollama.com/search) or that you cannot [import it](https://github.com/ollama/ollama/blob/main/docs/import.md#import-a-model) yourself.
Tell us about which Model you'd like to see in the library!
- type: textarea
id: problem
attributes:
label: What model would you like?
description: Please provide a link to the model.
- type: markdown
attributes:
value: |
Thanks for filing a model request!

View File

@@ -0,0 +1,41 @@
name: Feature request
description: Propose a new feature
labels: [needs-triage, fr]
body:
- type: markdown
attributes:
value: |
Please check if your feature request is [already filed](https://github.com/ollama/ollama/issues).
Tell us about your idea!
- type: textarea
id: problem
attributes:
label: What are you trying to do?
description: Tell us about the problem you're trying to solve.
validations:
required: false
- type: textarea
id: solution
attributes:
label: How should we solve this?
description: If you have an idea of how you'd like to see this feature work, let us know.
validations:
required: false
- type: textarea
id: alternative
attributes:
label: What is the impact of not solving this?
description: (How) Are you currently working around the issue?
validations:
required: false
- type: textarea
id: context
attributes:
label: Anything else?
description: Any additional context to share, e.g., links
validations:
required: false
- type: markdown
attributes:
value: |
Thanks for filing a feature request!

125
.github/ISSUE_TEMPLATE/90_bug_report.yml vendored Normal file
View File

@@ -0,0 +1,125 @@
name: Bug report
description: File a bug report. If you need help, please join our Discord server.
labels: [needs-triage, bug]
body:
- type: markdown
attributes:
value: |
Please check if your bug is [already filed](https://github.com/ollama/ollama/issues) before filing a new one.
- type: textarea
id: what-happened
attributes:
label: What is the issue?
description: What happened? What did you expect to happen?
validations:
required: true
- type: textarea
id: what-was-expected
attributes:
label: What did you expect to see?
description: What did you expect to see/happen instead?
validations:
required: true
- type: textarea
id: steps
attributes:
label: Steps to reproduce
description: What are the steps you took that hit this issue?
validations:
required: false
- type: textarea
id: changes
attributes:
label: Are there any recent changes that introduced the issue?
description: If so, what are those changes?
validations:
required: false
- type: dropdown
id: os
attributes:
label: OS
description: What OS are you using? You may select more than one.
multiple: true
options:
- Linux
- macOS
- Windows
- Other
validations:
required: false
- type: dropdown
id: architecture
attributes:
label: Architecture
description: What architecture are you using? You may select more than one.
multiple: true
options:
- arm64
- amd64
- x86
- Other
- type: dropdown
id: platform
attributes:
label: Platform
description: What platform are you using? You may select more than one.
multiple: true
options:
- Docker
- WSL
- WSL2
validations:
required: false
- type: input
id: ollama-version
attributes:
label: Ollama version
description: What Ollama version are you using? (`ollama --version`)
placeholder: e.g., 1.14.4
validations:
required: false
- type: dropdown
id: gpu
attributes:
label: GPU
description: What GPU, if any, are you using? You may select more than one.
multiple: true
options:
- Nvidia
- AMD
- Intel
- Apple
- Other
validations:
required: false
- type: textarea
id: gpu-info
attributes:
label: GPU info
description: What GPU info do you have? (`nvidia-smi`, `rocminfo`, `system_profiler SPDisplaysDataType`, etc.)
validations:
required: false
- type: dropdown
id: cpu
attributes:
label: CPU
description: What CPU are you using? You may select more than one.
multiple: true
options:
- Intel
- AMD
- Apple
- Other
validations:
required: false
- type: textarea
id: other-software
attributes:
label: Other software
description: What other software are you using that might be related to this issue?
validations:
required: false
- type: markdown
attributes:
value: |
Thanks for filing a bug report!

8
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View File

@@ -0,0 +1,8 @@
blank_issues_enabled: true
contact_links:
- name: Help
url: https://discord.com/invite/ollama
about: Please join our Discord server for help using Ollama
- name: Troubleshooting
url: https://github.com/ollama/ollama/blob/main/docs/faq.md#faq
about: See the FAQ for common issues and solutions

463
.github/workflows/release.yaml vendored Normal file
View File

@@ -0,0 +1,463 @@
name: release
on:
push:
tags:
- 'v*'
jobs:
# Full build of the Mac assets
build-darwin:
runs-on: macos-latest
environment: release
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: |
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
- name: key
env:
MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
run: |
echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12
security create-keychain -p password build.keychain
security default-keychain -s build.keychain
security unlock-keychain -p password build.keychain
security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign
security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- name: Build Darwin
env:
APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }}
APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
APPLE_ID: ${{ vars.APPLE_ID }}
run: |
./scripts/build_darwin.sh
- uses: actions/upload-artifact@v4
with:
name: dist-darwin
path: |
dist/*arwin*
!dist/*-cov
# Windows builds take a long time to both install the dependencies and build, so parallelize
# CPU generation step
generate-windows-cpu:
environment: release
runs-on: windows
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
go generate -x ./...
name: go generate
- uses: actions/upload-artifact@v4
with:
name: generate-windows-cpu
path: llm/llama.cpp/build/**/lib/*
# ROCm generation step
generate-windows-rocm:
environment: release
runs-on: windows
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- name: "Install ROCm"
run: |
$ErrorActionPreference = "Stop"
write-host "downloading AMD HIP Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP"
- name: "Verify ROCm"
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
go generate -x ./...
name: go generate
- name: "gather rocm dependencies"
run: |
$HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
md "dist\deps\bin\rocblas\library"
cp "${HIP_PATH}\bin\hipblas.dll" "dist\deps\bin\"
cp "${HIP_PATH}\bin\rocblas.dll" "dist\deps\bin\"
cp "${HIP_PATH}\bin\rocblas\library\*" "dist\deps\bin\rocblas\library\"
- uses: actions/upload-artifact@v4
with:
name: generate-windows-rocm
path: llm/llama.cpp/build/**/lib/*
- uses: actions/upload-artifact@v4
with:
name: windows-rocm-deps
path: dist/deps/*
# CUDA generation step
generate-windows-cuda:
environment: release
runs-on: windows
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
# TODO - consider replacing this action with a ps1 snippet to install
# This actions seems to fail sometimes with "no tools in cache" but a re-run of the failed job clears it
# https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
- name: "Install CUDA"
uses: Jimver/cuda-toolkit@v0.2.14
id: cuda-toolkit
with:
cuda: '11.3.1'
- name: "Verify CUDA"
run: nvcc -V
- run: go get ./...
- name: go generate
run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
go generate -x ./...
- name: "gather cuda dependencies"
run: |
$NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
md "dist\deps"
cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\"
cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\"
cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
- uses: actions/upload-artifact@v4
with:
name: generate-windows-cuda
path: llm/llama.cpp/build/**/lib/*
- uses: actions/upload-artifact@v4
with:
name: windows-cuda-deps
path: dist/deps/*
# Import the prior generation steps and build the final windows assets
build-windows:
environment: release
runs-on: windows
needs:
- generate-windows-cuda
- generate-windows-rocm
- generate-windows-cpu
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- run: go get
- uses: actions/download-artifact@v4
with:
name: generate-windows-cpu
path: llm/llama.cpp/build
- uses: actions/download-artifact@v4
with:
name: generate-windows-cuda
path: llm/llama.cpp/build
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps
path: dist/deps
- uses: actions/download-artifact@v4
with:
name: windows-rocm-deps
path: dist/deps
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
path: llm/llama.cpp/build
- run: dir llm/llama.cpp/build
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_GENERATE="1"
$env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
$env:HIP_PATH=$(resolve-path ".\dist\deps")
& .\scripts\build_windows.ps1
- uses: actions/upload-artifact@v4
with:
name: dist-windows
path: dist/*.exe
# Linux x86 assets built using the container based build
build-linux-amd64:
environment: release
runs-on: linux
env:
OLLAMA_SKIP_MANIFEST_CREATE: "1"
BUILD_ARCH: amd64
PUSH: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: |
./scripts/build_linux.sh
./scripts/build_docker.sh
mv dist/deps/* dist/
- uses: actions/upload-artifact@v4
with:
name: dist-linux-amd64
path: |
dist/*linux*
!dist/*-cov
# Linux ARM assets built using the container based build
# (at present, docker isn't pre-installed on arm ubunutu images)
build-linux-arm64:
environment: release
runs-on: linux-arm64
env:
OLLAMA_SKIP_MANIFEST_CREATE: "1"
BUILD_ARCH: arm64
PUSH: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: "Install Docker"
run: |
# Add Docker's official GPG key:
env
uname -a
sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
# Add the repository to Apt sources:
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: |
./scripts/build_linux.sh
./scripts/build_docker.sh
- uses: actions/upload-artifact@v4
with:
name: dist-linux-arm64
path: |
dist/*linux*
!dist/*-cov
# Aggregate all the assets and ship a release
release:
needs:
- build-darwin
- build-windows
- build-linux-amd64
- build-linux-arm64
runs-on: linux
environment: release
permissions:
contents: write
env:
OLLAMA_SKIP_IMAGE_BUILD: "1"
PUSH: "1"
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: |
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: ./scripts/build_docker.sh
- name: Retrieve built artifact
uses: actions/download-artifact@v4
with:
path: dist
pattern: dist-*
merge-multiple: true
- run: |
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
cat dist/sha256sum.txt
- uses: ncipollo/release-action@v1
with:
name: ${{ env.RELEASE_VERSION }}
allowUpdates: true
artifacts: "dist/*"
draft: true
prerelease: true
omitBodyDuringUpdate: true
generateReleaseNotes: true
omitDraftDuringUpdate: true
omitPrereleaseDuringUpdate: true
replacesArtifacts: true

View File

@@ -2,17 +2,22 @@ name: test
on:
pull_request:
paths:
- '**/*'
- '!docs/**'
- '!examples/**'
- '!README.md'
jobs:
generate:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ubuntu-latest, macos-latest, windows-2019]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
- os: windows-2019
arch: arm64
runs-on: ${{ matrix.os }}
env:
@@ -21,10 +26,21 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: '1.21'
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }}
name: "Windows Go Generate"
- run: go generate -x ./...
if: ${{ ! startsWith(matrix.os, 'windows-') }}
name: "Unix Go Generate"
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
@@ -46,7 +62,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.21'
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
@@ -62,7 +78,6 @@ jobs:
strategy:
matrix:
rocm-version:
- '5.7.1'
- '6.0'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
@@ -76,7 +91,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.21'
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
@@ -91,26 +106,26 @@ jobs:
lint:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ubuntu-latest, macos-latest, windows-2019]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
- os: windows-2019
arch: arm64
- os: macos-latest
arch: amd64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: "1"
CGO_ENABLED: '1'
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: '1.21'
go-version: '1.22'
cache: false
- run: |
mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
@@ -130,24 +145,24 @@ jobs:
needs: generate
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ubuntu-latest, macos-latest, windows-2019]
arch: [amd64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
- os: windows-2019
arch: arm64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: "1"
CGO_ENABLED: '1'
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: '1.21'
go-version: '1.22'
cache: true
- run: go get
- uses: actions/download-artifact@v4

View File

@@ -24,4 +24,4 @@ linters-settings:
- (*os.File).Seek
- (*bufio.Writer).WriteString
- (*github.com/spf13/pflag.FlagSet).Set
- (*github.com/jmorganca/ollama/llm.readSeekOffset).Seek
- (*github.com/ollama/ollama/llm.readSeekOffset).Seek

View File

@@ -1,6 +1,8 @@
ARG GOLANG_VERSION=1.21.3
ARG GOLANG_VERSION=1.22.1
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
ARG ROCM_VERSION=6.0
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
@@ -13,8 +15,8 @@ ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
@@ -23,34 +25,30 @@ ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN mkdir /tmp/scratch && \
for dep in $(cat /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
cp ${dep} /tmp/scratch/ || exit 1 ; \
done && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
@@ -58,10 +56,10 @@ ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
@@ -76,8 +74,8 @@ ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
@@ -86,40 +84,41 @@ RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/jmorganca/ollama
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build .
RUN go build -trimpath .
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
WORKDIR /go/src/github.com/jmorganca/ollama
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build .
RUN go build -trimpath .
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0

View File

@@ -1,5 +1,5 @@
<div align="center">
<img alt="ollama" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</div>
# Ollama
@@ -22,7 +22,7 @@ Get up and running with large language models locally.
curl -fsSL https://ollama.com/install.sh | sh
```
[Manual install instructions](https://github.com/jmorganca/ollama/blob/main/docs/linux.md)
[Manual install instructions](https://github.com/ollama/ollama/blob/main/docs/linux.md)
### Docker
@@ -213,7 +213,7 @@ Then build the binary:
go build .
```
More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
More detailed instructions can be found in the [developer guide](https://github.com/ollama/ollama/blob/main/docs/development.md)
### Running local builds
@@ -259,9 +259,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Web & Desktop
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
@@ -272,11 +274,21 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
- [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
- [Dify.AI](https://github.com/langgenius/dify)
- [MindMac](https://mindmac.app)
- [NextJS Web Interface for Ollama](https://github.com/jakobhoeg/nextjs-ollama-llm-ui)
- [Msty](https://msty.app)
- [Chatbox](https://github.com/Bin-Huang/Chatbox)
- [WinForm Ollama Copilot](https://github.com/tgraupmann/WinForm_Ollama_Copilot)
- [NextChat](https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web) with [Get Started Doc](https://docs.nextchat.dev/models/ollama)
- [Alpaca WebUI](https://github.com/mmo80/alpaca-webui)
- [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
- [OpenAOE](https://github.com/InternLM/OpenAOE)
- [Odin Runes](https://github.com/leonid20000/OdinRunes)
- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
### Terminal
@@ -285,14 +297,18 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Emacs client](https://github.com/zweifisch/ollama)
- [gen.nvim](https://github.com/David-Kunz/gen.nvim)
- [ollama.nvim](https://github.com/nomnivore/ollama.nvim)
- [ollero.nvim](https://github.com/marco-souza/ollero.nvim)
- [ollama-chat.nvim](https://github.com/gerazov/ollama-chat.nvim)
- [ogpt.nvim](https://github.com/huynle/ogpt.nvim)
- [gptel Emacs client](https://github.com/karthink/gptel)
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
- [cmdh](https://github.com/pgibler/cmdh)
- [ooo](https://github.com/npahlfer/ooo)
- [tenere](https://github.com/pythops/tenere)
- [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
- [typechat-cli](https://github.com/anaisbetts/typechat-cli)
- [ShellOracle](https://github.com/djcopley/ShellOracle)
- [tlm](https://github.com/yusufcanb/tlm)
### Database
@@ -309,7 +325,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LangChain4j](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-ollama)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
@@ -326,6 +341,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
- [Testcontainers](https://testcontainers.com/modules/ollama/)
### Mobile
@@ -339,6 +355,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Continue](https://github.com/continuedev/continue)
- [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
- [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
- [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin)
- [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
@@ -346,9 +363,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)

View File

@@ -15,8 +15,8 @@ import (
"runtime"
"strings"
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/version"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/version"
)
type Client struct {

50
api/types_test.go Normal file
View File

@@ -0,0 +1,50 @@
package api
import (
"encoding/json"
"math"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestKeepAliveParsingFromJSON(t *testing.T) {
tests := []struct {
name string
req string
exp *Duration
}{
{
name: "Positive Integer",
req: `{ "keep_alive": 42 }`,
exp: &Duration{42 * time.Second},
},
{
name: "Positive Integer String",
req: `{ "keep_alive": "42m" }`,
exp: &Duration{42 * time.Minute},
},
{
name: "Negative Integer",
req: `{ "keep_alive": -1 }`,
exp: &Duration{math.MaxInt64},
},
{
name: "Negative Integer String",
req: `{ "keep_alive": "-1m" }`,
exp: &Duration{math.MaxInt64},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var dec ChatRequest
err := json.Unmarshal([]byte(test.req), &dec)
require.NoError(t, err)
assert.Equal(t, test.exp, dec.KeepAlive)
})
}
}

View File

@@ -9,8 +9,8 @@ import (
"os/signal"
"syscall"
"github.com/jmorganca/ollama/app/store"
"github.com/jmorganca/ollama/app/tray"
"github.com/ollama/ollama/app/store"
"github.com/ollama/ollama/app/tray"
)
func Run() {

View File

@@ -11,7 +11,7 @@ import (
"path/filepath"
"time"
"github.com/jmorganca/ollama/api"
"github.com/ollama/ollama/api"
)
func getCLIFullPath(command string) string {

View File

@@ -18,8 +18,8 @@ import (
"strings"
"time"
"github.com/jmorganca/ollama/auth"
"github.com/jmorganca/ollama/version"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/version"
)
var (

View File

@@ -4,7 +4,7 @@ package main
// go build -ldflags="-H windowsgui" .
import (
"github.com/jmorganca/ollama/app/lifecycle"
"github.com/ollama/ollama/app/lifecycle"
)
func main() {

View File

@@ -28,8 +28,8 @@ AppPublisher={#MyAppPublisher}
AppPublisherURL={#MyAppURL}
AppSupportURL={#MyAppURL}
AppUpdatesURL={#MyAppURL}
ArchitecturesAllowed=x64
ArchitecturesInstallIn64BitMode=x64
ArchitecturesAllowed=x64 arm64
ArchitecturesInstallIn64BitMode=x64 arm64
DefaultDirName={localappdata}\Programs\{#MyAppName}
DefaultGroupName={#MyAppName}
DisableProgramGroupPage=yes
@@ -91,6 +91,14 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windeps\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
; Assumes v5.7, may need adjustments for v6
#if GetEnv("HIP_PATH") != ""
Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
; amdhip64.dll dependency comes from the driver and must be installed already
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
#endif
[Icons]
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"

View File

@@ -4,8 +4,8 @@ import (
"fmt"
"runtime"
"github.com/jmorganca/ollama/app/assets"
"github.com/jmorganca/ollama/app/tray/commontray"
"github.com/ollama/ollama/app/assets"
"github.com/ollama/ollama/app/tray/commontray"
)
func NewTray() (commontray.OllamaTray, error) {

View File

@@ -5,7 +5,7 @@ package tray
import (
"fmt"
"github.com/jmorganca/ollama/app/tray/commontray"
"github.com/ollama/ollama/app/tray/commontray"
)
func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {

View File

@@ -1,8 +1,8 @@
package tray
import (
"github.com/jmorganca/ollama/app/tray/commontray"
"github.com/jmorganca/ollama/app/tray/wintray"
"github.com/ollama/ollama/app/tray/commontray"
"github.com/ollama/ollama/app/tray/wintray"
)
func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {

View File

@@ -13,7 +13,7 @@ import (
"sync"
"unsafe"
"github.com/jmorganca/ollama/app/tray/commontray"
"github.com/ollama/ollama/app/tray/commontray"
"golang.org/x/sys/windows"
)

View File

@@ -1,6 +1,7 @@
package cmd
import (
"archive/zip"
"bytes"
"context"
"crypto/ed25519"
@@ -29,12 +30,12 @@ import (
"golang.org/x/exp/slices"
"golang.org/x/term"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/server"
"github.com/jmorganca/ollama/version"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/version"
)
func CreateHandler(cmd *cobra.Command, args []string) error {
@@ -87,22 +88,82 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
path = filepath.Join(filepath.Dir(filename), path)
}
bin, err := os.Open(path)
fi, err := os.Stat(path)
if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
continue
} else if err != nil {
return err
}
defer bin.Close()
hash := sha256.New()
if _, err := io.Copy(hash, bin); err != nil {
return err
// TODO make this work w/ adapters
if fi.IsDir() {
tf, err := os.CreateTemp("", "ollama-tf")
if err != nil {
return err
}
defer os.RemoveAll(tf.Name())
zf := zip.NewWriter(tf)
files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors"))
if err != nil {
return err
}
if len(files) == 0 {
return fmt.Errorf("no safetensors files were found in '%s'", path)
}
// add the safetensor config file + tokenizer
files = append(files, filepath.Join(path, "config.json"))
files = append(files, filepath.Join(path, "added_tokens.json"))
files = append(files, filepath.Join(path, "tokenizer.model"))
for _, fn := range files {
f, err := os.Open(fn)
if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") {
continue
} else if err != nil {
return err
}
fi, err := f.Stat()
if err != nil {
return err
}
h, err := zip.FileInfoHeader(fi)
if err != nil {
return err
}
h.Name = filepath.Base(fn)
h.Method = zip.Store
w, err := zf.CreateHeader(h)
if err != nil {
return err
}
_, err = io.Copy(w, f)
if err != nil {
return err
}
}
if err := zf.Close(); err != nil {
return err
}
if err := tf.Close(); err != nil {
return err
}
path = tf.Name()
}
bin.Seek(0, io.SeekStart)
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
digest, err := createBlob(cmd, client, path)
if err != nil {
return err
}
@@ -141,6 +202,26 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return nil
}
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
bin, err := os.Open(path)
if err != nil {
return "", err
}
defer bin.Close()
hash := sha256.New()
if _, err := io.Copy(hash, bin); err != nil {
return "", err
}
bin.Seek(0, io.SeekStart)
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
return "", err
}
return digest, nil
}
func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -806,13 +887,20 @@ func versionHandler(cmd *cobra.Command, _ []string) {
}
}
func appendHostEnvDocs(cmd *cobra.Command) {
const hostEnvDocs = `
Environment Variables:
OLLAMA_HOST The host:port or base URL of the Ollama server (e.g. http://localhost:11434)
`
cmd.SetUsageTemplate(cmd.UsageTemplate() + hostEnvDocs)
}
func NewCLI() *cobra.Command {
log.SetFlags(log.LstdFlags | log.Lshortfile)
cobra.EnableCommandSorting = false
if runtime.GOOS == "windows" {
// Enable colorful ANSI escape code in Windows terminal (disabled by default)
console.ConsoleFromFile(os.Stdout) //nolint:errcheck
console.ConsoleFromFile(os.Stdin) //nolint:errcheck
}
rootCmd := &cobra.Command{
@@ -871,7 +959,6 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
runCmd.Flags().String("format", "", "Response format (e.g. json)")
serveCmd := &cobra.Command{
Use: "serve",
Aliases: []string{"start"},
@@ -879,6 +966,14 @@ func NewCLI() *cobra.Command {
Args: cobra.ExactArgs(0),
RunE: RunServer,
}
serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
Environment Variables:
OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434")
OLLAMA_ORIGINS A comma separated list of allowed origins.
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models")
OLLAMA_KEEP_ALIVE The duration that models stay loaded in memory (default is "5m")
`)
pullCmd := &cobra.Command{
Use: "pull MODEL",
@@ -907,7 +1002,6 @@ func NewCLI() *cobra.Command {
PreRunE: checkServerHeartbeat,
RunE: ListHandler,
}
copyCmd := &cobra.Command{
Use: "cp SOURCE TARGET",
Short: "Copy a model",
@@ -924,6 +1018,19 @@ func NewCLI() *cobra.Command {
RunE: DeleteHandler,
}
for _, cmd := range []*cobra.Command{
createCmd,
showCmd,
runCmd,
pullCmd,
pushCmd,
listCmd,
copyCmd,
deleteCmd,
} {
appendHostEnvDocs(cmd)
}
rootCmd.AddCommand(
serveCmd,
createCmd,

View File

@@ -14,9 +14,9 @@ import (
"github.com/spf13/cobra"
"golang.org/x/exp/slices"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/readline"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/readline"
)
type MultilineState int

View File

@@ -7,7 +7,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/jmorganca/ollama/api"
"github.com/ollama/ollama/api"
)
func TestExtractFilenames(t *testing.T) {

View File

@@ -7,7 +7,7 @@ import (
"os/exec"
"strings"
"github.com/jmorganca/ollama/api"
"github.com/ollama/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {

View File

@@ -6,7 +6,7 @@ import (
"context"
"fmt"
"github.com/jmorganca/ollama/api"
"github.com/ollama/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {

View File

@@ -10,7 +10,7 @@ import (
"strings"
"syscall"
"github.com/jmorganca/ollama/api"
"github.com/ollama/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {

331
convert/convert.go Normal file
View File

@@ -0,0 +1,331 @@
package convert
import (
"bytes"
"cmp"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"regexp"
"slices"
"github.com/mitchellh/mapstructure"
"google.golang.org/protobuf/proto"
"github.com/ollama/ollama/convert/sentencepiece"
"github.com/ollama/ollama/llm"
)
type Params struct {
Architectures []string `json:"architectures"`
VocabSize int `json:"vocab_size"`
HiddenSize int `json:"hidden_size"` // n_embd
HiddenLayers int `json:"num_hidden_layers"` // n_layer
ContextSize int `json:"max_position_embeddings"`
IntermediateSize int `json:"intermediate_size"`
AttentionHeads int `json:"num_attention_heads"` // n_head
KeyValHeads int `json:"num_key_value_heads"`
NormEPS float64 `json:"rms_norm_eps"`
RopeFreqBase float64 `json:"rope_theta"`
BoSTokenID int `json:"bos_token_id"`
EoSTokenID int `json:"eos_token_id"`
}
type MetaData struct {
Type string `mapstructure:"dtype"`
Shape []int `mapstructure:"shape"`
Offsets []int `mapstructure:"data_offsets"`
}
func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
f, err := os.Open(fn)
if err != nil {
return []llm.Tensor{}, 0, err
}
defer f.Close()
var jsonSize uint64
binary.Read(f, binary.LittleEndian, &jsonSize)
buf := make([]byte, jsonSize)
_, err = io.ReadFull(f, buf)
if err != nil {
return []llm.Tensor{}, 0, err
}
d := json.NewDecoder(bytes.NewBuffer(buf))
d.UseNumber()
var parsed map[string]interface{}
if err = d.Decode(&parsed); err != nil {
return []llm.Tensor{}, 0, err
}
var keys []string
for k := range parsed {
keys = append(keys, k)
}
slices.Sort(keys)
slog.Info("converting layers")
var tensors []llm.Tensor
for _, k := range keys {
vals := parsed[k].(map[string]interface{})
var data MetaData
if err = mapstructure.Decode(vals, &data); err != nil {
return []llm.Tensor{}, 0, err
}
var size uint64
var kind uint32
switch len(data.Shape) {
case 0:
// metadata
continue
case 1:
// convert to float32
kind = 0
size = uint64(data.Shape[0] * 4)
case 2:
// convert to float16
kind = 1
size = uint64(data.Shape[0] * data.Shape[1] * 2)
}
ggufName, err := GetTensorName(k)
if err != nil {
slog.Error("%v", err)
return []llm.Tensor{}, 0, err
}
shape := []uint64{0, 0, 0, 0}
for i := range data.Shape {
shape[i] = uint64(data.Shape[i])
}
t := llm.Tensor{
Name: ggufName,
Kind: kind,
Offset: offset,
Shape: shape[:],
FileName: fn,
OffsetPadding: 8 + jsonSize,
FileOffsets: []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},
}
slog.Debug(fmt.Sprintf("%v", t))
tensors = append(tensors, t)
offset += size
}
return tensors, offset, nil
}
func GetSafeTensors(dirpath string) ([]llm.Tensor, error) {
var tensors []llm.Tensor
files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
if err != nil {
return []llm.Tensor{}, err
}
var offset uint64
for _, f := range files {
var t []llm.Tensor
var err error
t, offset, err = ReadSafeTensors(f, offset)
if err != nil {
slog.Error("%v", err)
return []llm.Tensor{}, err
}
tensors = append(tensors, t...)
}
return tensors, nil
}
func GetParams(dirpath string) (*Params, error) {
f, err := os.Open(filepath.Join(dirpath, "config.json"))
if err != nil {
return nil, err
}
defer f.Close()
var params Params
d := json.NewDecoder(f)
err = d.Decode(&params)
if err != nil {
return nil, err
}
return &params, nil
}
// Details on gguf's tokenizer can be found at:
// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#tokenizer
type Vocab struct {
Tokens []string
Scores []float32
Types []int32
}
func LoadTokens(dirpath string) (*Vocab, error) {
slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model")))
in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model"))
if err != nil {
return nil, err
}
// To regenerate sentencepiece from the protobufs use:
// protoc -I=./ --go_out=./ sentencepiece_model.proto
modelProto := &sentencepiece.ModelProto{}
if err := proto.Unmarshal(in, modelProto); err != nil {
return nil, err
}
v := &Vocab{
Tokens: make([]string, 0),
Scores: make([]float32, 0),
Types: make([]int32, 0),
}
pieces := modelProto.GetPieces()
for _, p := range pieces {
v.Tokens = append(v.Tokens, p.GetPiece())
v.Scores = append(v.Scores, p.GetScore())
t := p.GetType()
v.Types = append(v.Types, int32(t))
}
slog.Info(fmt.Sprintf("vocab size: %d", len(v.Tokens)))
// add any additional tokens
addIn, err := os.ReadFile(filepath.Join(dirpath, "added_tokens.json"))
if os.IsNotExist(err) {
return v, nil
} else if err != nil {
return nil, err
}
slog.Info("reading user defined tokens")
var extraTokenData map[string]int
if err := json.Unmarshal(addIn, &extraTokenData); err != nil {
return nil, err
}
type token struct {
key string
pos int
}
extraTokens := make([]token, 0)
for k, id := range extraTokenData {
extraTokens = append(extraTokens, token{k, id})
}
slices.SortFunc(extraTokens, func(a, b token) int {
return cmp.Compare(a.pos, b.pos)
})
numToks := len(v.Tokens)
for cnt, t := range extraTokens {
// the token id should match the specific index for the total number of tokens
if t.pos != cnt+numToks {
return nil, fmt.Errorf("token ID '%d' for '%s' doesn't match total token size", t.pos, t.key)
}
v.Tokens = append(v.Tokens, t.key)
v.Scores = append(v.Scores, -1000.0)
v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
}
slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
return v, nil
}
func GetTensorName(n string) (string, error) {
tMap := map[string]string{
"model.embed_tokens.weight": "token_embd.weight",
"model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight",
"model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight",
"model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight",
"model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight",
"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
"model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight",
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
"lm_head.weight": "output.weight",
"model.norm.weight": "output_norm.weight",
}
v, ok := tMap[n]
if ok {
return v, nil
}
// quick hack to rename the layers to gguf format
for k, v := range tMap {
re := regexp.MustCompile(k)
newName := re.ReplaceAllString(n, v)
if newName != n {
return newName, nil
}
}
return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
}
func WriteGGUF(name string, tensors []llm.Tensor, params *Params, vocab *Vocab) (string, error) {
c := llm.ContainerGGUF{
ByteOrder: binary.LittleEndian,
}
m := llm.NewGGUFModel(&c)
m.Tensors = tensors
m.KV["general.architecture"] = "llama"
m.KV["general.name"] = name
m.KV["llama.context_length"] = uint32(params.ContextSize)
m.KV["llama.embedding_length"] = uint32(params.HiddenSize)
m.KV["llama.block_count"] = uint32(params.HiddenLayers)
m.KV["llama.feed_forward_length"] = uint32(params.IntermediateSize)
m.KV["llama.rope.dimension_count"] = uint32(128)
m.KV["llama.attention.head_count"] = uint32(params.AttentionHeads)
m.KV["llama.attention.head_count_kv"] = uint32(params.KeyValHeads)
m.KV["llama.attention.layer_norm_rms_epsilon"] = float32(params.NormEPS)
m.KV["llama.rope.freq_base"] = float32(params.RopeFreqBase)
m.KV["general.file_type"] = uint32(1)
m.KV["tokenizer.ggml.model"] = "llama"
m.KV["tokenizer.ggml.tokens"] = vocab.Tokens
m.KV["tokenizer.ggml.scores"] = vocab.Scores
m.KV["tokenizer.ggml.token_type"] = vocab.Types
m.KV["tokenizer.ggml.bos_token_id"] = uint32(params.BoSTokenID)
m.KV["tokenizer.ggml.eos_token_id"] = uint32(params.EoSTokenID)
m.KV["tokenizer.ggml.unknown_token_id"] = uint32(0)
m.KV["tokenizer.ggml.add_bos_token"] = true
m.KV["tokenizer.ggml.add_eos_token"] = false
// llamacpp sets the chat template, however we don't need to set it since we pass it in through a layer
// m.KV["tokenizer.chat_template"] = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" // XXX removeme
c.V3.NumTensor = uint64(len(tensors))
c.V3.NumKV = uint64(len(m.KV))
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
err = m.Encode(f)
if err != nil {
return "", err
}
return f.Name(), nil
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,333 @@
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!
syntax = "proto2";
// TODO(taku): Needs to use LITE RUNTIME in OSS release.
option optimize_for = LITE_RUNTIME;
option go_package = "./sentencepiece";
package sentencepiece;
// TrainerSpec encodes a various parameters for SentencePiece training.
// Next id: 55
message TrainerSpec {
///////////////////////////////////////////////////////////////////
// General parameters
//
// Input corpus files.
// Trainer accepts the following two formats:
// A) Monolingual: plain text, one sentence per line.
// B) Bilingual: TSV, source sentence <tab> target sentence
// When bilingual data is passed, shared vocabulary model is built.
// Note that the input file must be raw corpus, not a preprocessed corpus.
// Trainer only loads the first `input_sentence_size` sentences specified
// with this parameter.
repeated string input = 1;
// Input corpus format:
// "text": one-sentence-per-line text format (default)
// "tsv": sentence <tab> freq
optional string input_format = 7;
// Output model file prefix.
// <model_prefix>.model and <model_prefix>.vocab are generated.
optional string model_prefix = 2;
// Model type. only have UNIGRAM now.
enum ModelType {
UNIGRAM = 1; // Unigram language model with dynamic algorithm
BPE = 2; // Byte Pair Encoding
WORD = 3; // Delimitered by whitespace.
CHAR = 4; // tokenizes into character sequence
}
optional ModelType model_type = 3 [default = UNIGRAM];
// Vocabulary size. 8k is the default size.
optional int32 vocab_size = 4 [default = 8000];
// List of the languages this model can accept.
// Since the model is language-agnostic, this field is used as a reference.
repeated string accept_language = 5;
// Size of self-test samples, which are encoded in the model file.
optional int32 self_test_sample_size = 6 [default = 0];
// Whether to use DP version of sentencepiece. Use it with TSV input format
// (requires precomputed word tab counts to work).
optional bool enable_differential_privacy = 50 [default = false];
// Set these parameters if you need DP version of sentencepiece.
// std of noise to add.
optional float differential_privacy_noise_level = 51 [default = 0.0];
// Clipping threshold to apply after adding noise. All the words with
// frequency less than this value are dropped.
optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
///////////////////////////////////////////////////////////////////
// Training parameters.
//
// Uses characters which cover the corpus with the ratio of `chars_coverage`.
// This parameter determines the set of basic Alphabet of sentence piece.
// 1.0 - `chars_coverage` characters are treated as UNK.
// See also required_chars field.
optional float character_coverage = 10 [default = 0.9995];
// Maximum size of sentences the trainer loads from `input` parameter.
// Trainer simply loads the `input` files in sequence.
// It is better to shuffle the input corpus randomly.
optional uint64 input_sentence_size = 11 [default = 0];
optional bool shuffle_input_sentence = 19 [default = true];
// Maximum size of sentences to make seed sentence pieces.
// Extended suffix array is constructed to extract frequent
// sub-strings from the corpus. This uses 20N working space,
// where N is the size of corpus.
optional int32 mining_sentence_size = 12 [deprecated = true];
// Maximum size of sentences to train sentence pieces.
optional int32 training_sentence_size = 13 [deprecated = true];
// The size of seed sentencepieces.
// `seed_sentencepiece_size` must be larger than `vocab_size`.
optional int32 seed_sentencepiece_size = 14 [default = 1000000];
// In every EM sub-iterations, keeps top
// `shrinking_factor` * `current sentencepieces size` with respect to
// the loss of the sentence piece. This value should be smaller than 1.0.
optional float shrinking_factor = 15 [default = 0.75];
// The maximum sentence length in byte. The sentences with the length
// larger than `max_sentence_length` is simply ignored.
// Longer input tends to bring the following risks:
// * Overflow during EM training (unigram language model only)
// * Performance drop because of O(n log n) cost in BPE.
optional int32 max_sentence_length = 18 [default = 4192];
// Number of threads in the training.
optional int32 num_threads = 16 [default = 16];
// Number of EM sub iterations.
optional int32 num_sub_iterations = 17 [default = 2];
///////////////////////////////////////////////////////////////////
// SentencePiece parameters which control the shapes of sentence piece.
//
// Maximum length of sentencepiece.
optional int32 max_sentencepiece_length = 20 [default = 16];
// Uses Unicode script to split sentence pieces.
// When `split_by_unicode_script` is true, we do not allow sentence piece to
// include multiple Unicode scripts, e.g. "F1" is not a valid piece.
// Exception: CJ characters (Hiragana/Katakana/Han) are all handled
// as one script type, since Japanese word can consist of multiple scripts.
// This exception is always applied regardless of the accept-language
// parameter.
optional bool split_by_unicode_script = 21 [default = true];
// When `split_by_number` is true, put a boundary between number and
// non-number transition. If we want to treat "F1" is one token, set this flag
// to be false.
optional bool split_by_number = 23 [default = true];
// Use a white space to split sentence pieces.
// When `split_by_whitespace` is false, we may have the piece containing
// a white space in the middle. e.g., "in_the".
optional bool split_by_whitespace = 22 [default = true];
// Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
// hello_. When `treat_whitespace_as_suffix` is true,
// NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
// of sentence.
optional bool treat_whitespace_as_suffix = 24 [default = false];
// Allows pieces that only contain whitespaces instead of appearing only as
// prefix or suffix of other pieces.
optional bool allow_whitespace_only_pieces = 26 [default = false];
// Split all digits (0-9) into separate pieces.
optional bool split_digits = 25 [default = false];
// Defines the pre-tokenization delimiter.
// When specified, no pieces crossing this delimiter is not included
// in the vocab. Then the delimiter string is virtually ignored
// during the training. This field can allows constraints on the vocabulary
// selection. Note that this field is available on unigram mode.
optional string pretokenization_delimiter = 53 [ default = ""];
///////////////////////////////////////////////////////////////////
// Vocabulary management
//
// Defines control symbols used as an indicator to
// change the behavior of the decoder. <s> and </s> are pre-defined.
// We can use this field to encode various meta information,
// including language indicator in multilingual model.
// These symbols are not visible to users, but visible to
// the decoder. Note that when the input sentence contains control symbols,
// they are not treated as one token, but segmented into normal pieces.
// Control symbols must be inserted independently from the segmentation.
repeated string control_symbols = 30;
// Defines user defined symbols.
// These symbols are added with extremely high score
// so they are always treated as one unique symbol in any context.
// Typical usage of user_defined_symbols is placeholder for named entities.
repeated string user_defined_symbols = 31;
// Defines required characters. Each UTF8 character in this string is included
// in the character set regardless of character_coverage value. Unlike
// user_defined_symbols, these characters have scores based on the frequency
// on input sentences, and the model can form subwords using characters
// in this field.
optional string required_chars = 36;
// Decomposes unknown pieces into UTF-8 bytes.
optional bool byte_fallback = 35 [default = false];
// When creating the vocabulary file, defines whether or not to additionally
// output the score for each piece.
optional bool vocabulary_output_piece_score = 32 [default = true];
// `vocab_size` is treated as hard limit. Crash if
// the model can not produce the vocab of size `vocab_size`,
// When `hard_vocab_limit` is false, vocab_size is treated
// as soft limit. Note that when model_type=char,
// always assumes hard_vocab_limit = false.
optional bool hard_vocab_limit = 33 [default = true];
// use all symbols for vocab extraction. This flag is valid
// if model type is either CHAR or WORD
optional bool use_all_vocab = 34 [default = false];
///////////////////////////////////////////////////////////////////
// Reserved special meta tokens.
// * -1 is not used.
// * unk_id must not be -1.
// Id must starts with 0 and be contigous.
optional int32 unk_id = 40 [default = 0]; // <unk>
optional int32 bos_id = 41 [default = 1]; // <s>
optional int32 eos_id = 42 [default = 2]; // </s>
optional int32 pad_id = 43 [default = -1]; // <pad> (padding)
optional string unk_piece = 45 [default = "<unk>"];
optional string bos_piece = 46 [default = "<s>"];
optional string eos_piece = 47 [default = "</s>"];
optional string pad_piece = 48 [default = "<pad>"];
// Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
// since this character can be useful both for user and
// developer. We can easily figure out that <unk> is emitted.
optional string unk_surface = 44 [default = " \xE2\x81\x87 "];
// Increase bit depth to allow unigram model training on large
// (>10M sentences) corpora. A Side-effect of enabling this flag
// is increased memory usage.
optional bool train_extremely_large_corpus = 49 [default = false];
// Path to a seed sentencepieces file, with one tab-separated
// seed sentencepiece <tab> frequency per line.
optional string seed_sentencepieces_file = 54 [default = ""];
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}
// NormalizerSpec encodes a various parameters for string normalizaiton
message NormalizerSpec {
// name of normalization rule.
optional string name = 1;
// Pre-compiled normalization rule created by
// Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
// Usually this field is set by Builder::GetNormalizerSpec() method.
optional bytes precompiled_charsmap = 2;
// Adds dummy whitespace at the beginning of text in order to
// treat "world" in "world" and "hello world" in the same way.
optional bool add_dummy_prefix = 3 [default = true];
// Removes leading, trailing, and duplicate internal whitespace.
optional bool remove_extra_whitespaces = 4 [default = true];
// Replaces whitespace with meta symbol.
// This field must be true to train sentence piece model.
optional bool escape_whitespaces = 5 [default = true];
// Custom normalization rule file in TSV format.
// https://github.com/google/sentencepiece/blob/master/doc/normalization.md
// This field is only used in SentencePieceTrainer::Train() method, which
// compiles the rule into the binary rule stored in `precompiled_charsmap`.
optional string normalization_rule_tsv = 6;
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}
// Proto to store samples for self-testing.
message SelfTestData {
message Sample {
optional string input = 1;
optional string expected = 2;
}
repeated Sample samples = 1;
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}
// ModelProto stores model parameters.
// SentencePieceProcessor is supposed to be self-contained.
// All settings/parameters which may change the behavior must be encoded
// in ModelProto.
message ModelProto {
message SentencePiece {
enum Type {
NORMAL = 1; // normal symbol
UNKNOWN = 2; // unknown symbol. only <unk> for now.
CONTROL = 3; // control symbols. </s>, <s>, <2ja> etc.
USER_DEFINED = 4; // user defined symbols.
// Typical usage of USER_DEFINED symbol
// is placeholder.
BYTE = 6; // byte symbols. Used when `byte_fallback` is true.
UNUSED = 5; // this piece is not used.
}
optional string piece = 1; // piece must not be empty.
optional float score = 2;
optional Type type = 3 [default = NORMAL];
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}
// Sentence pieces with scores.
repeated SentencePiece pieces = 1;
// Spec used to generate this model file.
optional TrainerSpec trainer_spec = 2;
// Spec for text normalization.
optional NormalizerSpec normalizer_spec = 3;
// Stores sample input and its expected segmentation to verify the model.
optional SelfTestData self_test_data = 4;
// Spec for text de-normalization.
optional NormalizerSpec denormalizer_spec = 5;
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}

View File

@@ -1,25 +1,21 @@
# Documentation
To get started, see the project's **[quickstart](../README.md#quickstart)**.
### Getting Started
* [Quickstart](../README.md#quickstart)
* [Examples](../examples)
* [Importing models](./import.md)
* [Linux Documentation](./linux.md)
* [Windows Documentation](./windows.md)
* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
Ollama is a tool for running AI models on your hardware. Many users will choose to use the Command Line Interface (CLI) to work with Ollama. Learn more about all the commands in the CLI in the **[Main Readme](../README.md)**.
### Reference
Use the RESTful API using any language, including Python, JavaScript, Typescript, Go, Rust, and many more. Learn more about using the API in the **[API Documentation](./api.md)**.
* [API Reference](./api.md)
* [Modelfile Reference](./modelfile.md)
* [OpenAI Compatibility](./openai.md)
Create new models or modify models already in the library using the Modelfile. Learn more about the Modelfile syntax in the **[Modelfile Documentation](./modelfile.md)**.
### Resources
Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.
Installing on Linux in most cases is easy using the script on [ollama.com/download](ollama.com/download). To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.
If encountering a problem with Ollama, the best place to start is the logs. Find more information about them here in the **[Troubleshooting Guide](./troubleshooting.md)**.
Finally for all the questions that don't fit anywhere else, there is the **[FAQ](./faq.md)**
[Tutorials](./tutorials.md) apply the documentation to tasks.
For working code examples of using Ollama, see [Examples](../examples).
* [Troubleshooting Guide](./troubleshooting.md)
* [FAQ](./faq.md)
* [Development guide](./development.md)

View File

@@ -54,7 +54,7 @@ Advanced parameters (optional):
#### JSON mode
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#generate-request-json-mode) below.
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.
@@ -256,9 +256,9 @@ For reproducible outputs, set `temperature` to 0 and `seed` to a number:
```shell
curl http://localhost:11434/api/generate -d '{
"model": "mistral",
"prompt": "[INST] why is the sky blue? [/INST]",
"prompt": "Why is the sky blue?",
"options": {
"seed": 101,
"seed": 123,
"temperature": 0
}
}'
@@ -1024,7 +1024,7 @@ Advanced parameters:
```shell
curl http://localhost:11434/api/embeddings -d '{
"model": "llama2",
"model": "all-minilm",
"prompt": "Here is an article about llamas..."
}'
```

View File

@@ -3,7 +3,7 @@
Install required tools:
- cmake version 3.24 or higher
- go version 1.21 or higher
- go version 1.22 or higher
- gcc version 11.4.0 or higher
```bash
@@ -42,15 +42,15 @@ Now you can run `ollama`:
#### Linux CUDA (NVIDIA)
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
development and runtime packages.
development and runtime packages.
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
Then generate dependencies:
@@ -67,15 +67,15 @@ go build .
#### Linux ROCm (AMD)
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) developement packages first, as well as `cmake` and `golang`.
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
Typically the build scripts will auto-detect ROCm, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `ROCM_PATH` to the location of the ROCm
install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
```
@@ -88,17 +88,17 @@ Then build the binary:
go build .
```
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
#### Advanced CPU Settings
By default, running `go generate ./...` will compile a few different variations
of the LLM library based on common CPU families and vector math capabilities,
including a lowest-common-denominator which should run on almost any 64 bit CPU
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
load. If you would like to build a CPU-based build customized for your
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
load. If you would like to build a CPU-based build customized for your
processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
@@ -108,8 +108,7 @@ go build .
#### Containerized Linux Build
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
### Windows
@@ -118,21 +117,29 @@ Note: The windows build for Ollama is still under development.
Install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- go version 1.21 or higher
- Go version 1.22 or higher
- MinGW (pick one variant) with GCC.
- <https://www.mingw-w64.org/>
- <https://www.msys2.org/>
- [MinGW-w64](https://www.mingw-w64.org/)
- [MSYS2](https://www.msys2.org/)
```powershell
$env:CGO_ENABLED="1"
go generate ./...
go build .
```
#### Windows CUDA (NVIDIA)
In addition to the common Windows development tools described above, install:
In addition to the common Windows development tools described above, install CUDA after installing MSVC.
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
#### Windows ROCm (AMD Radeon)
In addition to the common Windows development tools described above, install AMDs HIP package after installing MSVC.
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
- [Strawberry Perl](https://strawberryperl.com/)
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).

View File

@@ -14,6 +14,10 @@ curl -fsSL https://ollama.com/install.sh | sh
Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
## Is my GPU compatible with Ollama?
Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
By default, Ollama uses a context window size of 2048 tokens.
@@ -95,6 +99,37 @@ Ollama binds 127.0.0.1 port 11434 by default. Change the bind address with the `
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## How can I use Ollama with a proxy server?
Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:
```
server {
listen 80;
server_name example.com; # Replace with your domain or IP
location / {
proxy_pass http://localhost:11434;
proxy_set_header Host localhost:11434;
}
}
```
## How can I use Ollama with ngrok?
Ollama can be accessed using a range of tools for tunneling tools. For example with Ngrok:
```
ngrok http 11434 --host-header="localhost:11434"
```
## How can I use Ollama with Cloudflare Tunnel?
To use Ollama with Cloudflare Tunnel, use the `--url` and `--http-host-header` flags:
```
cloudflared tunnel --url http://localhost:11434 --http-host-header="localhost:11434"
```
## How can I allow additional web origins to access Ollama?
Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.
@@ -119,7 +154,7 @@ No. Ollama runs locally, and conversation data does not leave your machine.
## How can I use Ollama in Visual Studio Code?
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
## How do I use Ollama behind a proxy?

102
docs/gpu.md Normal file
View File

@@ -0,0 +1,102 @@
# GPU
## Nvidia
Ollama supports Nvidia GPUs with compute capability 5.0+.
Check your compute compatibility to see if your card is supported:
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
| Compute Capability | Family | Cards |
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
| 9.0 | NVIDIA | `H100` |
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080` `RTX 4070 Ti` `RTX 4060 Ti` |
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
| 8.0 | NVIDIA | `A100` `A30` |
| 7.5 | GeForce GTX/RTX | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060` |
| | NVIDIA Professional | `T4` `RTX 5000` `RTX 4000` `RTX 3000` `T2000` `T1200` `T1000` `T600` `T500` |
| | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` |
| 7.0 | NVIDIA | `TITAN V` `V100` `Quadro GV100` |
| 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` |
| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050` |
| | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` |
| | Tesla | `P40` `P4` |
| 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` |
| 5.2 | GeForce GTX | `GTX TITAN X` `GTX 980 Ti` `GTX 980` `GTX 970` `GTX 960` `GTX 950` |
| | Quadro | `M6000 24GB` `M6000` `M5000` `M5500M` `M4000` `M2200` `M2000` `M620` |
| | Tesla | `M60` `M40` |
| 5.0 | GeForce GTX | `GTX 750 Ti` `GTX 750` `NVS 810` |
| | Quadro | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M` |
### GPU Selection
If you have multiple NVIDIA GPUs in your system and want to limit Ollama to use
a subset, you can set `CUDA_VISIBLE_DEVICES` to a comma separated list of GPUs.
Numeric IDs may be used, however ordering may vary, so UUIDs are more reliable.
You can discover the UUID of your GPUs by running `nvidia-smi -L` If you want to
ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1")
### Laptop Suspend Resume
On linux, after a suspend/resume cycle, sometimes Ollama will fail to discover
your NVIDIA GPU, and fallback to running on the CPU. You can workaround this
driver bug by reloading the NVIDIA UVM driver with `sudo rmmod nvidia_uvm &&
sudo modprobe nvidia_uvm`
## AMD Radeon
Ollama supports the following AMD GPUs:
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
### Overrides
Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
some cases you can force the system to try to use a similar LLVM target that is
close. For example The Radeon RX 5400 is `gfx1034` (also known as 10.3.4)
however, ROCm does not currently support this target. The closest support is
`gfx1030`. You can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with
`x.y.z` syntax. So for example, to force the system to run on the RX 5400, you
would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
server. If you have an unsupported AMD GPU you can experiment using the list of
supported types below.
At this time, the known supported GPU types are the following LLVM Targets.
This table shows some example GPUs that map to these LLVM targets:
| **LLVM Target** | **An Example GPU** |
|-----------------|---------------------|
| gfx900 | Radeon RX Vega 56 |
| gfx906 | Radeon Instinct MI50 |
| gfx908 | Radeon Instinct MI100 |
| gfx90a | Radeon Instinct MI210 |
| gfx940 | Radeon Instinct MI300 |
| gfx941 | |
| gfx942 | |
| gfx1030 | Radeon PRO V620 |
| gfx1100 | Radeon PRO W7900 |
| gfx1101 | Radeon PRO W7700 |
| gfx1102 | Radeon RX 7600 |
AMD is working on enhancing ROCm v6 to broaden support for families of GPUs in a
future release which should increase support for more GPUs.
Reach out on [Discord](https://discord.gg/ollama) or file an
[issue](https://github.com/ollama/ollama/issues) for additional help.
### GPU Selection
If you have multiple AMD GPUs in your system and want to limit Ollama to use a
subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs.
You can see the list of devices with `rocminfo`. If you want to ignore the GPUs
and force CPU usage, use an invalid GPU ID (e.g., "-1")
### Container Permission
In some Linux distributions, SELinux can prevent containers from
accessing the AMD GPU devices. On the host system you can run
`sudo setsebool container_use_devices=1` to allow containers to use devices.
### Metal (Apple GPUs)
Ollama supports GPU acceleration on Apple devices via the Metal API.

View File

@@ -10,6 +10,14 @@ Install Ollama running this one-liner:
curl -fsSL https://ollama.com/install.sh | sh
```
## AMD Radeon GPU support
While AMD has contributed the `amdgpu` driver upstream to the official linux
kernel source, the version is older and may not support all ROCm features. We
recommend you install the latest driver from
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
GPU.
## Manual install
### Download the `ollama` binary
@@ -64,6 +72,11 @@ Verify that the drivers are installed by running the following command, which sh
nvidia-smi
```
### Install ROCm (optional - for Radeon GPUs)
[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
Make sure to install ROCm v6
### Start Ollama
Start Ollama using `systemd`:

View File

@@ -71,12 +71,9 @@ More examples are available in the [examples directory](../examples).
There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
- Option 1: view a details page from a model's tags page:
1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
3. Scroll down to "Layers"
- Note: if the [`FROM` instruction](#from-required) is not present,
it means the model was created from a local file
- Option 1: view a model's data:
1. Go to a particular model page (e.g. https://ollama.com/library/llama2)
2. There is a table that displays the model's different components
- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
```bash
@@ -113,7 +110,7 @@ FROM llama2
```
A list of available base models:
<https://github.com/jmorganca/ollama#model-library>
<https://github.com/ollama/ollama#model-library>
#### Build from a `bin` file
@@ -131,7 +128,7 @@ The `PARAMETER` instruction defines a parameter that can be set when the model i
PARAMETER <parameter> <parametervalue>
```
### Valid Parameters and Values
#### Valid Parameters and Values
| Parameter | Description | Value Type | Example Usage |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
@@ -183,7 +180,7 @@ SYSTEM """<system message>"""
### ADAPTER
The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
```modelfile
ADAPTER ./ollama-lora.bin
@@ -201,7 +198,21 @@ LICENSE """
### MESSAGE
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
```modelfile
MESSAGE <role> <message>
```
#### Valid roles
| Role | Description |
| --------- | ------------------------------------------------------------ |
| system | Alternate way of providing the SYSTEM message for the model. |
| user | An example message of what the user could have asked. |
| assistant | An example message of how the model should respond. |
#### Example conversation
```modelfile
MESSAGE user Is Toronto in Canada?

View File

@@ -1,6 +1,6 @@
# OpenAI compatibility
> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/jmorganca/ollama/blob/main/docs/api.md).
> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md).
Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.

View File

@@ -67,6 +67,12 @@ You can see what features your CPU has with the following.
cat /proc/cpuinfo| grep flags | head -1
```
## Known issues
## Installing older or pre-release versions on Linux
* N/A
If you run into problems on Linux and want to install an older version, or you'd
like to try out a pre-release before it's officially released, you can tell the
install script which version to install.
```sh
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
```

View File

@@ -4,7 +4,7 @@ Welcome to the Ollama Windows preview.
No more WSL required!
Ollama now runs as a native Windows application, including NVIDIA GPU support.
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
@@ -21,6 +21,7 @@ Logs will often be helpful in dianosing the problem (see
* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
## API Access

View File

@@ -1,6 +1,6 @@
# PrivateGPT with Llama 2 uncensored
https://github.com/jmorganca/ollama/assets/3325447/20cf8ec6-ff25-42c6-bdd8-9be594e3ce1b
https://github.com/ollama/ollama/assets/3325447/20cf8ec6-ff25-42c6-bdd8-9be594e3ce1b
> Note: this example is a slightly modified version of PrivateGPT using models such as Llama 2 Uncensored. All credit for PrivateGPT goes to Iván Martínez who is the creator of it, and you can find his GitHub repo [here](https://github.com/imartinez/privateGPT).

View File

@@ -28,7 +28,7 @@ You are Mario from Super Mario Bros, acting as an assistant.
What if you want to change its behaviour?
- Try changing the prompt
- Try changing the parameters [Docs](https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md)
- Try changing the parameters [Docs](https://github.com/ollama/ollama/blob/main/docs/modelfile.md)
- Try changing the model (e.g. An uncensored model by `FROM wizard-vicuna` this is the wizard-vicuna uncensored model )
Once the changes are made,

View File

@@ -1,23 +0,0 @@
# Example Modelfile - Tweetwriter
This simple examples shows what you can do without any code, simply relying on a Modelfile. The file has two instructions:
1. FROM - The From instructions defines the parent model to use for this one. If you choose a model from the library, you can enter just the model name. For all other models, you need to specify the namespace as well. You could also use a local file. Just include the relative path to the converted, quantized model weights file. To learn more about creating that file, see the `import.md` file in the docs folder of this repository.
2. SYSTEM - This defines the system prompt for the model and overrides the system prompt from the parent model.
## Running the Example
1. Create the model:
```bash
ollama create tweetwriter
```
2. Enter a topic to generate a tweet about.
3. Show the Modelfile in the REPL.
```bash
/show modelfile
```
Notice that the FROM and SYSTEM match what was in the file. But there is also a TEMPLATE and PARAMETER. These are inherited from the parent model.

View File

@@ -1,21 +0,0 @@
# Ollama Chat App
Build a Llama2 chat app using Streamlit and Ollama.
## Running the Example
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama2
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python main.py
```

View File

@@ -1,6 +1,6 @@
# JSON Output Example
![llmjson 2023-11-10 15_31_31](https://github.com/jmorganca/ollama/assets/633681/e599d986-9b4a-4118-81a4-4cfe7e22da25)
![llmjson 2023-11-10 15_31_31](https://github.com/ollama/ollama/assets/633681/e599d986-9b4a-4118-81a4-4cfe7e22da25)
There are two python scripts in this example. `randomaddresses.py` generates random addresses from different countries. `predefinedschema.py` sets a template for the model to fill in.

View File

@@ -1,6 +1,6 @@
# Log Analysis example
![loganalyzer 2023-11-10 08_53_29](https://github.com/jmorganca/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)
![loganalyzer 2023-11-10 08_53_29](https://github.com/ollama/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)
This example shows one possible way to create a log file analyzer. It uses the model **mattw/loganalyzer** which is based on **codebooga**, a 34b parameter model.

View File

@@ -1,6 +1,6 @@
# Function calling
![function calling 2023-11-16 16_12_58](https://github.com/jmorganca/ollama/assets/633681/a0acc247-9746-45ab-b325-b65dfbbee4fb)
![function calling 2023-11-16 16_12_58](https://github.com/ollama/ollama/assets/633681/a0acc247-9746-45ab-b325-b65dfbbee4fb)
One of the features added to some models is 'function calling'. It's a bit of a confusing name. It's understandable if you think that means the model can call functions, but that's not what it means. Function calling simply means that the output of the model is formatted in JSON, using a preconfigured schema, and uses the expected types. Then your code can use the output of the model and call functions with it. Using the JSON format in Ollama, you can use any model for function calling.

27
go.mod
View File

@@ -1,23 +1,43 @@
module github.com/jmorganca/ollama
module github.com/ollama/ollama
go 1.21
go 1.22
toolchain go1.22.0
require (
github.com/containerd/console v1.0.3
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/emirpasic/gods v1.18.1
github.com/gin-gonic/gin v1.9.1
github.com/golang/protobuf v1.5.0
github.com/google/uuid v1.0.0
github.com/mitchellh/mapstructure v1.5.0
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.8.4
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.3.0
)
require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
require (
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
github.com/chewxy/hm v1.0.0 // indirect
github.com/chewxy/math32 v1.0.8 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/flatbuffers v1.12.0 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/xtgo/set v1.0.0 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gonum.org/v1/gonum v0.8.2 // indirect
gorgonia.org/vecf32 v0.9.0 // indirect
gorgonia.org/vecf64 v0.9.0 // indirect
)
require (
@@ -38,7 +58,6 @@ require (
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
@@ -50,6 +69,6 @@ require (
golang.org/x/sys v0.13.0
golang.org/x/term v0.13.0
golang.org/x/text v0.13.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
google.golang.org/protobuf v1.30.0
gopkg.in/yaml.v3 v3.0.1 // indirect
)

150
go.sum
View File

@@ -1,18 +1,38 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc h1:zvQ6w7KwtQWgMQiewOF9tFtundRMVZFSAksNV6ogzuY=
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ=
github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
github.com/chewxy/math32 v1.0.8 h1:fU5E4Ec4Z+5RtRAi3TovSxUjQPkgRh+HbP7tKB2OFbM=
github.com/chewxy/math32 v1.0.8/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g=
@@ -37,7 +57,31 @@ github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QX
github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.0 h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v1.12.0 h1:/PtAHvnBY4Kqnx/xCQ3OIV9uYcSFGScBsWI3Oogeh6w=
github.com/google/flatbuffers v1.12.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
@@ -48,6 +92,9 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk=
github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
@@ -68,6 +115,8 @@ github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -75,14 +124,17 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9/go.mod h1:nR7l3gM6ubiOm+mCkmmUyIBUcBAyiUmW6dQrDZhugFE=
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
@@ -96,6 +148,8 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
@@ -112,19 +166,61 @@ github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 h1:lGdhQUN/cnWdSH3291CUuxSEqc+AsGTiDxPP3r2J0l4=
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -137,12 +233,56 @@ golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek=
golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM=
gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f h1:Yv4xsIx7HZOoyUGSJ2ksDyWE2qIBXROsZKt2ny3hCGM=
google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.32.0 h1:zWTV+LMdc3kaiJMSTOFz2UgSBgx8RNQoTGiZu3fR9S0=
google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak=
google.golang.org/grpc/cmd/protoc-gen-go-grpc v0.0.0-20200910201057-6591123024b3/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
@@ -157,4 +297,10 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg=
gorgonia.org/vecf32 v0.9.0/go.mod h1:NCc+5D2oxddRL11hd+pCB1PEyXWOyiQxfZ/1wwhOXCA=
gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A=
gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=

View File

@@ -1,101 +0,0 @@
package gpu
import (
"bufio"
"errors"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
)
// TODO - windows vs. non-windows vs darwin
// Discovery logic for AMD/ROCm GPUs
const (
DriverVersionFile = "/sys/module/amdgpu/version"
GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
// TODO probably break these down per GPU to make the logic simpler
GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
)
func AMDDetected() bool {
// Some driver versions (older?) don't have a version file, so just lookup the parent dir
sysfsDir := filepath.Dir(DriverVersionFile)
_, err := os.Stat(sysfsDir)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("amd driver not detected " + sysfsDir)
return false
} else if err != nil {
slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
return false
}
return true
}
func AMDDriverVersion() (string, error) {
_, err := os.Stat(DriverVersionFile)
if err != nil {
return "", fmt.Errorf("amdgpu file stat error: %s %w", DriverVersionFile, err)
}
fp, err := os.Open(DriverVersionFile)
if err != nil {
return "", err
}
defer fp.Close()
verString, err := io.ReadAll(fp)
if err != nil {
return "", err
}
return strings.TrimSpace(string(verString)), nil
}
func AMDGFXVersions() []Version {
res := []Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches {
fp, err := os.Open(match)
if err != nil {
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
continue
}
defer fp.Close()
scanner := bufio.NewScanner(fp)
// optionally, resize scanner's capacity for lines over 64K, see next example
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 {
slog.Debug("malformed " + line)
continue
}
l := len(ver[1])
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
if err1 != nil || err2 != nil || err3 != nil {
slog.Debug("malformed int " + line)
continue
}
res = append(res, Version{
Major: uint(major),
Minor: uint(minor),
Patch: uint(patch),
})
}
}
}
return res
}
func (v Version) ToGFXString() string {
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
}

56
gpu/amd_common.go Normal file
View File

@@ -0,0 +1,56 @@
//go:build linux || windows
package gpu
import (
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
)
// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
func rocmLibUsable(libDir string) bool {
slog.Debug("evaluating potential rocm lib dir " + libDir)
for _, g := range ROCmLibGlobs {
res, _ := filepath.Glob(filepath.Join(libDir, g))
if len(res) == 0 {
return false
}
}
return true
}
func GetSupportedGFX(libDir string) ([]string, error) {
var ret []string
files, err := filepath.Glob(filepath.Join(libDir, "rocblas", "library", "TensileLibrary_lazy_gfx*.dat"))
if err != nil {
return nil, err
}
for _, file := range files {
ret = append(ret, strings.TrimSuffix(strings.TrimPrefix(filepath.Base(file), "TensileLibrary_lazy_"), ".dat"))
}
return ret, nil
}
func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
// Set the visible devices if not already set
// TODO - does sort order matter?
devices := []string{}
for i := range ids {
if _, skipped := skip[i]; skipped {
continue
}
devices = append(devices, strconv.Itoa(i))
}
val := strings.Join(devices, ",")
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
if err != nil {
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
} else {
slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
}
}

141
gpu/amd_hip_windows.go Normal file
View File

@@ -0,0 +1,141 @@
package gpu
import (
"fmt"
"log/slog"
"strconv"
"syscall"
"unsafe"
"golang.org/x/sys/windows"
)
const (
hipSuccess = 0
hipErrorNoDevice = 100
)
type hipDevicePropMinimal struct {
Name [256]byte
unused1 [140]byte
GcnArchName [256]byte // gfx####
iGPU int // Doesn't seem to actually report correctly
unused2 [128]byte
}
// Wrap the amdhip64.dll library for GPU discovery
type HipLib struct {
dll windows.Handle
hipGetDeviceCount uintptr
hipGetDeviceProperties uintptr
hipMemGetInfo uintptr
hipSetDevice uintptr
hipDriverGetVersion uintptr
}
func NewHipLib() (*HipLib, error) {
h, err := windows.LoadLibrary("amdhip64.dll")
if err != nil {
return nil, fmt.Errorf("unable to load amdhip64.dll: %w", err)
}
hl := &HipLib{}
hl.dll = h
hl.hipGetDeviceCount, err = windows.GetProcAddress(hl.dll, "hipGetDeviceCount")
if err != nil {
return nil, err
}
hl.hipGetDeviceProperties, err = windows.GetProcAddress(hl.dll, "hipGetDeviceProperties")
if err != nil {
return nil, err
}
hl.hipMemGetInfo, err = windows.GetProcAddress(hl.dll, "hipMemGetInfo")
if err != nil {
return nil, err
}
hl.hipSetDevice, err = windows.GetProcAddress(hl.dll, "hipSetDevice")
if err != nil {
return nil, err
}
hl.hipDriverGetVersion, err = windows.GetProcAddress(hl.dll, "hipDriverGetVersion")
if err != nil {
return nil, err
}
return hl, nil
}
// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup
// so we have to unload/reset the library after we do our initial discovery
// to make sure our updates to that variable are processed by llama.cpp
func (hl *HipLib) Release() {
err := windows.FreeLibrary(hl.dll)
if err != nil {
slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err))
}
hl.dll = 0
}
func (hl *HipLib) AMDDriverVersion() (string, error) {
if hl.dll == 0 {
return "", fmt.Errorf("dll has been unloaded")
}
var version int
status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
if status != hipSuccess {
return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
}
return strconv.Itoa(version), nil
}
func (hl *HipLib) HipGetDeviceCount() int {
if hl.dll == 0 {
slog.Error("dll has been unloaded")
return 0
}
var count int
status, _, err := syscall.SyscallN(hl.hipGetDeviceCount, uintptr(unsafe.Pointer(&count)))
if status == hipErrorNoDevice {
slog.Info("AMD ROCm reports no devices found")
return 0
}
if status != hipSuccess {
slog.Warn(fmt.Sprintf("failed call to hipGetDeviceCount: %d %s", status, err))
}
return count
}
func (hl *HipLib) HipSetDevice(device int) error {
if hl.dll == 0 {
return fmt.Errorf("dll has been unloaded")
}
status, _, err := syscall.SyscallN(hl.hipSetDevice, uintptr(device))
if status != hipSuccess {
return fmt.Errorf("failed call to hipSetDevice: %d %s", status, err)
}
return nil
}
func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, error) {
if hl.dll == 0 {
return nil, fmt.Errorf("dll has been unloaded")
}
var props hipDevicePropMinimal
status, _, err := syscall.SyscallN(hl.hipGetDeviceProperties, uintptr(unsafe.Pointer(&props)), uintptr(device))
if status != hipSuccess {
return nil, fmt.Errorf("failed call to hipGetDeviceProperties: %d %s", status, err)
}
return &props, nil
}
// free, total, err
func (hl *HipLib) HipMemGetInfo() (uint64, uint64, error) {
if hl.dll == 0 {
return 0, 0, fmt.Errorf("dll has been unloaded")
}
var totalMemory uint64
var freeMemory uint64
status, _, err := syscall.SyscallN(hl.hipMemGetInfo, uintptr(unsafe.Pointer(&freeMemory)), uintptr(unsafe.Pointer(&totalMemory)))
if status != hipSuccess {
return 0, 0, fmt.Errorf("failed call to hipMemGetInfo: %d %s", status, err)
}
return freeMemory, totalMemory, nil
}

434
gpu/amd_linux.go Normal file
View File

@@ -0,0 +1,434 @@
package gpu
import (
"bufio"
"errors"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
)
// Discovery logic for AMD/ROCm GPUs
const (
DriverVersionFile = "/sys/module/amdgpu/version"
AMDNodesSysfsDir = "/sys/class/kfd/kfd/topology/nodes/"
GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"
// Prefix with the node dir
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
RocmStandardLocation = "/opt/rocm/lib"
// TODO find a better way to detect iGPU instead of minimum memory
IGPUMemLimit = 1024 * 1024 * 1024 // 512G is what they typically report, so anything less than 1G must be iGPU
)
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
)
// Gather GPU information from the amdgpu driver if any supported GPUs are detected
// HIP_VISIBLE_DEVICES will be set if we detect a mix of unsupported and supported devices
// and the user hasn't already set this variable
func AMDGetGPUInfo(resp *GpuInfo) {
// TODO - DRY this out with windows
if !AMDDetected() {
return
}
skip := map[int]interface{}{}
// Opportunistic logging of driver version to aid in troubleshooting
ver, err := AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
slog.Warn(fmt.Sprintf("ollama recommends running the https://www.amd.com/en/support/linux-drivers: %s", err))
}
// If the user has specified exactly which GPUs to use, look up their memory
visibleDevices := os.Getenv("HIP_VISIBLE_DEVICES")
if visibleDevices != "" {
ids := []int{}
for _, idStr := range strings.Split(visibleDevices, ",") {
id, err := strconv.Atoi(idStr)
if err != nil {
slog.Warn(fmt.Sprintf("malformed HIP_VISIBLE_DEVICES=%s %s", visibleDevices, err))
} else {
ids = append(ids, id)
}
}
amdProcMemLookup(resp, nil, ids)
return
}
// Gather GFX version information from all detected cards
gfx := AMDGFXVersions()
verStrings := []string{}
for i, v := range gfx {
verStrings = append(verStrings, v.ToGFXString())
if v.Major == 0 {
// Silently skip CPUs
skip[i] = struct{}{}
continue
}
if v.Major < 9 {
// TODO consider this a build-time setting if we can support 8xx family GPUs
slog.Warn(fmt.Sprintf("amdgpu [%d] too old %s", i, v.ToGFXString()))
skip[i] = struct{}{}
}
}
slog.Info(fmt.Sprintf("detected amdgpu versions %v", verStrings))
// Abort if all GPUs are skipped
if len(skip) >= len(gfx) {
slog.Info("all detected amdgpus are skipped, falling back to CPU")
return
}
// If we got this far, then we have at least 1 GPU that's a ROCm candidate, so make sure we have a lib
libDir, err := AMDValidateLibDir()
if err != nil {
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
return
}
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
if gfxOverride == "" {
supported, err := GetSupportedGFX(libDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
return
}
slog.Debug(fmt.Sprintf("rocm supported GPU types %v", supported))
for i, v := range gfx {
if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
skip[i] = struct{}{}
} else {
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
}
}
} else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
}
if len(skip) >= len(gfx) {
slog.Info("all detected amdgpus are skipped, falling back to CPU")
return
}
ids := make([]int, len(gfx))
i := 0
for k := range gfx {
ids[i] = k
i++
}
amdProcMemLookup(resp, skip, ids)
if resp.memInfo.DeviceCount == 0 {
return
}
if len(skip) > 0 {
amdSetVisibleDevices(ids, skip)
}
}
// Walk the sysfs nodes for the available GPUs and gather information from them
// skipping over any devices in the skip map
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0
slog.Debug("discovering VRAM for amdgpu devices")
if len(ids) == 0 {
entries, err := os.ReadDir(AMDNodesSysfsDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
return
}
for _, node := range entries {
if !node.IsDir() {
continue
}
id, err := strconv.Atoi(node.Name())
if err != nil {
slog.Warn("malformed amdgpu sysfs node id " + node.Name())
continue
}
ids = append(ids, id)
}
}
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
for _, id := range ids {
if _, skipped := skip[id]; skipped {
continue
}
totalMemory := uint64(0)
usedMemory := uint64(0)
// Adjust for sysfs vs HIP ids
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
propFiles, err := filepath.Glob(propGlob)
if err != nil {
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
}
// 1 or more memory banks - sum the values of all of them
for _, propFile := range propFiles {
fp, err := os.Open(propFile)
if err != nil {
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", propFile, err))
continue
}
defer fp.Close()
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "size_in_bytes") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Warn("malformed " + line)
continue
}
bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Warn("malformed int " + line)
continue
}
totalMemory += bankSizeInBytes
}
}
}
if totalMemory == 0 {
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
skip[id] = struct{}{}
continue
}
if totalMemory < IGPUMemLimit {
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
skip[id] = struct{}{}
continue
}
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
usedFiles, err := filepath.Glob(usedGlob)
if err != nil {
slog.Warn(fmt.Sprintf("error looking up used GPU memory: %s %s", usedGlob, err))
continue
}
for _, usedFile := range usedFiles {
fp, err := os.Open(usedFile)
if err != nil {
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", usedFile, err))
continue
}
defer fp.Close()
data, err := io.ReadAll(fp)
if err != nil {
slog.Warn(fmt.Sprintf("failed to read sysfs node file %s: %s", usedFile, err))
continue
}
used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
if err != nil {
slog.Warn(fmt.Sprintf("malformed used memory %s: %s", string(data), err))
continue
}
usedMemory += used
}
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
resp.memInfo.DeviceCount++
resp.memInfo.TotalMemory += totalMemory
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
}
if resp.memInfo.DeviceCount > 0 {
resp.Library = "rocm"
}
}
// Quick check for AMD driver so we can skip amdgpu discovery if not present
func AMDDetected() bool {
// Some driver versions (older?) don't have a version file, so just lookup the parent dir
sysfsDir := filepath.Dir(DriverVersionFile)
_, err := os.Stat(sysfsDir)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("amdgpu driver not detected " + sysfsDir)
return false
} else if err != nil {
slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
return false
}
return true
}
func setupLink(source, target string) error {
if err := os.RemoveAll(target); err != nil {
return fmt.Errorf("failed to remove old rocm directory %s %w", target, err)
}
if err := os.Symlink(source, target); err != nil {
return fmt.Errorf("failed to create link %s => %s %w", source, target, err)
}
slog.Debug(fmt.Sprintf("host rocm linked %s => %s", source, target))
return nil
}
// Ensure the AMD rocm lib dir is wired up
// Prefer to use host installed ROCm, as long as it meets our minimum requirements
// failing that, tell the user how to download it on their own
func AMDValidateLibDir() (string, error) {
// We rely on the rpath compiled into our library to find rocm
// so we establish a symlink to wherever we find it on the system
// to <payloads>/rocm
payloadsDir, err := PayloadsDir()
if err != nil {
return "", err
}
// If we already have a rocm dependency wired, nothing more to do
rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
if rocmLibUsable(rocmTargetDir) {
return rocmTargetDir, nil
}
// next to the running binary
exe, err := os.Executable()
if err == nil {
peerDir := filepath.Dir(exe)
if rocmLibUsable(peerDir) {
slog.Debug("detected ROCM next to ollama executable " + peerDir)
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
}
peerDir = filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(peerDir) {
slog.Debug("detected ROCM next to ollama executable " + peerDir)
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
}
}
// Well known ollama installer path
installedRocmDir := "/usr/share/ollama/lib/rocm"
if rocmLibUsable(installedRocmDir) {
return rocmTargetDir, setupLink(installedRocmDir, rocmTargetDir)
}
// Prefer explicit HIP env var
hipPath := os.Getenv("HIP_PATH")
if hipPath != "" {
hipLibDir := filepath.Join(hipPath, "lib")
if rocmLibUsable(hipLibDir) {
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
return rocmTargetDir, setupLink(hipLibDir, rocmTargetDir)
}
}
// Scan the library path for potential matches
ldPaths := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
for _, ldPath := range ldPaths {
d, err := filepath.Abs(ldPath)
if err != nil {
continue
}
if rocmLibUsable(d) {
return rocmTargetDir, setupLink(d, rocmTargetDir)
}
}
// Well known location(s)
if rocmLibUsable("/opt/rocm/lib") {
return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
}
// If we still haven't found a usable rocm, the user will have to install it on their own
slog.Warn("amdgpu detected, but no compatible rocm library found. Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
func AMDDriverVersion() (string, error) {
_, err := os.Stat(DriverVersionFile)
if err != nil {
return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
}
fp, err := os.Open(DriverVersionFile)
if err != nil {
return "", err
}
defer fp.Close()
verString, err := io.ReadAll(fp)
if err != nil {
return "", err
}
return strings.TrimSpace(string(verString)), nil
}
func AMDGFXVersions() map[int]Version {
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
res := map[int]Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches {
fp, err := os.Open(match)
if err != nil {
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
continue
}
defer fp.Close()
i, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
if err != nil {
slog.Debug(fmt.Sprintf("failed to parse node ID %s", err))
continue
}
if i == 0 {
// Skipping the CPU
continue
}
// Align with HIP IDs (zero is first GPU, not CPU)
i -= 1
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 {
if ver[1] != "0" {
slog.Debug("malformed " + line)
}
res[i] = Version{
Major: 0,
Minor: 0,
Patch: 0,
}
continue
}
l := len(ver[1])
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
if err1 != nil || err2 != nil || err3 != nil {
slog.Debug("malformed int " + line)
continue
}
res[i] = Version{
Major: uint(major),
Minor: uint(minor),
Patch: uint(patch),
}
}
}
}
return res
}
func (v Version) ToGFXString() string {
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
}

180
gpu/amd_windows.go Normal file
View File

@@ -0,0 +1,180 @@
package gpu
import (
"bytes"
"fmt"
"log/slog"
"os"
"path/filepath"
"slices"
"strings"
)
const (
RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
// TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
iGPUName = "AMD Radeon(TM) Graphics"
)
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
)
func AMDGetGPUInfo(resp *GpuInfo) {
hl, err := NewHipLib()
if err != nil {
slog.Debug(err.Error())
return
}
defer hl.Release()
skip := map[int]interface{}{}
ids := []int{}
resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0
ver, err := hl.AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err))
}
// Note: the HIP library automatically handles HIP_VISIBLE_DEVICES
count := hl.HipGetDeviceCount()
if count == 0 {
return
}
libDir, err := AMDValidateLibDir()
if err != nil {
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
return
}
var supported []string
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
if gfxOverride == "" {
supported, err = GetSupportedGFX(libDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
return
}
} else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
}
slog.Info(fmt.Sprintf("detected %d hip devices", count))
for i := 0; i < count; i++ {
ids = append(ids, i)
err = hl.HipSetDevice(i)
if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
skip[i] = struct{}{}
continue
}
props, err := hl.HipGetDeviceProperties(i)
if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
skip[i] = struct{}{}
continue
}
n := bytes.IndexByte(props.Name[:], 0)
name := string(props.Name[:n])
slog.Info(fmt.Sprintf("[%d] Name: %s", i, name))
n = bytes.IndexByte(props.GcnArchName[:], 0)
gfx := string(props.GcnArchName[:n])
slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx))
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
// TODO Why isn't props.iGPU accurate!?
if strings.EqualFold(name, iGPUName) {
slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i))
skip[i] = struct{}{}
continue
}
if gfxOverride == "" {
if !slices.Contains[[]string, string](supported, gfx) {
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported))
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
skip[i] = struct{}{}
continue
} else {
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx))
}
}
totalMemory, freeMemory, err := hl.HipMemGetInfo()
if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
continue
}
// TODO according to docs, freeMem may lie on windows!
slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory))
slog.Info(fmt.Sprintf("[%d] Free Mem: %d", i, freeMemory))
resp.memInfo.DeviceCount++
resp.memInfo.TotalMemory += totalMemory
resp.memInfo.FreeMemory += freeMemory
}
if resp.memInfo.DeviceCount > 0 {
resp.Library = "rocm"
}
// Abort if all GPUs are skipped
if len(skip) >= count {
slog.Info("all detected amdgpus are skipped, falling back to CPU")
return
}
if len(skip) > 0 {
amdSetVisibleDevices(ids, skip)
}
UpdatePath(libDir)
}
func AMDValidateLibDir() (string, error) {
// On windows non-admins typically can't create links
// so instead of trying to rely on rpath and a link in
// $LibDir/rocm, we instead rely on setting PATH to point
// to the location of the ROCm library
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
}
}
// Installer payload (if we're running from some other location)
localAppData := os.Getenv("LOCALAPPDATA")
appDir := filepath.Join(localAppData, "Programs", "Ollama")
rocmTargetDir := filepath.Join(appDir, "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
return rocmTargetDir, nil
}
// Prefer explicit HIP env var
hipPath := os.Getenv("HIP_PATH")
if hipPath != "" {
hipLibDir := filepath.Join(hipPath, "bin")
if rocmLibUsable(hipLibDir) {
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
return hipLibDir, nil
}
}
// Well known location(s)
if rocmLibUsable(RocmStandardLocation) {
return RocmStandardLocation, nil
}
// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}

112
gpu/assets.go Normal file
View File

@@ -0,0 +1,112 @@
package gpu
import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
)
var (
lock sync.Mutex
payloadsDir = ""
)
func PayloadsDir() (string, error) {
lock.Lock()
defer lock.Unlock()
if payloadsDir == "" {
cleanupTmpDirs()
tmpDir, err := os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
// Track our pid so we can clean up orphaned tmpdirs
pidFilePath := filepath.Join(tmpDir, "ollama.pid")
pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
if err != nil {
return "", err
}
if _, err := pidFile.Write([]byte(fmt.Sprint(os.Getpid()))); err != nil {
return "", err
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
payloadsDir = filepath.Join(tmpDir, "runners")
}
return payloadsDir, nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*"))
if err != nil {
return
}
for _, d := range dirs {
info, err := os.Stat(d)
if err != nil || !info.IsDir() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err == nil {
pid, err := strconv.Atoi(string(raw))
if err == nil {
if proc, err := os.FindProcess(int(pid)); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
// Another running ollama, ignore this tmpdir
continue
}
}
} else {
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
}
err = os.RemoveAll(d)
if err != nil {
slog.Debug(fmt.Sprintf("unable to cleanup stale tmpdir %s: %s", d, err))
}
}
}
func Cleanup() {
lock.Lock()
defer lock.Unlock()
if payloadsDir != "" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}
func UpdatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

View File

@@ -23,8 +23,8 @@ import (
)
type handles struct {
cuda *C.cuda_handle_t
rocm *C.rocm_handle_t
nvml *C.nvml_handle_t
cudart *C.cudart_handle_t
}
var gpuMutex sync.Mutex
@@ -34,7 +34,7 @@ var gpuHandles *handles = nil
var CudaComputeMin = [2]C.int{5, 0}
// Possible locations for the nvidia-ml library
var CudaLinuxGlobs = []string{
var NvmlLinuxGlobs = []string{
"/usr/local/cuda/lib64/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
@@ -42,75 +42,98 @@ var CudaLinuxGlobs = []string{
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*",
// TODO: are these stubs ever valid?
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
}
var CudaWindowsGlobs = []string{
var NvmlWindowsGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var RocmLinuxGlobs = []string{
"/opt/rocm*/lib*/librocm_smi64.so*",
var CudartLinuxGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
}
var RocmWindowsGlobs = []string{
"c:\\Windows\\System32\\rocm_smi64.dll",
var CudartWindowsGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
// Note: gpuMutex must already be held
func initGPUHandles() {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
gpuHandles = &handles{nil, nil}
var cudaMgmtName string
var cudaMgmtPatterns []string
var rocmMgmtName string
var rocmMgmtPatterns []string
var nvmlMgmtName string
var nvmlMgmtPatterns []string
var cudartMgmtName string
var cudartMgmtPatterns []string
tmpDir, _ := PayloadsDir()
switch runtime.GOOS {
case "windows":
cudaMgmtName = "nvml.dll"
cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
copy(cudaMgmtPatterns, CudaWindowsGlobs)
rocmMgmtName = "rocm_smi64.dll"
rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
copy(rocmMgmtPatterns, RocmWindowsGlobs)
nvmlMgmtName = "nvml.dll"
nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
cudartMgmtName = "cudart64_*.dll"
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
case "linux":
cudaMgmtName = "libnvidia-ml.so"
cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
copy(cudaMgmtPatterns, CudaLinuxGlobs)
rocmMgmtName = "librocm_smi64.so"
rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
copy(rocmMgmtPatterns, RocmLinuxGlobs)
nvmlMgmtName = "libnvidia-ml.so"
nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
cudartMgmtName = "libcudart.so*"
if tmpDir != "" {
// TODO - add "payloads" for subprocess
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
default:
return
}
slog.Info("Detecting GPU type")
cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
if len(cudaLibPaths) > 0 {
cuda := LoadCUDAMgmt(cudaLibPaths)
if cuda != nil {
slog.Info("Nvidia GPU detected")
gpuHandles.cuda = cuda
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
if len(cudartLibPaths) > 0 {
cudart := LoadCUDARTMgmt(cudartLibPaths)
if cudart != nil {
slog.Info("Nvidia GPU detected via cudart")
gpuHandles.cudart = cudart
return
}
}
rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
if len(rocmLibPaths) > 0 {
rocm := LoadROCMMgmt(rocmLibPaths)
if rocm != nil {
slog.Info("Radeon GPU detected")
gpuHandles.rocm = rocm
// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
if len(nvmlLibPaths) > 0 {
nvml := LoadNVMLMgmt(nvmlLibPaths)
if nvml != nil {
slog.Info("Nvidia GPU detected via nvidia-ml")
gpuHandles.nvml = nvml
return
}
}
}
func GetGPUInfo() GpuInfo {
@@ -130,85 +153,48 @@ func GetGPUInfo() GpuInfo {
var memInfo C.mem_info_t
resp := GpuInfo{}
if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.count > 0 {
// Verify minimum compute capability
var cc C.cuda_compute_capability_t
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
var cc C.nvml_compute_capability_t
C.nvml_compute_capability(*gpuHandles.nvml, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
ver, err := AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug("error looking up amd driver version: %s", err)
}
gfx := AMDGFXVersions()
tooOld := false
for _, v := range gfx {
if v.Major < 9 {
slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
tooOld = true
break
} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.count > 0 {
// Verify minimum compute capability
var cc C.cudart_compute_capability_t
C.cudart_compute_capability(*gpuHandles.cudart, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
// TODO - remap gfx strings for unsupporetd minor/patch versions to supported for the same major
// e.g. gfx1034 works if we map it to gfx1030 at runtime
}
if !tooOld {
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else if memInfo.count > 0 {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" {
devices := []string{}
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) {
continue
}
devices = append(devices, strconv.Itoa(i))
}
val = strings.Join(devices, ",")
os.Setenv("ROCR_VISIBLE_DEVICES", val)
}
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
}
resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
verString := C.GoString(version.str)
if version.status == 0 {
resp.Variant = "v" + verString
} else {
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
}
C.free(unsafe.Pointer(version.str))
}
} else {
AMDGetGPUInfo(&resp)
if resp.Library != "" {
return resp
}
}
if resp.Library == "" {
@@ -242,6 +228,15 @@ func getCPUMem() (memInfo, error) {
}
func CheckVRAM() (int64, error) {
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseInt(userLimit, 10, 64)
if err != nil {
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
}
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
return avail, nil
}
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
@@ -250,6 +245,11 @@ func CheckVRAM() (int64, error) {
if overhead < gpus*1024*1024*1024 {
overhead = gpus * 1024 * 1024 * 1024
}
// Assigning full reported free memory for Tegras due to OS controlled caching.
if CudaTegra != "" {
// Setting overhead for non-Tegra devices
overhead = 0
}
avail := int64(gpuInfo.FreeMemory - overhead)
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
return avail, nil
@@ -312,15 +312,15 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
return gpuLibPaths
}
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
var resp C.cuda_init_resp_t
func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
var resp C.nvml_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range cudaLibPaths {
for _, libPath := range nvmlLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.cuda_init(lib, &resp)
C.nvml_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.ch
@@ -329,18 +329,18 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
return nil
}
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
var resp C.rocm_init_resp_t
resp.rh.verbose = getVerboseState()
for _, libPath := range rocmLibPaths {
func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
var resp C.cudart_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range cudartLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.rocm_init(lib, &resp)
C.cudart_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.rh
return &resp.ch
}
}
return nil

View File

@@ -1,6 +1,7 @@
//go:build darwin
package gpu
/*
#cgo CFLAGS: -x objective-c
#cgo LDFLAGS: -framework Foundation -framework CoreGraphics -framework Metal
@@ -8,11 +9,25 @@ package gpu
*/
import "C"
import (
"fmt"
"log/slog"
"os"
"runtime"
"strconv"
)
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseInt(userLimit, 10, 64)
if err != nil {
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
}
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
return avail, nil
}
if runtime.GOARCH == "amd64" {
// gpu not supported, this may not be metal
return 0, nil

View File

@@ -52,8 +52,8 @@ void cpu_check_ram(mem_info_t *resp);
}
#endif
#include "gpu_info_cuda.h"
#include "gpu_info_rocm.h"
#include "gpu_info_nvml.h"
#include "gpu_info_cudart.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__

190
gpu/gpu_info_cudart.c Normal file
View File

@@ -0,0 +1,190 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_cudart.h"
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
cudartReturn_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
{"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
{"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
{"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
{"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
{"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
{"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cudart_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.cudaSetDevice)(0);
if (ret != CUDART_SUCCESS) {
LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "cudart init failure: %d", ret);
resp->err = strdup(buf);
return;
}
int version = 0;
cudartDriverVersion_t driverVersion;
driverVersion.major = 0;
driverVersion.minor = 0;
// Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.cudaDriverGetVersion)(&version);
if (ret != CUDART_SUCCESS) {
LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
} else {
driverVersion.major = version / 1000;
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
}
}
void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
resp->err = NULL;
cudartMemory_t memInfo = {0,0,0};
cudartReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("cudart handle isn't initialized");
return;
}
// cudaGetDeviceCount takes int type, resp-> count is uint
int deviceCount;
ret = (*h.cudaGetDeviceCount)(&deviceCount);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
} else {
resp->count = (unsigned int)deviceCount;
}
resp->total = 0;
resp->free = 0;
for (i = 0; i < resp-> count; i++) {
ret = (*h.cudaSetDevice)(i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "cudart device failed to initialize");
resp->err = strdup(buf);
return;
}
ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);
resp->total += memInfo.total;
resp->free += memInfo.free;
}
}
void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
resp->err = NULL;
resp->major = 0;
resp->minor = 0;
int major = 0;
int minor = 0;
cudartReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("cudart handle not initialized");
return;
}
int devices;
ret = (*h.cudaGetDeviceCount)(&devices);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
resp->err = strdup(buf);
return;
}
for (i = 0; i < devices; i++) {
ret = (*h.cudaSetDevice)(i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "cudart device failed to initialize");
resp->err = strdup(buf);
return;
}
ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
// Report the lowest major.minor we detect as that limits our compatibility
if (resp->major == 0 || resp->major > major ) {
resp->major = major;
resp->minor = minor;
} else if ( resp->major == major && resp->minor > minor ) {
resp->minor = minor;
}
}
}
#endif // __APPLE__

59
gpu/gpu_info_cudart.h Normal file
View File

@@ -0,0 +1,59 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDART_H__
#define __GPU_INFO_CUDART_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudartReturn_enum {
CUDART_SUCCESS = 0,
CUDART_UNSUPPORTED = 1,
// Other values omitted for now...
} cudartReturn_t;
typedef enum cudartDeviceAttr_enum {
cudartDevAttrComputeCapabilityMajor = 75,
cudartDevAttrComputeCapabilityMinor = 76,
} cudartDeviceAttr_t;
typedef void *cudartDevice_t; // Opaque is sufficient
typedef struct cudartMemory_st {
size_t total;
size_t free;
size_t used;
} cudartMemory_t;
typedef struct cudartDriverVersion {
int major;
int minor;
} cudartDriverVersion_t;
typedef struct cudart_handle {
void *handle;
uint16_t verbose;
cudartReturn_t (*cudaSetDevice)(int device);
cudartReturn_t (*cudaDeviceSynchronize)(void);
cudartReturn_t (*cudaDeviceReset)(void);
cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
cudartReturn_t (*cudaGetDeviceCount)(int *);
cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
} cudart_handle_t;
typedef struct cudart_init_resp {
char *err; // If err is non-null handle is invalid
cudart_handle_t ch;
} cudart_init_resp_t;
typedef struct cudart_compute_capability {
char *err;
int major;
int minor;
} cudart_compute_capability_t;
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
#endif // __GPU_INFO_CUDART_H__
#endif // __APPLE__

View File

@@ -1,10 +1,10 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include "gpu_info_cuda.h"
#include <string.h>
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
#include "gpu_info_nvml.h"
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
@@ -30,20 +30,20 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_path, msg);
nvml_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
@@ -82,7 +82,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
}
}
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
resp->err = NULL;
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
@@ -92,7 +92,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle sn't initialized");
resp->err = strdup("nvml handle isn't initialized");
return;
}
@@ -124,31 +124,31 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
// When in verbose mode, report more information about
// the card we discover, but don't fail on error
ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
if (ret != NVML_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
if (ret != NVML_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
if (ret != NVML_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
if (ret != NVML_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetBrand)(device, &brand);
if (ret != RSMI_STATUS_SUCCESS) {
if (ret != NVML_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
@@ -156,14 +156,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
}
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
resp->total += memInfo.total;
resp->free += memInfo.free;
}
}
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
resp->err = NULL;
resp->major = 0;
resp->minor = 0;

View File

@@ -1,6 +1,6 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDA_H__
#define __GPU_INFO_CUDA_H__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
@@ -20,7 +20,7 @@ typedef enum nvmlBrandType_enum
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct cuda_handle {
typedef struct nvml_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
@@ -35,22 +35,22 @@ typedef struct cuda_handle {
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
} cuda_handle_t;
} nvml_handle_t;
typedef struct cuda_init_resp {
typedef struct nvml_init_resp {
char *err; // If err is non-null handle is invalid
cuda_handle_t ch;
} cuda_init_resp_t;
nvml_handle_t ch;
} nvml_init_resp_t;
typedef struct cuda_compute_capability {
typedef struct nvml_compute_capability {
char *err;
int major;
int minor;
} cuda_compute_capability_t;
} nvml_compute_capability_t;
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
#endif // __GPU_INFO_CUDA_H__
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__

View File

@@ -1,198 +0,0 @@
#ifndef __APPLE__
#include "gpu_info_rocm.h"
#include <string.h>
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
rsmi_status_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"rsmi_init", (void *)&resp->rh.rsmi_init},
{"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
{"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
{"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
{"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
{"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
{"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
{"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
{"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
{"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
{"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
{"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
{NULL, NULL},
};
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
if (!resp->rh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
resp->rh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->rh.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->rh.rsmi_init)(0);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL;
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf);
}
return;
}
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
resp->igpu_index = -1;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("rocm handle not initialized");
return;
}
ret = (*h.rsmi_num_monitor_devices)(&resp->count);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
resp->total = 0;
resp->free = 0;
for (i = 0; i < resp->count; i++) {
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover, but don't fail on error
ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
}
ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
}
ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
}
ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
}
}
// Get total memory - used memory for available memory
ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
if (totalMem < 1024 * 1024 * 1024) {
// Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
resp->igpu_index = i;
} else {
resp->total += totalMem;
resp->free += totalMem - usedMem;
}
}
}
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("rocm handle not initialized");
resp->status = 1;
return;
}
rsmi_version_t ver;
rsmi_status_t ret;
ret = h.rsmi_version_get(&ver);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1;
} else {
snprintf(buf, buflen, "%d", ver.major);
resp->status = 0;
}
resp->str = strdup(buf);
}
#endif // __APPLE__

View File

@@ -1,59 +0,0 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_ROCM_H__
#define __GPU_INFO_ROCM_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum rsmi_status_return {
RSMI_STATUS_SUCCESS = 0,
// Other values omitted for now...
} rsmi_status_t;
typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_VRAM = 0,
RSMI_MEM_TYPE_VIS_VRAM,
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct {
uint32_t major;
uint32_t minor;
uint32_t patch;
const char *build;
} rsmi_version_t;
typedef struct rocm_handle {
void *handle;
uint16_t verbose;
rsmi_status_t (*rsmi_init)(uint64_t);
rsmi_status_t (*rsmi_shut_down)(void);
rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
} rocm_handle_t;
typedef struct rocm_init_resp {
char *err; // If err is non-null handle is invalid
rocm_handle_t rh;
} rocm_init_resp_t;
typedef struct rocm_version_resp {
rsmi_status_t status;
char *str; // Contains version or error string if status != 0
} rocm_version_resp_t;
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__

11
integration/README.md Normal file
View File

@@ -0,0 +1,11 @@
# Integration Tests
This directory contains integration tests to exercise Ollama end-to-end to verify behavior
By default, these tests are disabled so `go test ./...` will exercise only unit tests. To run integration tests you must pass the integration tag. `go test -tags=integration ./...`
The integration tests have 2 modes of operating.
1. By default, they will start the server on a random port, run the tests, and then shutdown the server.
2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote

28
integration/basic_test.go Normal file
View File

@@ -0,0 +1,28 @@
//go:build integration
package integration
import (
"context"
"net/http"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func TestOrcaMiniBlueSky(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: "orca-mini",
Prompt: "why is the sky blue?",
Stream: &stream,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
},
}
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh"})
}

View File

@@ -1,49 +1,38 @@
//go:build integration
package server
package integration
import (
"context"
"encoding/base64"
"log"
"os"
"strings"
"net/http"
"testing"
"time"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/llm"
"github.com/stretchr/testify/assert"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
)
func TestIntegrationMultimodal(t *testing.T) {
SkipIFNoTestData(t)
image, err := base64.StdEncoding.DecodeString(imageEncoding)
require.NoError(t, err)
req := api.GenerateRequest{
Model: "llava:7b",
Prompt: "what does the text in this image say?",
Options: map[string]interface{}{},
Model: "llava:7b",
Prompt: "what does the text in this image say?",
Stream: &stream,
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
Images: []api.ImageData{
image,
},
}
resp := "the ollamas"
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
opts := api.DefaultOptions()
opts.Seed = 42
opts.Temperature = 0.0
model, llmRunner := PrepareModelForPrompts(t, req.Model, opts)
defer llmRunner.Close()
response := OneShotPromptResponse(t, ctx, req, model, llmRunner)
log.Print(response)
assert.Contains(t, strings.ToLower(response), resp)
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{resp})
}
const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb

73
integration/llm_test.go Normal file
View File

@@ -0,0 +1,73 @@
//go:build integration
package integration
import (
"context"
"net/http"
"sync"
"testing"
"time"
"github.com/ollama/ollama/api"
)
// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
// package to avoid circular dependencies
// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server)
//
// TODO - Fix this ^^
var (
stream = false
req = [2]api.GenerateRequest{
{
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
},
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims"},
}
)
func TestIntegrationSimpleOrcaMini(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
defer cancel()
GenerateTestHelper(ctx, t, &http.Client{}, req[0], resp[0])
}
// TODO
// The server always loads a new runner and closes the old one, which forces serial execution
// At present this test case fails with concurrency problems. Eventually we should try to
// get true concurrency working with n_parallel support in the backend
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
var wg sync.WaitGroup
wg.Add(len(req))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
defer cancel()
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
GenerateTestHelper(ctx, t, &http.Client{}, req[i], resp[i])
}(i)
}
wg.Wait()
}
// TODO - create a parallel test with 2 different models once we support concurrency

250
integration/utils_test.go Normal file
View File

@@ -0,0 +1,250 @@
//go:build integration
package integration
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/app/lifecycle"
"github.com/stretchr/testify/assert"
)
func FindPort() string {
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
return strconv.Itoa(port)
}
func GetTestEndpoint() (string, string) {
defaultPort := "11434"
ollamaHost := os.Getenv("OLLAMA_HOST")
scheme, hostport, ok := strings.Cut(ollamaHost, "://")
if !ok {
scheme, hostport = "http", ollamaHost
}
// trim trailing slashes
hostport = strings.TrimRight(hostport, "/")
host, port, err := net.SplitHostPort(hostport)
if err != nil {
host, port = "127.0.0.1", defaultPort
if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil {
host = ip.String()
} else if hostport != "" {
host = hostport
}
}
if os.Getenv("OLLAMA_TEST_EXISTING") == "" && port == defaultPort {
port = FindPort()
}
url := fmt.Sprintf("%s:%s", host, port)
slog.Info("server connection", "url", url)
return scheme, url
}
// TODO make fanicier, grab logs, etc.
var serverMutex sync.Mutex
var serverReady bool
func StartServer(ctx context.Context, ollamaHost string) error {
// Make sure the server has been built
CLIName, err := filepath.Abs("../ollama")
if err != nil {
return err
}
if runtime.GOOS == "windows" {
CLIName += ".exe"
}
_, err = os.Stat(CLIName)
if err != nil {
return fmt.Errorf("CLI missing, did you forget to build first? %w", err)
}
serverMutex.Lock()
defer serverMutex.Unlock()
if serverReady {
return nil
}
if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
os.Setenv("OLLAMA_HOST", ollamaHost)
}
slog.Info("starting server", "url", ollamaHost)
done, err := lifecycle.SpawnServer(ctx, "../ollama")
if err != nil {
return fmt.Errorf("failed to start server: %w", err)
}
go func() {
<-ctx.Done()
serverMutex.Lock()
defer serverMutex.Unlock()
exitCode := <-done
if exitCode > 0 {
slog.Warn("server failure", "exit", exitCode)
}
serverReady = false
}()
// TODO wait only long enough for the server to be responsive...
time.Sleep(500 * time.Millisecond)
serverReady = true
return nil
}
func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error {
slog.Debug("checking status of model", "model", modelName)
showReq := &api.ShowRequest{Name: modelName}
requestJSON, err := json.Marshal(showReq)
if err != nil {
return err
}
req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/show", bytes.NewReader(requestJSON))
if err != nil {
return err
}
// Make the request with the HTTP client
response, err := client.Do(req.WithContext(ctx))
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode == 200 {
slog.Info("model already present", "model", modelName)
return nil
}
slog.Info("model missing", "status", response.StatusCode)
pullReq := &api.PullRequest{Name: modelName, Stream: &stream}
requestJSON, err = json.Marshal(pullReq)
if err != nil {
return err
}
req, err = http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/pull", bytes.NewReader(requestJSON))
if err != nil {
return err
}
slog.Info("pulling", "model", modelName)
response, err = client.Do(req.WithContext(ctx))
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode != 200 {
return fmt.Errorf("failed to pull model") // TODO more details perhaps
}
slog.Info("model pulled", "model", modelName)
return nil
}
func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) {
requestJSON, err := json.Marshal(genReq)
if err != nil {
t.Fatalf("Error serializing request: %v", err)
}
defer func() {
if t.Failed() && os.Getenv("OLLAMA_TEST_EXISTING") == "" {
// TODO
fp, err := os.Open(lifecycle.ServerLogFile)
if err != nil {
slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
data, err := io.ReadAll(fp)
if err != nil {
slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
slog.Warn("SERVER LOG FOLLOWS")
os.Stderr.Write(data)
slog.Warn("END OF SERVER")
}
err = os.Remove(lifecycle.ServerLogFile)
if err != nil && !os.IsNotExist(err) {
slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
}
}()
scheme, testEndpoint := GetTestEndpoint()
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
assert.NoError(t, StartServer(ctx, testEndpoint))
}
err = PullIfMissing(ctx, client, scheme, testEndpoint, genReq.Model)
if err != nil {
t.Fatalf("Error pulling model: %v", err)
}
// Make the request and get the response
req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/generate", bytes.NewReader(requestJSON))
if err != nil {
t.Fatalf("Error creating request: %v", err)
}
// Set the content type for the request
req.Header.Set("Content-Type", "application/json")
// Make the request with the HTTP client
response, err := client.Do(req.WithContext(ctx))
if err != nil {
t.Fatalf("Error making request: %v", err)
}
defer response.Body.Close()
body, err := io.ReadAll(response.Body)
assert.NoError(t, err)
assert.Equal(t, response.StatusCode, 200, string(body))
// Verify the response is valid JSON
var payload api.GenerateResponse
err = json.Unmarshal(body, &payload)
if err != nil {
assert.NoError(t, err, body)
}
// Verify the response contains the expected data
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(payload.Response), resp) {
atLeastOne = true
break
}
}
assert.True(t, atLeastOne, "none of %v found in %s", anyResp, payload.Response)
}

View File

@@ -14,17 +14,14 @@
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
inline char *LOAD_ERR() {
LPSTR messageBuffer = NULL;
size_t size = FormatMessageA(
FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
(LPSTR)&messageBuffer, 0, NULL);
char *resp = strdup(messageBuffer);
LocalFree(messageBuffer);
return resp;
}
#define LOAD_ERR() ({\
LPSTR messageBuffer = NULL; \
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
char *resp = strdup(messageBuffer); \
LocalFree(messageBuffer); \
resp; \
})
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)

View File

@@ -28,13 +28,13 @@ import (
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
"unsafe"
"github.com/jmorganca/ollama/api"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
)
type dynExtServer struct {
@@ -64,15 +64,12 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
// Note: current implementation does not support concurrent instantiations
var llm *dynExtServer
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
updatePath(filepath.Dir(library))
gpu.UpdatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(512)
@@ -83,7 +80,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm = &dynExtServer{
llm := dynExtServer{
s: srv,
options: opts,
}
@@ -148,7 +145,8 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
}
slog.Info("Initializing llama server")
initResp := newExtServerResp(128)
slog.Debug(fmt.Sprintf("server params: %+v", sparams))
initResp := newExtServerResp(512)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
@@ -160,7 +158,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
slog.Info("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
return llm, nil
return &llm, nil
}
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
@@ -197,6 +195,9 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
if predict.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
@@ -224,17 +225,14 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
out:
for {
select {
case <-ctx.Done():
// This handles the request cancellation
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
return cancelCompletion(llm, resp)
default:
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
@@ -257,6 +255,20 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
break out
}
switch {
case strings.TrimSpace(p.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(p.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return cancelCompletion(llm, resp)
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
@@ -284,6 +296,15 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
return fmt.Errorf("max retries exceeded")
}
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
}
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
@@ -365,25 +386,3 @@ func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}
func updatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

View File

@@ -1,15 +1,11 @@
# Ollama specific CMakefile to include in llama.cpp/examples/server
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
if (WIN32)
add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp)
else()
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp)
endif()
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common )

View File

@@ -114,16 +114,12 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
llama_backend_init();
llama_numa_init(params.numa);
// load the model
if (!llama->load_model(params)) {
// TODO - consider modifying the logging logic or patching load_model so
// we can capture more detailed error messages and pass them back to the
// caller for better UX
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s",
params.model.c_str());
return;
}
if (!llama->load_model(params)) {
// an error occurred that was not thrown
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
return;
}
llama->initialize();
} catch (std::exception &e) {

8794
llm/ext_server/httplib.h vendored Normal file

File diff suppressed because it is too large Load Diff

24596
llm/ext_server/json.hpp vendored Normal file

File diff suppressed because it is too large Load Diff

3290
llm/ext_server/server.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

655
llm/ext_server/utils.hpp vendored Normal file
View File

@@ -0,0 +1,655 @@
// MIT License
// Copyright (c) 2023 Georgi Gerganov
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;
extern bool server_log_json;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE,
TASK_TYPE_METRICS
};
struct task_server {
int id = -1; // to be filled by llama_server_queue
int target_id;
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// completion token output with probabilities
struct completion_token_output {
struct token_prob
{
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
std::string text_to_send;
};
struct token_translator {
llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
};
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = nlohmann::ordered_json{
{"tid", ss_tid.str()},
{"timestamp", time(nullptr)},
};
if (server_log_json) {
log.merge_patch(
{
{"level", level},
{"function", function},
{"line", line},
{"msg", message},
});
if (!extra.empty()) {
log.merge_patch(extra);
}
std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
} else {
char buf[1024];
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
if (!extra.empty()) {
log.merge_patch(extra);
}
std::stringstream ss;
ss << buf << " |";
for (const auto& el : log.items())
{
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
ss << " " << el.key() << "=" << value;
}
const std::string str = ss.str();
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
}
//
// server utils
//
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value) {
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
inline bool verify_custom_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
return res >= 0;
}
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
// vector holding all allocated string to be passed to llama_chat_apply_template
std::vector<std::string> str(messages.size() * 2);
std::vector<llama_chat_message> chat(messages.size());
for (size_t i = 0; i < messages.size(); ++i) {
auto &curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length();
chat[i].role = str[i*2 + 0].c_str();
chat[i].content = str[i*2 + 1].c_str();
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size * 2);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}
//
// work queue utils
//
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
bool running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
std::vector<task_multi> queue_multitasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
std::function<void(void)> callback_run_slots;
// Add a new task to the end of the queue
int post(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
LOG_VERBOSE("new task id", {{"new_id", task.id}});
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
return task.id;
}
// Add a new task, but defer until one slot is available
void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task));
}
// Get the next id for creating anew task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
int new_id = id++;
LOG_VERBOSE("new task id", {{"new_id", new_id}});
return new_id;
}
// Register function to process a new task
void on_new_task(std::function<void(task_server&)> callback) {
callback_new_task = callback;
}
// Register function to process a multitask when it is finished
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
// Register the function to be called when all slots data is ready to be processed
void on_run_slots(std::function<void(void)> callback) {
callback_run_slots = callback;
}
// Call when the state of one slot is changed
void notify_slot_changed() {
// move deferred tasks back to main loop
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
}
// end the start_loop routine
void terminate() {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
running = false;
}
condition_tasks.notify_all();
}
/**
* Main loop consists of these steps:
* - Wait until a new task arrives
* - Process the task (i.e. maybe copy data into slot)
* - Check if multitask is finished
* - Run all slots
*/
void start_loop() {
running = true;
while (true) {
LOG_VERBOSE("new task may arrive", {});
{
while (true)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
lock.unlock();
break;
}
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
callback_new_task(task);
}
LOG_VERBOSE("update_multitasks", {});
// check if we have any finished multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_multi current_multitask = *queue_iterator;
callback_finish_multitask(current_multitask);
// remove this multitask
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
// all tasks in the current loop is processed, slots data is now ready
LOG_VERBOSE("callback_run_slots", {});
callback_run_slots();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
if (!running) {
LOG_VERBOSE("ending start_loop", {});
return;
}
condition_tasks.wait(lock, [&]{
return (!queue_tasks.empty() || !running);
});
}
}
}
}
//
// functions to manage multitasks
//
// add a multitask by specifying the id of all subtask (subtask is a task_server)
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = multitask_id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
// updatethe remaining subtasks, while appending results to multitask
void update_multitask(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
};
struct llama_server_response {
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
callback_multitask_t callback_update_multitask;
// for keeping track of all tasks waiting for the result
std::set<int> waiting_task_ids;
// the main result queue
std::vector<task_result> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
// add the task_id to the list of tasks waiting for response
void add_waiting_task_id(int task_id) {
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
// when the request is finished, we can remove task associated with it
void remove_waiting_task_id(int task_id) {
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
// This function blocks the thread until there is a response for this task_id
task_result recv(int task_id) {
while (true)
{
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
for (int i = 0; i < (int) queue_results.size(); i++)
{
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// Register the function to update multitask
void on_multitask_update(callback_multitask_t callback) {
callback_update_multitask = callback;
}
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {{"task_id", result.id}});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
queue_results.push_back(result);
condition_results.notify_all();
return;
}
}
}
};
//
// base64 utils (TODO: move to common in the future)
//
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(uint8_t c)
{
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
int in_ = 0;
int in_len = encoded_string.size();
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
{
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4)
{
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
ret.push_back(char_array_3[i]);
}
i = 0;
}
}
if (i)
{
for (j = i; j <4; j++)
{
char_array_4[j] = 0;
}
for (j = 0; j <4; j++)
{
char_array_4[j] = base64_chars.find(char_array_4[j]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++)
{
ret.push_back(char_array_3[j]);
}
}
return ret;
}
//
// random string / id
//
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::string result(32, ' ');
for (int i = 0; i < 32; ++i) {
result[i] = str[generator() % str.size()];
}
return result;
}
static std::string gen_chatcmplid()
{
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}
//
// other common utils
//
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
{
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
{
}
return i;
}
static bool ends_with(const std::string &str, const std::string &suffix)
{
return str.size() >= suffix.size() &&
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}
static size_t find_partial_stop_string(const std::string &stop,
const std::string &text)
{
if (!text.empty() && !stop.empty())
{
const char text_last_char = text.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
{
if (stop[char_index] == text_last_char)
{
const std::string current_partial = stop.substr(0, char_index + 1);
if (ends_with(text, current_partial))
{
return text.size() - char_index - 1;
}
}
}
}
return std::string::npos;
}
// TODO: reuse llama_detokenize
template <class Iter>
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
{
std::string ret;
for (; begin != end; ++begin)
{
ret += llama_token_to_piece(ctx, *begin);
}
return ret;
}
// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
// if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token)
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
{
std::stringstream ss;
ss << std::hex << (out[0] & 0xff);
std::string res(ss.str());
out = "byte: \\x" + res;
}
return out;
}
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
{
json out = json::array();
for (const auto &prob : probs)
{
json probs_for_token = json::array();
for (const auto &p : prob.probs)
{
std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
probs_for_token.push_back(json
{
{"tok_str", tok_str},
{"prob", p.prob},
});
}
std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json{
{"content", tok_str},
{"probs", probs_for_token},
});
}
return out;
}

View File

@@ -1,4 +1,4 @@
# common logic accross linux and darwin
# common logic across linux and darwin
init_vars() {
case "${GOARCH}" in
@@ -39,7 +39,7 @@ init_vars() {
*)
;;
esac
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
}
@@ -61,8 +61,8 @@ git_module_setup() {
apply_patches() {
# Wire up our CMakefile
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
if ! grep ollama ${LLAMACPP_DIR}/CMakeLists.txt; then
echo 'add_subdirectory(../ext_server ext_server) # ollama' >>${LLAMACPP_DIR}/CMakeLists.txt
fi
if [ -n "$(ls -A ../patches/*.diff)" ]; then
@@ -76,19 +76,16 @@ apply_patches() {
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
fi
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
}
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
ls ${BUILD_DIR}
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/ext_server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
@@ -113,7 +110,7 @@ compress_libs() {
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
(cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
if [ -n "$(ls -A ../patches/*.diff)" ]; then
for patch in ../patches/*.diff; do

View File

@@ -60,7 +60,7 @@ case "${GOARCH}" in
compress_libs
;;
"arm64")
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build

View File

@@ -26,6 +26,9 @@ amdGPUs() {
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx940"
"gfx941"
"gfx942"
"gfx1010"
"gfx1012"
"gfx1030"
@@ -87,30 +90,35 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
compress_libs
fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
if [ "${ARCH}" == "x86_64" ]; then
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
# ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
fi
fi
fi
else
@@ -139,12 +147,21 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
if [ "${ARCH}" == "arm64" ]; then
echo "ARM CPU detected - disabling unsupported AVX instructions"
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
#
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
# Disabling has minimal performance effect while maintaining compatibility.
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
# Cary the CUDA libs as payloads to help reduce dependency burden on users
# Carry the CUDA libs as payloads to help reduce dependency burden on users
#
# TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script.
@@ -179,17 +196,27 @@ fi
if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Note: the ROCM libs and runtime library files are too large to embed, so we depend on
# them being present at runtime on the host
# Record the ROCM dependencies
rm -f "${BUILD_DIR}/lib/deps.txt"
touch "${BUILD_DIR}/lib/deps.txt"
for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
done
# bomb out if for some reason we didn't get a few deps
if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/lib/deps.txt"
echo "ERROR: deps file short"
exit 1
fi
compress_libs
fi

View File

@@ -2,19 +2,49 @@
$ErrorActionPreference = "Stop"
function amdGPUs {
if ($env:AMDGPU_TARGETS) {
return $env:AMDGPU_TARGETS
}
# TODO - load from some common data file for linux + windows build consistency
$GPU_LIST = @(
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx940"
"gfx941"
"gfx942"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
"gfx1102"
)
$GPU_LIST -join ';'
}
function init_vars {
$script:SRC_DIR = $(resolve-path "..\..\")
$script:llamacppDir = "../llama.cpp"
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A", "x64")
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off"
)
$script:cmakeTargets = @("ext_server")
$script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
$script:config = "RelWithDebInfo"
} else {
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release")
$script:config = "Release"
}
if ($null -ne $env:CMAKE_SYSTEM_VERSION) {
$script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}")
}
# Try to find the CUDA dir
if ($env:CUDA_LIB_DIR -eq $null) {
$d=(get-command -ea 'silentlycontinue' nvcc).path
@@ -32,8 +62,12 @@ function init_vars {
} else {
$script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
}
# Note: 10 Windows Kit signtool crashes with GCP's plugin
${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
# Note: Windows Kits 10 signtool crashes with GCP's plugin
if ($null -eq $env:SIGN_TOOL) {
${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
} else {
${script:SignTool}=${env:SIGN_TOOL}
}
if ("${env:KEY_CONTAINER}") {
${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
}
@@ -49,8 +83,8 @@ function git_module_setup {
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
if (!(Select-String -Path "${script:llamacppDir}/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/CMakeLists.txt" -Value 'add_subdirectory(../ext_server ext_server) # ollama'
}
# Apply temporary patches until fix is upstream
@@ -74,11 +108,6 @@ function apply_patches {
Set-Location -Path ${script:llamacppDir}
git apply $patch.FullName
}
# Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main('
Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
}
function build {
@@ -106,7 +135,7 @@ function sign {
if ("${env:KEY_CONTAINER}") {
write-host "Signing ${script:buildDir}/lib/*.dll"
foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
& "${script:SignTool}" sign /v /debug /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
@@ -140,8 +169,8 @@ function cleanup {
git checkout $file
}
}
Set-Location "${script:llamacppDir}/examples/server"
git checkout CMakeLists.txt server.cpp
Set-Location "${script:llamacppDir}/"
git checkout CMakeLists.txt
}
@@ -156,32 +185,37 @@ apply_patches
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
sign
compress_libs
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
sign
compress_libs
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
sign
compress_libs
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
sign
compress_libs
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
sign
compress_libs
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
sign
compress_libs
} else {
write-host "Skipping CPU generation step as requested"
}
if ($null -ne $script:CUDA_LIB_DIR) {
# Then build cuda as a dynamically loaded library
@@ -192,18 +226,51 @@ if ($null -ne $script:CUDA_LIB_DIR) {
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
write-host "Building CUDA"
build
install
sign
compress_libs
}
# TODO - actually implement ROCm support on windows
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
echo $null >> "${script:buildDir}/lib/.generated"
if ($null -ne $env:HIP_PATH) {
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
if ($null -ne $script:ROCM_VERSION) {
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @(
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DLLAMA_HIPBLAS=on",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
"-DAMDGPU_TARGETS=$(amdGPUs)",
"-DGPU_TARGETS=$(amdGPUs)"
)
# Make sure the ROCm binary dir is first in the path
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
# We have to clobber the LIB var from the developer shell for clang to work properly
$env:LIB=""
write-host "Building ROCm"
build
# Ninja doesn't prefix with config name
${script:config}=""
install
if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
sign
compress_libs
}
cleanup
write-host "`ngo generate completed"
write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\llama.cpp\build\windows\${script:ARCH})"

162
llm/ggla.go Normal file
View File

@@ -0,0 +1,162 @@
package llm
import (
"encoding/binary"
"errors"
"io"
"slices"
)
type ContainerGGLA struct {
version uint32
}
func (c *ContainerGGLA) Name() string {
return "ggla"
}
func (c *ContainerGGLA) Decode(rs io.ReadSeeker) (model, error) {
binary.Read(rs, binary.LittleEndian, &c.version)
switch c.version {
case 1:
default:
return nil, errors.New("invalid version")
}
model := newModelGGLA(c)
err := model.decode(rs)
return model, err
}
type ModelGGLA struct {
*ContainerGGLA
kv KV
tensors []Tensor
}
func newModelGGLA(container *ContainerGGLA) *ModelGGLA {
return &ModelGGLA{
ContainerGGLA: container,
kv: make(KV),
}
}
func (m *ModelGGLA) decode(rs io.ReadSeeker) error {
var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err
}
m.kv["r"] = r
var alpha uint32
if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
return err
}
m.kv["alpha"] = alpha
for {
var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
return err
}
var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err
}
var t Tensor
if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {
return err
}
t.Shape = make([]uint64, dims)
for i := 0; uint32(i) < dims; i++ {
var shape32 uint32
if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil {
return err
}
t.Shape[i] = uint64(shape32)
}
// ggla tensor shape is reversed
// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
slices.Reverse(t.Shape)
name := make([]byte, namesize)
if err := binary.Read(rs, binary.LittleEndian, &name); err != nil {
return err
}
t.Name = string(name)
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
return err
}
offset, err = rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
t.Offset = uint64(offset)
if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
return err
}
m.tensors = append(m.tensors, t)
}
}
func (m *ModelGGLA) KV() KV {
return m.kv
}
func (m *ModelGGLA) Tensor() []Tensor {
return m.tensors
}
func (*ModelGGLA) ModelFamily() string {
return "ggla"
}
func (*ModelGGLA) ModelType() string {
panic("not implemented")
}
func (*ModelGGLA) FileType() string {
panic("not implemented")
}
func (*ModelGGLA) NumLayers() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumGQA() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumEmbed() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumHead() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumHeadKv() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumCtx() uint32 {
panic("not implemented")
}

View File

@@ -103,33 +103,7 @@ type model interface {
type container interface {
Name() string
Decode(*readSeekOffset) (model, error)
}
type containerLORA struct {
version uint32
}
func (c *containerLORA) Name() string {
return "ggla"
}
func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
var version uint32
binary.Read(rso, binary.LittleEndian, &version)
switch version {
case 1:
default:
return nil, errors.New("invalid version")
}
c.version = version
// remaining file contents aren't decoded
rso.Seek(0, io.SeekEnd)
return nil, nil
Decode(io.ReadSeeker) (model, error)
}
const (
@@ -148,11 +122,9 @@ const (
var ErrUnsupportedFormat = errors.New("unsupported model format")
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
ro := readSeekOffset{ReadSeeker: r}
func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
var magic uint32
if err := binary.Read(&ro, binary.LittleEndian, &magic); err != nil {
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, err
}
@@ -161,16 +133,23 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
return nil, ErrUnsupportedFormat
case FILE_MAGIC_GGLA:
c = &containerLORA{}
c = &ContainerGGLA{}
case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{bo: binary.LittleEndian}
c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{bo: binary.BigEndian}
c = &ContainerGGUF{ByteOrder: binary.BigEndian}
default:
return nil, errors.New("invalid file magic")
}
model, err := c.Decode(&ro)
model, err := c.Decode(rs)
if errors.Is(err, io.EOF) {
// noop
} else if err != nil {
return nil, err
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return nil, err
}
@@ -179,27 +158,6 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
return &GGML{
container: c,
model: model,
Size: ro.offset,
Size: offset,
}, nil
}
type readSeekOffset struct {
io.ReadSeeker
offset int64
}
func (rso *readSeekOffset) Seek(offset int64, whence int) (int64, error) {
offset, err := rso.ReadSeeker.Seek(offset, whence)
if err != nil {
return 0, err
}
rso.offset = offset
return offset, nil
}
func (rso *readSeekOffset) Read(p []byte) (int, error) {
n, err := rso.ReadSeeker.Read(p)
rso.offset += int64(n)
return n, err
}

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@ import (
"fmt"
"time"
"github.com/jmorganca/ollama/api"
"github.com/ollama/ollama/api"
)
const jsonGrammar = `

View File

@@ -6,9 +6,10 @@ import (
"log/slog"
"os"
"runtime"
"slices"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
)
type LLM interface {
@@ -19,7 +20,11 @@ type LLM interface {
Close()
}
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
var cpuOnlyFamilies = []string{
"mamba",
}
func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -48,13 +53,18 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
size := ggml.Size
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.NumGQA()) * kv / 6
// certain model architectures don't support gpu inference yet
if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
opts.NumGPU = 0
}
info := gpu.GetGPUInfo()
switch runtime.GOOS {
case "darwin":
@@ -63,9 +73,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
}
if size+kv+graph > vram {
slog.Info("not enough vram available, falling back to CPU only")
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
slog.Info("not enough vram available, setting num_gpu=0")
opts.NumGPU = 0
break
}
@@ -120,15 +128,15 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
return newLlmServer(info, workDir, model, adapters, projectors, opts)
return newLlmServer(info, model, adapters, projectors, opts)
}
// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
return nativeInit(workdir)
func Init() error {
return nativeInit()
}
func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
@@ -147,7 +155,7 @@ func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projecto
_, err := os.Stat(dynLibs[0])
if err != nil {
slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
err = nativeInit(workDir)
err = nativeInit()
if err != nil {
return nil, err
}

View File

@@ -1,21 +0,0 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2b2f4a0f..afac49af 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -997,13 +997,15 @@ struct llama_server_context
slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache
}
- slot.add_token_string(result);
+
if (slot.params.stream)
{
send_partial_response(slot, result);
}
}
+ slot.add_token_string(result);
+
if (incomplete)
{
slot.has_next_token = true;

View File

@@ -1,114 +0,0 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2b2f4a0f..25857bdd 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -31,6 +31,10 @@
#include <atomic>
#include <signal.h>
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
using json = nlohmann::json;
struct server_params {
@@ -363,6 +367,9 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
}
bool load_model(const gpt_params &params_)
@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 0c6501e9..75c12723 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -43,6 +43,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
-GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
+static bool g_cublas_initialized = false;
- if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+ if (!g_cublas_initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = true;
}
}
@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ if (g_device_caps[id].vmm) {
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+ }
+#endif
+ // TODO: free legacy non-vmm memory
+ // destroy cublas handle
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+
+ g_cublas_initialized = false;
+}
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..6dd58ddf 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -23,6 +23,9 @@ GGML_API GGML_CALL void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);

View File

@@ -0,0 +1,44 @@
diff --git a/llama.cpp b/llama.cpp
index 4225f955..7b762f86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
- return -1;
+ throw;
}
return 0;
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
};
}
- int status = llama_model_load(path_model, *model, params);
- GGML_ASSERT(status <= 0);
- if (status < 0) {
- if (status == -1) {
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
- } else if (status == -2) {
- LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ try {
+ int status = llama_model_load(path_model, *model, params);
+ GGML_ASSERT(status <= 0);
+ if (status < 0) {
+ if (status == -1) {
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+ } else if (status == -2) {
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ }
+ delete model;
+ return nullptr;
}
+ } catch (...) {
+ LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
delete model;
- return nullptr;
+ throw;
}
return model;

View File

@@ -0,0 +1,13 @@
diff --git a/llama.cpp b/llama.cpp
index b27aa272..99372f9c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
}
uint32_t to_lower(uint32_t code) {
- static const std::locale locale("en_US.UTF-8");
+ static const std::locale locale("");
#if defined(_WIN32)
if (code > 0xFFFF) {
return code;

View File

@@ -11,11 +11,12 @@ import (
"path/filepath"
"runtime"
"strings"
"sync"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/jmorganca/ollama/gpu"
"github.com/ollama/ollama/gpu"
)
// Libraries names may contain an optional variant separated by '_'
@@ -103,24 +104,17 @@ func rocmDynLibPresent() bool {
return false
}
func nativeInit(workdir string) error {
slog.Info("Extracting dynamic libraries...")
if runtime.GOOS == "darwin" {
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
if err != nil {
if err == payloadMissing {
// TODO perhaps consider this a hard failure on arm macs?
slog.Info("ggml-meta.metal payload missing")
return nil
}
return err
}
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
func nativeInit() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*")
slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if err == payloadMissing {
if errors.Is(err, payloadMissing) {
slog.Info(fmt.Sprintf("%s", payloadMissing))
return nil
}
@@ -149,18 +143,15 @@ func nativeInit(workdir string) error {
return nil
}
func extractDynamicLibs(workDir, glob string) ([]string, error) {
func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
libs := []string{}
// TODO consider making this idempotent with some sort of persistent directory (where we store models probably)
// and tracking by version so we don't reexpand the files every time
// Also maybe consider lazy loading only what is needed
g := new(errgroup.Group)
var mu sync.Mutex
var libs []string
var g errgroup.Group
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
@@ -172,14 +163,14 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
}
src := io.Reader(srcFile)
filename := file
@@ -193,71 +184,29 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
destFile := filepath.Join(targetDir, filepath.Base(filename))
if strings.Contains(destFile, "server") {
mu.Lock()
libs = append(libs, destFile)
mu.Unlock()
}
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFp.Close()
if _, err := io.Copy(destFp, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
return nil
})
}
return libs, g.Wait()
}
func extractPayloadFiles(workDir, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return payloadMissing
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return nil, err
}
for _, file := range files {
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(workDir, filepath.Base(filename))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
}
}
return nil
return libs, nil
}
func verifyDriverAccess() error {

View File

@@ -4,5 +4,5 @@ import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS

View File

@@ -4,5 +4,5 @@ import (
"embed"
)
//go:embed llama.cpp/build/linux/*/*/lib/*.so*
//go:embed llama.cpp/build/linux/*/*/lib/*
var libEmbed embed.FS

View File

@@ -3,7 +3,7 @@ package llm
import (
"testing"
"github.com/jmorganca/ollama/gpu"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
)

View File

@@ -3,7 +3,7 @@ package main
import (
"context"
"github.com/jmorganca/ollama/cmd"
"github.com/ollama/ollama/cmd"
"github.com/spf13/cobra"
)

View File

@@ -11,7 +11,7 @@ import (
"time"
"github.com/gin-gonic/gin"
"github.com/jmorganca/ollama/api"
"github.com/ollama/ollama/api"
)
type Error struct {

View File

@@ -6,7 +6,7 @@ import (
"strings"
"time"
"github.com/jmorganca/ollama/format"
"github.com/ollama/ollama/format"
"golang.org/x/term"
)

View File

@@ -19,10 +19,9 @@ type Buffer struct {
func NewBuffer(prompt *Prompt) (*Buffer, error) {
fd := int(os.Stdout.Fd())
width, height, err := term.GetSize(fd)
if err != nil {
fmt.Println("Error getting size:", err)
return nil, err
width, height := 80, 24
if termWidth, termHeight, err := term.GetSize(fd); err == nil {
width, height = termWidth, termHeight
}
lwidth := width - len(prompt.prompt())

Some files were not shown because too many files have changed in this diff Show More