Compare commits

...

194 Commits

Author SHA1 Message Date
Roy Han
1071e17626 lint 2024-06-28 13:14:49 -07:00
Roy Han
c9fd7a730a changes 2024-06-28 13:10:39 -07:00
Roy Han
01ecaf95fe deprecate 2024-06-28 13:07:44 -07:00
royjhan
b910fa9010 Ollama Show: Check for Projector Type (#5307)
* Check exists projtype

* Maintain Ordering
2024-06-28 11:30:16 -07:00
royjhan
6d4219083c Update docs (#5312) 2024-06-28 09:58:14 -07:00
Michael Yang
1ed4f521c4 Merge pull request #5340 from ollama/mxyng/mem
gemma2 graph
2024-06-27 14:26:49 -07:00
Michael Yang
de2163dafd gemma2 graph 2024-06-27 13:34:52 -07:00
Michael
2cc7d05012 update readme for gemma 2 (#5333)
* update readme for gemma 2
2024-06-27 12:45:16 -04:00
Michael Yang
123a722a6f zip: prevent extracting files into parent dirs (#5314) 2024-06-26 21:38:21 -07:00
Jeffrey Morgan
4d311eb731 llm: architecture patch (#5316) 2024-06-26 21:38:12 -07:00
Blake Mizerany
cb42e607c5 llm: speed up gguf decoding by a lot (#5246)
Previously, some costly things were causing the loading of GGUF files
and their metadata and tensor information to be VERY slow:

  * Too many allocations when decoding strings
  * Hitting disk for each read of each key and value, resulting in a
    not-okay amount of syscalls/disk I/O.

The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro
m3.

This commit also prevents collecting large arrays of values when
decoding GGUFs (if desired). When such keys are encountered, their
values are null, and are encoded as such in JSON.

Also, this fixes a broken test that was not encoding valid GGUF.
2024-06-24 21:47:52 -07:00
Blake Mizerany
2aa91a937b cmd: defer stating model info until necessary (#5248)
This commit changes the 'ollama run' command to defer fetching model
information until it really needs it. That is, when in interactive mode.

It also removes one such case where the model information is fetch in
duplicate, just before calling generateInteractive and then again, first
thing, in generateInteractive.

This positively impacts the performance of the command:

    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.168 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.220 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.217 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 4% cpu 0.652 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 5% cpu 0.498 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 3% cpu 0.479 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
2024-06-24 20:14:03 -07:00
Daniel Hiltgen
ccef9431c8 Merge pull request #5205 from dhiltgen/modelfile_use_mmap
Fix use_mmap parsing for modelfiles
2024-06-21 16:30:36 -07:00
royjhan
9a9e7d83c4 Docs (#5149) 2024-06-21 15:52:09 -07:00
Michael Yang
189a43caa2 Merge pull request #5206 from ollama/mxyng/quantize
fix: quantization with template
2024-06-21 13:44:34 -07:00
Michael Yang
e835ef1836 fix: quantization with template 2024-06-21 13:39:25 -07:00
Daniel Hiltgen
7e7749224c Fix use_mmap parsing for modelfiles
Add the new tristate parsing logic for the code path for modelfiles,
as well as a unit test.
2024-06-21 12:27:19 -07:00
Daniel Hiltgen
c7c2f3bc22 Merge pull request #5194 from dhiltgen/linux_mmap_auto
Refine mmap default logic on linux
2024-06-20 11:44:08 -07:00
Daniel Hiltgen
54a79d6a8a Merge pull request #5125 from dhiltgen/fedora39
Bump latest fedora cuda repo to 39
2024-06-20 11:27:24 -07:00
Daniel Hiltgen
5bf5aeec01 Refine mmap default logic on linux
If we try to use mmap when the model is larger than the system free space, loading is slower than the no-mmap approach.
2024-06-20 11:07:04 -07:00
Michael Yang
e01e535cbb Merge pull request #5192 from ollama/mxyng/kv
handle asymmetric embedding KVs
2024-06-20 10:46:24 -07:00
Josh
0195d6a2f8 Merge pull request #5188 from ollama/jyan/tmpdir2
fix: skip os.removeAll() if PID does not exist
2024-06-20 10:40:59 -07:00
Michael Yang
8e0641a9bf handle asymmetric embedding KVs 2024-06-20 09:57:27 -07:00
Josh Yan
662568d453 err!=nil check 2024-06-20 09:30:59 -07:00
Josh Yan
4ebb66c662 reformat error check 2024-06-20 09:23:43 -07:00
Josh Yan
23e899f32d skip os.removeAll() if PID does not exist 2024-06-20 08:51:35 -07:00
royjhan
fedf71635e Extend api/show and ollama show to return more model info (#4881)
* API Show Extended

* Initial Draft of Information

Co-Authored-By: Patrick Devine <pdevine@sonic.net>

* Clean Up

* Descriptive arg error messages and other fixes

* Second Draft of Show with Projectors Included

* Remove Chat Template

* Touches

* Prevent wrapping from files

* Verbose functionality

* Docs

* Address Feedback

* Lint

* Resolve Conflicts

* Function Name

* Tests for api/show model info

* Show Test File

* Add Projector Test

* Clean routes

* Projector Check

* Move Show Test

* Touches

* Doc update

---------

Co-authored-by: Patrick Devine <pdevine@sonic.net>
2024-06-19 14:19:02 -07:00
Daniel Hiltgen
97c59be653 Merge pull request #5074 from dhiltgen/app_log_rotation
Implement log rotation for tray app
2024-06-19 13:02:24 -07:00
Daniel Hiltgen
9d8a4988e8 Implement log rotation for tray app 2024-06-19 12:53:34 -07:00
Michael Yang
1ae0750a21 Merge pull request #5147 from ollama/mxyng/cleanup
remove confusing log message
2024-06-19 12:50:31 -07:00
Michael Yang
9d91e5e587 remove confusing log message 2024-06-19 11:14:11 -07:00
Daniel Hiltgen
96624aa412 Merge pull request #5072 from dhiltgen/windows_path
Move libraries out of users path
2024-06-19 09:13:39 -07:00
Daniel Hiltgen
10f33b8537 Merge pull request #5146 from dhiltgen/backout
Put back temporary intel GPU env var
2024-06-19 09:12:45 -07:00
Daniel Hiltgen
4a633cc295 Merge pull request #5145 from dhiltgen/bad_loads
Fix bad symbol load detection
2024-06-19 09:12:33 -07:00
Daniel Hiltgen
d34d88e417 Revert "Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)""
This reverts commit 755b4e4fc2.
2024-06-19 08:57:41 -07:00
Daniel Hiltgen
52ce350b7a Fix bad symbol load detection
pointer deref's weren't correct on a few libraries, which explains
some crashes on older systems or miswired symlinks for discovery libraries.
2024-06-19 08:39:07 -07:00
Daniel Hiltgen
2abebb2cbe Merge pull request #5128 from zhewang1-intc/fix_levelzero_empty_symbol_detect
Fix levelzero empty symbol detect
2024-06-19 08:33:16 -07:00
Blake Mizerany
380e06e5be types/model: remove Digest
The Digest type in its current form is awkward to work with and presents
challenges with regard to how it serializes via String using the '-'
prefix.

We currently only use this in ollama.com, so we'll move our specific
needs around digest parsing and validation there.
2024-06-18 20:28:11 -07:00
Wang,Zhe
badf975e45 get real func ptr. 2024-06-19 09:00:51 +08:00
Wang,Zhe
755b4e4fc2 Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)"
This reverts commit 163cd3e77c.
2024-06-19 08:59:58 +08:00
Daniel Hiltgen
1a1c99e334 Bump latest fedora cuda repo to 39 2024-06-18 17:13:54 -07:00
Michael Yang
21adf8b6d2 Merge pull request #5121 from ollama/mxyng/deepseekv2
deepseek v2 graph
2024-06-18 16:30:58 -07:00
Michael Yang
e873841cbb deepseek v2 graph 2024-06-18 15:35:12 -07:00
Daniel Hiltgen
26d0bf9236 Merge pull request #5117 from dhiltgen/fix_prediction
Handle models with divergent layer sizes
2024-06-18 11:36:51 -07:00
Daniel Hiltgen
359b15a597 Handle models with divergent layer sizes
The recent refactoring of the memory prediction assumed all layers
are the same size, but for some models (like deepseek-coder-v2) this
is not the case, so our predictions were significantly off.
2024-06-18 11:05:34 -07:00
Daniel Hiltgen
b55958a587 Merge pull request #5106 from dhiltgen/clean_logs
Tighten up memory prediction logging
2024-06-18 09:24:38 -07:00
Daniel Hiltgen
7784ca33ce Tighten up memory prediction logging
Prior to this change, we logged the memory prediction multiple times
as the scheduler iterates to find a suitable configuration, which can be
confusing since only the last log before the server starts is actually valid.
This now logs once just before starting the server on the final configuration.
It also reports what library instead of always saying "offloading to gpu" when
using CPU.
2024-06-18 09:15:35 -07:00
Daniel Hiltgen
c9c8c98bf6 Merge pull request #5105 from dhiltgen/cuda_mmap
Adjust mmap logic for cuda windows for faster model load
2024-06-17 17:07:30 -07:00
Daniel Hiltgen
171796791f Adjust mmap logic for cuda windows for faster model load
On Windows, recent llama.cpp changes make mmap slower in most
cases, so default to off.  This also implements a tri-state for
use_mmap so we can detect the difference between a user provided
value of true/false, or unspecified.
2024-06-17 16:54:30 -07:00
Jeffrey Morgan
176d0f7075 Update import.md 2024-06-17 19:44:14 -04:00
Daniel Hiltgen
8ed51cac37 Merge pull request #5103 from dhiltgen/faster_win_build
Revert powershell jobs, but keep nvcc and cmake parallelism
2024-06-17 14:23:18 -07:00
Daniel Hiltgen
c9e6f0542d Merge pull request #5069 from dhiltgen/ci_release
Implement custom github release action
2024-06-17 13:59:37 -07:00
Daniel Hiltgen
b0930626c5 Add back lower level parallel flags
nvcc supports parallelism (threads) and cmake + make can use -j,
while msbuild requires /p:CL_MPcount=8
2024-06-17 13:44:46 -07:00
Daniel Hiltgen
e890be4814 Revert "More parallelism on windows generate"
This reverts commit 0577af98f4.
2024-06-17 13:32:46 -07:00
Daniel Hiltgen
b2799f111b Move libraries out of users path
We update the PATH on windows to get the CLI mapped, but this has
an unintended side effect of causing other apps that may use our bundled
DLLs to get terminated when we upgrade.
2024-06-17 13:12:18 -07:00
Jeffrey Morgan
152fc202f5 llm: update llama.cpp commit to 7c26775 (#4896)
* llm: update llama.cpp submodule to `7c26775`

* disable `LLAMA_BLAS` for now

* `-DLLAMA_OPENMP=off`
2024-06-17 15:56:16 -04:00
Lei Jitang
4ad0d4d6d3 Fix a build warning (#5096)
Signed-off-by: Lei Jitang <leijitang@outlook.com>
2024-06-17 14:47:48 -04:00
Jeffrey Morgan
163cd3e77c gpu: add env var for detecting Intel oneapi gpus (#5076)
* gpu: add env var for detecting intel oneapi gpus

* fix build error
2024-06-16 20:09:05 -04:00
Daniel Hiltgen
4c2c8f93dd Merge pull request #5080 from dhiltgen/debug_intel_crash
Add some more debugging logs for intel discovery
2024-06-16 14:42:41 -07:00
Daniel Hiltgen
fd1e6e0590 Add some more debugging logs for intel discovery
Also removes an unused overall count variable
2024-06-16 07:42:52 -07:00
royjhan
89c79bec8c Add ModifiedAt Field to /api/show (#5033)
* Add Mod Time to Show

* Error Handling
2024-06-15 20:53:56 -07:00
Jeffrey Morgan
c7b77004e3 docs: add missing powershell package to windows development instructions (#5075)
* docs: add missing instruction for powershell build

The powershell script for building Ollama on Windows now requires the `ThreadJob` module. Add this to the instructions and dependency list.

* Update development.md
2024-06-15 23:08:09 -04:00
Daniel Hiltgen
07d143f412 Merge pull request #5058 from coolljt0725/fix_build_warning
gpu: Fix build warning
2024-06-15 11:52:36 -07:00
Daniel Hiltgen
a12283e2ff Implement custom github release action
This implements the release logic we want via gh cli
to support updating releases with rc tags in place and retain
release notes and other community reactions.
2024-06-15 11:36:56 -07:00
Daniel Hiltgen
4b0050cf0e Merge pull request #5037 from dhiltgen/faster_win_build
More parallelism on windows generate
2024-06-15 08:03:05 -07:00
Daniel Hiltgen
0577af98f4 More parallelism on windows generate
Make the build faster
2024-06-15 07:44:55 -07:00
Daniel Hiltgen
17ce203a26 Merge pull request #4875 from dhiltgen/rocm_gfx900_workaround
Rocm gfx900 workaround
2024-06-15 07:38:58 -07:00
Daniel Hiltgen
d76555ffb5 Merge pull request #4874 from dhiltgen/rocm_v6_bump
Rocm v6 bump
2024-06-15 07:38:32 -07:00
Daniel Hiltgen
2786dff5d3 Merge pull request #4264 from dhiltgen/show_gpu_visible_settings
Centralize GPU configuration vars
2024-06-15 07:33:52 -07:00
Lei Jitang
225f0d1219 gpu: Fix build warning
Signed-off-by: Lei Jitang <leijitang@outlook.com>
2024-06-15 14:26:23 +08:00
Daniel Hiltgen
532db58311 Merge pull request #4972 from jayson-cloude/main
fix: "Skip searching for network devices"
2024-06-14 17:04:40 -07:00
Daniel Hiltgen
6be309e1bd Centralize GPU configuration vars
This should aid in troubleshooting by capturing and reporting the GPU
settings at startup in the logs along with all the other server settings.
2024-06-14 15:59:10 -07:00
Daniel Hiltgen
da3bf23354 Workaround gfx900 SDMA bugs
Implement support for GPU env var workarounds, and leverage
this for the Vega RX 56 which needs
HSA_ENABLE_SDMA=0 set to work properly
2024-06-14 15:38:13 -07:00
Daniel Hiltgen
26ab67732b Bump ROCm linux to 6.1.1 2024-06-14 15:37:54 -07:00
Daniel Hiltgen
45cacbaf05 Merge pull request #4517 from dhiltgen/gpu_incremental
Enhanced GPU discovery and multi-gpu support with concurrency
2024-06-14 15:35:00 -07:00
Daniel Hiltgen
17df6520c8 Remove mmap related output calc logic 2024-06-14 14:55:50 -07:00
Daniel Hiltgen
6f351bf586 review comments and coverage 2024-06-14 14:55:50 -07:00
Daniel Hiltgen
ff4f0cbd1d Prevent multiple concurrent loads on the same gpus
While models are loading, the VRAM metrics are dynamic, so try
to load on a GPU that doesn't have a model actively loading, or wait
to avoid races that lead to OOMs
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
fc37c192ae Refine CPU load behavior with system memory visibility 2024-06-14 14:51:40 -07:00
Daniel Hiltgen
434dfe30c5 Reintroduce nvidia nvml library for windows
This library will give us the most reliable free VRAM reporting on windows
to enable concurrent model scheduling.
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
4e2b7e181d Refactor intel gpu discovery 2024-06-14 14:51:40 -07:00
Daniel Hiltgen
48702dd149 Harden unload for empty runners 2024-06-14 14:51:40 -07:00
Daniel Hiltgen
68dfc6236a refined test timing
adjust timing on some tests so they don't timeout on small/slow GPUs
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
5e8ff556cb Support forced spreading for multi GPU
Our default behavior today is to try to fit into a single GPU if possible.
Some users would prefer the old behavior of always spreading across
multiple GPUs even if the model can fit into one.  This exposes that
tunable behavior.
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
6fd04ca922 Improve multi-gpu handling at the limit
Still not complete, needs some refinement to our prediction to understand the
discrete GPUs available space so we can see how many layers fit in each one
since we can't split one layer across multiple GPUs we can't treat free space
as one logical block
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
206797bda4 Fix concurrency integration test to work locally
This worked remotely but wound up trying to spawn multiple servers
locally which doesn't work
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
43ed358f9a Refine GPU discovery to bootstrap once
Now that we call the GPU discovery routines many times to
update memory, this splits initial discovery from free memory
updating.
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
b32ebb4f29 Use DRM driver for VRAM info for amd
The amdgpu drivers free VRAM reporting omits some other apps, so leverage the
upstream DRM driver which keeps better tabs on things
2024-06-14 14:51:40 -07:00
Daniel Hiltgen
fb9cdfa723 Fix server.cpp for the new cuda build macros 2024-06-14 14:51:40 -07:00
Daniel Hiltgen
efac488675 Revert "Limit GPU lib search for now (#4777)"
This reverts commit 476fb8e892.
2024-06-14 14:51:40 -07:00
Jeffrey Morgan
6b800aa7b7 openai: do not set temperature to 0 when setting seed (#5045) 2024-06-14 13:43:56 -07:00
Jeffrey Morgan
dd7c9ebeaf server: longer timeout in TestRequests (#5046) 2024-06-14 09:48:25 -07:00
Patrick Devine
4dc7fb9525 update 40xx gpu compat matrix (#5036) 2024-06-13 17:10:33 -07:00
Daniel Hiltgen
c39761c552 Merge pull request #5032 from dhiltgen/actually_skip
Actually skip PhysX on windows
2024-06-13 13:26:09 -07:00
Daniel Hiltgen
aac367636d Actually skip PhysX on windows 2024-06-13 13:17:19 -07:00
Michael Yang
15a687ae4b Merge pull request #5031 from ollama/mxyng/fix-multibyte-utf16
fix: multibyte utf16
2024-06-13 13:14:55 -07:00
Michael Yang
d528e1af75 fix utf16 for multibyte runes 2024-06-13 13:07:42 -07:00
Michael Yang
cd234ce22c parser: add test for multibyte runes 2024-06-13 13:07:42 -07:00
Patrick Devine
94618b2365 add OLLAMA_MODELS to envconfig (#5029) 2024-06-13 12:52:03 -07:00
Jeffrey Morgan
1fd236d177 server: remove jwt decoding error (#5027) 2024-06-13 11:21:15 -07:00
Michael Yang
e87fc7200d Merge pull request #5025 from ollama/mxyng/revert-parser-scan
Revert "proper utf16 support"
2024-06-13 10:31:25 -07:00
Michael Yang
20b9f8e6f4 Revert "proper utf16 support"
This reverts commit 66ab48772f.

this change broke utf-8 scanning of multi-byte runes
2024-06-13 10:22:16 -07:00
Patrick Devine
c69bc19e46 move OLLAMA_HOST to envconfig (#5009) 2024-06-12 18:48:16 -04:00
Michael Yang
bba5d177aa Merge pull request #5004 from ollama/mxyng/fix-templates
fix: multiple templates when creating from model
2024-06-12 14:39:29 -07:00
Michael Yang
c16f8af911 fix: multiple templates when creating from model
multiple templates may appear in a model if a model is created from
another model that 1) has an autodetected template and 2) defines a
custom template
2024-06-12 13:35:49 -07:00
Michael Yang
217f60c3d9 Merge pull request #4987 from ollama/mxyng/revert-byte-order
Revert "Merge pull request #4938 from ollama/mxyng/fix-byte-order"
2024-06-11 16:04:20 -07:00
Michael Yang
7bdcd1da94 Revert "Merge pull request #4938 from ollama/mxyng/fix-byte-order"
This reverts commit f5f245cc15, reversing
changes made to 94d37fdcae.

this change broke gguf v2 which is incorrectly detected as big endian
2024-06-11 15:56:17 -07:00
Jeffrey Morgan
ead259d877 llm: fix seed value not being applied to requests (#4986) 2024-06-11 14:24:41 -07:00
James Montgomery
2ff45d571d Add Ollama-hpp to Community Libraries in README. (#4983) 2024-06-11 11:15:05 -07:00
jayson-cloude
157f09acdf fix: "Skip searching for network devices"
On an Ubuntu 24.04 computer with vmware installed, the sudo lshw command will get stuck. "Network interfaces" is always displayed
2024-06-11 16:11:35 +08:00
Michael Yang
0f3cf1d42e Merge pull request #4715 from ollama/mxyng/utf16-parser
proper utf16 support
2024-06-10 11:41:29 -07:00
Michael Yang
5bc029c529 Merge pull request #4921 from ollama/mxyng/import-md
update import.md
2024-06-10 11:41:09 -07:00
Michael Yang
e9a9c6a8e8 Merge pull request #4965 from ollama/mxyng/skip-layer-remove
fix: skip removing layers that no longer exist
2024-06-10 11:40:03 -07:00
Michael Yang
515f497e6d fix: skip removing layers that no longer exist 2024-06-10 11:32:19 -07:00
Michael Yang
b27268aaef add test 2024-06-10 11:32:15 -07:00
Michael Yang
f5f245cc15 Merge pull request #4938 from ollama/mxyng/fix-byte-order
fix parsing big endian gguf
2024-06-10 09:38:12 -07:00
Jim Scardelis
94d37fdcae fix: examples/langchain-python-rag-privategpt/requirements.txt (#3382) 2024-06-09 10:58:09 -07:00
Craig Hughes
b84aea1685 Critical fix from llama.cpp JSON grammar to forbid un-escaped escape characters inside strings, which breaks parsing. (#3782) 2024-06-09 10:57:09 -07:00
Napuh
896495de7b Add instructions to easily install specific versions on faq.md (#4084)
* Added instructions to easily install specific versions on faq.md

* Small typo

* Moved instructions on how to install specific version to linux.md

* Update docs/linux.md

* Update docs/linux.md

---------

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-06-09 10:49:03 -07:00
dcasota
5528dd9d11 Error handling load_single_document() in ingest.py (#4852)
load_single_document() handles
- corrupt files
- empty (zero byte) files
- unsupported file extensions
2024-06-09 10:41:07 -07:00
Jeffrey Morgan
943172cbf4 Update api.md 2024-06-08 23:04:32 -07:00
Nischal Jain
85169e8d6f Added headless-ollama (#4612) 2024-06-08 18:51:16 -07:00
Jeffrey Morgan
34f142797a llm: always add bos token to prompt (#4941)
* fix embedding by adding fixes from llama.cpp upstream

* remove assert

---------

Co-authored-by: Jesper Ek <deadbeef84@gmail.com>
2024-06-08 18:47:10 -07:00
Erhan
46a7f1e74a Update README.md with LangChainRust (#4854) 2024-06-08 17:29:36 -07:00
Michael Yang
620d5c569e fix parsing big endian gguf 2024-06-08 12:35:26 -07:00
Michael Yang
b9ce7bf75e update import.md 2024-06-07 16:45:15 -07:00
Daniel Hiltgen
cddc63381c Merge pull request #4909 from dhiltgen/oneapi_disable
Add ability to skip oneapi generate
2024-06-07 14:07:15 -07:00
Michael Yang
385a32ecb5 Merge pull request #4910 from ollama/mxyng/detect-chat-template
fix create model when template detection errors
2024-06-07 11:07:39 -07:00
Michael Yang
030e765e76 fix create model when template detection errors 2024-06-07 10:51:35 -07:00
Daniel Hiltgen
ab8c929e20 Add ability to skip oneapi generate
This follows the same pattern for cuda and rocm to allow
disabling the build even when we detect the dependent libraries
2024-06-07 08:32:49 -07:00
Jeffrey Morgan
ce0dc33cb8 llm: patch to fix qwen 2 temporarily on nvidia (#4897) 2024-06-06 23:14:33 -07:00
Michael Yang
78f81fc0e5 Merge pull request #4800 from ollama/mxyng/detect-chat-template
detect chat template from KV
2024-06-06 16:17:18 -07:00
Michael Yang
9b6c2e6eb6 detect chat template from KV 2024-06-06 16:03:47 -07:00
royjhan
1a29e9a879 API app/browser access (#4879)
* API app/browser access

* Add tauri (resolves #2291, #4791, #3799, #4388)
2024-06-06 15:19:03 -07:00
royjhan
4bf1da4944 Separate ListResponse and ModelResponse for api/tags vs api/ps (#4842)
* Remove false time fields

* Struct Separation for List and Process

* Remove Marshaler
2024-06-06 10:11:45 -07:00
Blake Mizerany
de5beb06b3 server: skip blob verification for already verified blobs 2024-06-05 16:39:11 -07:00
Sam
98e65929dc docs(tools): add gollama (#4829) 2024-06-05 14:13:39 -07:00
Michael Yang
66ab48772f proper utf16 support 2024-06-05 13:11:50 -07:00
Michael Yang
22fcf8f7de Merge pull request #3737 from ollama/mxyng/modelname-4
update create handler to use model.Name
2024-06-05 12:05:05 -07:00
royjhan
28c7813ac4 API PS Documentation (#4822)
* API PS Documentation
2024-06-05 11:06:53 -07:00
Kartikeya Mishra
1d8616d30f docs: update to add LLocal.in to web & desktop integrations (#4719) 2024-06-04 14:43:59 -07:00
Michael Yang
d61ef8b954 update create handler to use model.Name 2024-06-04 13:28:25 -07:00
Michael Yang
89d9900152 Merge pull request #4570 from ollama/mxyng/slices
lint some of the things
2024-06-04 13:27:05 -07:00
Michael
4a048715b6 local wording was confusing people
local wording was confusing people -- Ollama runs on cloud providers
2024-06-04 13:25:25 -07:00
Michael Yang
6297f85606 gofmt, goimports 2024-06-04 13:20:24 -07:00
Michael Yang
ed56428dd7 warn on intrange, usestdlibvars 2024-06-04 11:52:48 -07:00
Michael Yang
ad40b92b6a disable intrange 2024-06-04 11:35:30 -07:00
Michael Yang
8ce4032e72 more lint 2024-06-04 11:13:30 -07:00
Michael Yang
42660466f8 no usestdlibvars 2024-06-04 11:13:30 -07:00
Michael Yang
e919f6811f lint windows 2024-06-04 11:13:30 -07:00
Michael Yang
bf7edb0d5d lint linux 2024-06-04 11:13:30 -07:00
Michael Yang
f38353d6b9 stdin.fd 2024-06-04 11:13:30 -07:00
Michael Yang
201d853fdf nolintlint 2024-06-04 11:13:30 -07:00
Michael Yang
e40145a39d lint 2024-06-04 11:13:30 -07:00
Michael Yang
c895a7d13f some gocritic 2024-06-04 11:13:30 -07:00
Michael Yang
dad7a987ae nosprintfhostport 2024-06-04 11:13:30 -07:00
Michael Yang
8ffb51749f nolintlint 2024-06-04 11:13:30 -07:00
Michael Yang
55f6eba049 gofmt 2024-06-04 11:13:30 -07:00
Michael Yang
04f3c12bb7 replace x/exp/slices with slices 2024-06-04 11:13:30 -07:00
Shubham
60323e0805 add embed model command and fix question invoke (#4766)
* add embed model command and fix question invoke

* Update docs/tutorials/langchainpy.md

Co-authored-by: Kim Hallberg <hallberg.kim@gmail.com>

* Update docs/tutorials/langchainpy.md

---------

Co-authored-by: Kim Hallberg <hallberg.kim@gmail.com>
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-06-03 22:20:48 -07:00
Jeffrey Morgan
d4a86102fd update welcome prompt in windows to llama3 (#4779) 2024-06-01 21:05:51 -07:00
Jeffrey Morgan
476fb8e892 Limit GPU lib search for now (#4777)
* fix oneapi errors on windows 10
2024-06-01 19:24:33 -07:00
Michael Yang
829ff87bd1 revert tokenize ffi (#4761)
* Revert "use `int32_t` for call to tokenize (#4738)"

This reverts commit 763bb65dbb.

* Revert "vocab only"

This reverts commit bf54c845e9.

* Revert "use ffi for tokenizing/detokenizing"

This reverts commit 26a00a0410.
2024-05-31 18:54:21 -07:00
Josh
f6b622c4b3 Merge pull request #4733 from ollama/jyan/isvalidname
added IsValidNamespace function
2024-05-31 14:08:45 -07:00
Josh Yan
2e4da8eec2 added tests for IsValidNamespace 2024-05-31 11:48:07 -07:00
Jeffrey Morgan
763bb65dbb use int32_t for call to tokenize (#4738)
* use `int32_t` for call to tokenize

* variable naming

* cleanup

* fix crash
2024-05-30 21:43:30 -07:00
Jeffrey Morgan
7ca9605f54 speed up tests by only building static lib (#4740) 2024-05-30 21:43:15 -07:00
Michael Yang
eb2c443a79 Merge pull request #4736 from ollama/mxyng/vocab-only
vocab only for tokenize
2024-05-30 17:21:00 -07:00
Michael Yang
278e25ea44 Merge pull request #4737 from ollama/mxyng/less-generate
only generate on relevant changes
2024-05-30 17:17:50 -07:00
Jeffrey Morgan
a50a87a7b8 partial offloading: allow flash attention and disable mmap (#4734)
* partial offloading: allow flash attention and disable mmap

* allow mmap with num_gpu=0
2024-05-30 16:58:01 -07:00
Michael Yang
98085015d5 only generate on relevant changes 2024-05-30 16:54:11 -07:00
Michael Yang
bf54c845e9 vocab only 2024-05-30 16:49:28 -07:00
Josh Yan
c365f195a8 directly use isvalidpart 2024-05-30 16:40:04 -07:00
Josh
e91d0ef737 Merge pull request #4728 from ollama/jyan/japanese
fixed japanese characters deleted at end of line
2024-05-30 16:25:12 -07:00
Jeffrey Morgan
22f5c12ced Update llama.cpp submodule to 5921b8f0 (#4731)
* update llama.cpp submodule to `5921b8f089d3b7bda86aac5a66825df6a6c10603`

* add patch
2024-05-30 16:20:22 -07:00
Josh Yan
298c996e54 added IsValidNamespace function 2024-05-30 16:02:07 -07:00
Daniel Hiltgen
0fc0cfc6d2 Merge pull request #4594 from dhiltgen/doc_container_workarounds
Add isolated gpu test to troubleshooting
2024-05-30 13:10:54 -07:00
Josh Yan
914f68f021 replaced duplicate call with variable 2024-05-30 10:38:07 -07:00
Josh Yan
bd1d119ba9 fixed japanese characters deleted at end of line 2024-05-30 10:24:21 -07:00
Lei Jitang
a03be18189 Fix OLLAMA_LLM_LIBRARY with wrong map name and add more env vars to help message (#4663)
* envconfig/config.go: Fix wrong description of OLLAMA_LLM_LIBRARY

Signed-off-by: Lei Jitang <leijitang@outlook.com>

* serve: Add more env to help message of ollama serve

Add more enviroment variables to `ollama serve --help`
to let users know what can be configurated.

Signed-off-by: Lei Jitang <leijitang@outlook.com>

---------

Signed-off-by: Lei Jitang <leijitang@outlook.com>
2024-05-30 09:36:51 -07:00
Michael Yang
96bc232b43 Merge pull request #4413 from ollama/mxyng/name-check
check if name exists before create/pull/copy
2024-05-29 12:06:58 -07:00
Michael Yang
bca7b12284 Merge pull request #3718 from ollama/mxyng/modelname-3
update delete handler to use model.Name
2024-05-29 12:02:07 -07:00
Michael Yang
32cb1960c1 Merge pull request #4380 from ollama/mxyng/tokenize
use tokenize/detokenize
2024-05-29 12:00:59 -07:00
Michael Yang
de781b37c8 rm unused infill 2024-05-29 11:26:47 -07:00
Michael Yang
3e21799377 rm unused system prompt 2024-05-29 11:26:47 -07:00
Michael Yang
26a00a0410 use ffi for tokenizing/detokenizing 2024-05-29 11:26:47 -07:00
Daniel Hiltgen
f77713bf1f Add isolated gpu test to troubleshooting 2024-05-23 09:33:25 -07:00
Michael Yang
85a57006d1 check if name exists before create/pull/copy 2024-05-14 14:58:58 -07:00
Michael Yang
c5e892cb3e update tests 2024-05-14 14:56:31 -07:00
Michael Yang
81fb06f530 more resilient Manifests 2024-05-14 14:08:24 -07:00
Michael Yang
a385382ff5 filepath.Join 2024-05-14 14:08:24 -07:00
Michael Yang
b8772a353f remove DeleteModel 2024-05-14 14:08:24 -07:00
Michael Yang
c2714fcbfd routes: use Manifests for ListHandler 2024-05-14 14:08:24 -07:00
Michael Yang
a2fc933fed update delete handler to use model.Name 2024-05-14 14:08:24 -07:00
146 changed files with 5896 additions and 2202 deletions

View File

@@ -437,6 +437,7 @@ jobs:
env:
OLLAMA_SKIP_IMAGE_BUILD: '1'
PUSH: '1'
GH_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v4
- name: Set Version
@@ -460,15 +461,20 @@ jobs:
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
cat dist/sha256sum.txt
- uses: ncipollo/release-action@v1
with:
name: ${{ env.RELEASE_VERSION }}
allowUpdates: true
artifacts: 'dist/*'
draft: true
prerelease: true
omitBodyDuringUpdate: true
generateReleaseNotes: true
omitDraftDuringUpdate: true
omitPrereleaseDuringUpdate: true
replacesArtifacts: true
- name: Create or update Release
run: |
echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
if [ -n "$OLD_TAG" ]; then
echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
else
echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
gh release create ${GITHUB_REF_NAME} \
--title ${{ env.RELEASE_VERSION }} \
--draft \
--generate-notes \
--prerelease
fi
echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
gh release upload ${GITHUB_REF_NAME} dist/* --clobber

View File

@@ -34,13 +34,13 @@ jobs:
git diff-tree -r --no-commit-id --name-only \
$(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \
${{ github.event.pull_request.head.sha }} \
| xargs python3 -c "import sys; print(any([x.startswith('$1') for x in sys.argv[1:]]))"
| xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
}
{
echo GENERATE=$(changed llm/)
echo GENERATE_CUDA=$(changed llm/)
echo GENERATE_ROCM=$(changed llm/)
echo GENERATE=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
echo GENERATE_CUDA=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
echo GENERATE_ROCM=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
} >>$GITHUB_OUTPUT
generate:
@@ -124,7 +124,7 @@ jobs:
strategy:
matrix:
rocm-version:
- '6.0.2'
- '6.1.1'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
@@ -269,9 +269,9 @@ jobs:
mkdir -p llm/build/darwin/$ARCH/stub/bin
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
if: ${{ startsWith(matrix.os, 'macos-') }}
- uses: golangci/golangci-lint-action@v4
- uses: golangci/golangci-lint-action@v6
with:
args: --timeout 8m0s -v
args: --timeout 8m0s -v ${{ startsWith(matrix.os, 'windows-') && '' || '--disable gofmt --disable goimports' }}
test:
strategy:
matrix:
@@ -287,6 +287,8 @@ jobs:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
OLLAMA_CPU_TARGET: 'static'
OLLAMA_SKIP_CPU_GENERATE: '1'
OLLAMA_SKIP_METAL_GENERATE: '1'
steps:
- uses: actions/checkout@v4
with:

View File

@@ -9,9 +9,26 @@ linters:
- contextcheck
- exportloopref
- gocheckcompilerdirectives
# FIXME: for some reason this errors on windows
# conditionally enable this on linux/macos
# - gofmt
# - goimports
- intrange
- misspell
- nilerr
- nolintlint
- nosprintfhostport
- testifylint
- unconvert
- unused
- wastedassign
- whitespace
- usestdlibvars
severity:
default-severity: error
rules:
- linters:
- gofmt
- goimports
- intrange
- usestdlibvars
severity: info

View File

@@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
ARG ROCM_VERSION=6.0.2
ARG ROCM_VERSION=6.1.1
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code

View File

@@ -6,7 +6,7 @@
[![Discord](https://dcbadge.vercel.app/api/server/ollama?style=flat&compact=true)](https://discord.gg/ollama)
Get up and running with large language models locally.
Get up and running with large language models.
### macOS
@@ -53,8 +53,8 @@ Here are some example models that can be downloaded:
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
@@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
### Show model information
```
ollama show llama3
```
### List models on your computer
```
@@ -285,6 +291,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
### Terminal
@@ -307,6 +314,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ShellOracle](https://github.com/djcopley/ShellOracle)
- [tlm](https://github.com/yusufcanb/tlm)
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
- [gollama](https://github.com/sammcj/gollama)
### Database
@@ -324,11 +332,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
- [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
@@ -346,6 +356,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
- [LlamaScript](https://github.com/Project-Llama/llamascript)
### Mobile
- [Enchanted](https://github.com/AugustDev/enchanted)
@@ -378,7 +389,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
### Supported backends
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

View File

@@ -23,11 +23,9 @@ import (
"net"
"net/http"
"net/url"
"os"
"runtime"
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/version"
)
@@ -65,10 +63,7 @@ func checkError(resp *http.Response, body []byte) error {
// If the variable is not specified, a default ollama host and port will be
// used.
func ClientFromEnvironment() (*Client, error) {
ollamaHost, err := GetOllamaHost()
if err != nil {
return nil, err
}
ollamaHost := envconfig.Host
return &Client{
base: &url.URL{
@@ -79,52 +74,6 @@ func ClientFromEnvironment() (*Client, error) {
}, nil
}
type OllamaHost struct {
Scheme string
Host string
Port string
}
func GetOllamaHost() (OllamaHost, error) {
defaultPort := "11434"
hostVar := os.Getenv("OLLAMA_HOST")
hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
scheme, hostport, ok := strings.Cut(hostVar, "://")
switch {
case !ok:
scheme, hostport = "http", hostVar
case scheme == "http":
defaultPort = "80"
case scheme == "https":
defaultPort = "443"
}
// trim trailing slashes
hostport = strings.TrimRight(hostport, "/")
host, port, err := net.SplitHostPort(hostport)
if err != nil {
host, port = "127.0.0.1", defaultPort
if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil {
host = ip.String()
} else if hostport != "" {
host = hostport
}
}
if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
return OllamaHost{}, ErrInvalidHostPort
}
return OllamaHost{
Scheme: scheme,
Host: host,
Port: port,
}, nil
}
func NewClient(base *url.URL, http *http.Client) *Client {
return &Client{
base: base,
@@ -355,8 +304,8 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
}
// List running models.
func (c *Client) ListRunning(ctx context.Context) (*ListResponse, error) {
var lr ListResponse
func (c *Client) ListRunning(ctx context.Context) (*ProcessResponse, error) {
var lr ProcessResponse
if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil {
return nil, err
}

View File

@@ -1,11 +1,9 @@
package api
import (
"fmt"
"net"
"testing"
"github.com/stretchr/testify/assert"
"github.com/ollama/ollama/envconfig"
)
func TestClientFromEnvironment(t *testing.T) {
@@ -35,6 +33,7 @@ func TestClientFromEnvironment(t *testing.T) {
for k, v := range testCases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", v.value)
envconfig.LoadConfig()
client, err := ClientFromEnvironment()
if err != v.err {
@@ -46,40 +45,4 @@ func TestClientFromEnvironment(t *testing.T) {
}
})
}
hostTestCases := map[string]*testCase{
"empty": {value: "", expect: "127.0.0.1:11434"},
"only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"},
"only port": {value: ":1234", expect: ":1234"},
"address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
"hostname": {value: "example.com", expect: "example.com:11434"},
"hostname and port": {value: "example.com:1234", expect: "example.com:1234"},
"zero port": {value: ":0", expect: ":0"},
"too large port": {value: ":66000", err: ErrInvalidHostPort},
"too small port": {value: ":-1", err: ErrInvalidHostPort},
"ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"},
"ipv6 world open": {value: "[::]", expect: "[::]:11434"},
"ipv6 no brackets": {value: "::1", expect: "[::1]:11434"},
"ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"},
"extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
"extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
"extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
}
for k, v := range hostTestCases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", v.value)
oh, err := GetOllamaHost()
if err != v.err {
t.Fatalf("expected %s, got %s", v.err, err)
}
if err == nil {
host := net.JoinHostPort(oh.Host, oh.Port)
assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
}
})
}
}

View File

@@ -2,7 +2,6 @@ package api
import (
"encoding/json"
"errors"
"fmt"
"log/slog"
"math"
@@ -160,18 +159,49 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap TriState `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
}
type TriState int
const (
TriStateUndefined TriState = -1
TriStateFalse TriState = 0
TriStateTrue TriState = 1
)
func (b *TriState) UnmarshalJSON(data []byte) error {
var v bool
if err := json.Unmarshal(data, &v); err != nil {
return err
}
if v {
*b = TriStateTrue
}
*b = TriStateFalse
return nil
}
func (b *TriState) MarshalJSON() ([]byte, error) {
if *b == TriStateUndefined {
return nil, nil
}
var v bool
if *b == TriStateTrue {
v = true
}
return json.Marshal(v)
}
// EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -223,6 +253,7 @@ type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
Template string `json:"template"`
Verbose bool `json:"verbose"`
Options map[string]interface{} `json:"options"`
@@ -232,13 +263,16 @@ type ShowRequest struct {
// ShowResponse is the response returned from [Client.Show].
type ShowResponse struct {
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
ModelInfo map[string]any `json:"model_info,omitempty"`
ProjectorInfo map[string]any `json:"projector_info,omitempty"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].
@@ -282,19 +316,31 @@ type PushRequest struct {
// ListResponse is the response from [Client.List].
type ListResponse struct {
Models []ModelResponse `json:"models"`
Models []ListModelResponse `json:"models"`
}
// ModelResponse is a single model description in [ListResponse].
type ModelResponse struct {
// ProcessResponse is the response from [Client.Process].
type ProcessResponse struct {
Models []ProcessModelResponse `json:"models"`
}
// ListModelResponse is a single model description in [ListResponse].
type ListModelResponse struct {
Name string `json:"name"`
Model string `json:"model"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
ModifiedAt time.Time `json:"modified_at"`
Size int64 `json:"size"`
Digest string `json:"digest"`
Details ModelDetails `json:"details,omitempty"`
ExpiresAt time.Time `json:"expires_at,omitempty"`
SizeVRAM int64 `json:"size_vram,omitempty"`
}
// ProcessModelResponse is a single model description in [ProcessResponse].
type ProcessModelResponse struct {
Model string `json:"model"`
Size int64 `json:"size"`
Digest string `json:"digest"`
Details ModelDetails `json:"details,omitempty"`
ExpiresAt time.Time `json:"expires_at"`
SizeVRAM int64 `json:"size_vram"`
}
type TokenResponse struct {
@@ -306,7 +352,7 @@ type GenerateResponse struct {
// Model is the model name that generated the response.
Model string `json:"model"`
//CreatedAt is the timestamp of the response.
// CreatedAt is the timestamp of the response.
CreatedAt time.Time `json:"created_at"`
// Response is the textual response itself.
@@ -363,8 +409,6 @@ func (m *Metrics) Summary() {
}
}
var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
func (opts *Options) FromMap(m map[string]interface{}) error {
valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
typeOpts := reflect.TypeOf(opts).Elem() // types of the fields in the options struct
@@ -391,6 +435,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
continue
}
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
if val {
field.SetInt(int64(TriStateTrue))
} else {
field.SetInt(int64(TriStateFalse))
}
continue
}
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
@@ -479,7 +536,7 @@ func DefaultOptions() Options {
LowVRAM: false,
F16KV: true,
UseMLock: false,
UseMMap: true,
UseMMap: TriStateUndefined,
UseNUMA: false,
},
}
@@ -549,6 +606,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
} else {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
boolVal, err := strconv.ParseBool(vals[0])
if err != nil {
return nil, fmt.Errorf("invalid bool value %s", vals)
}
if boolVal {
out[key] = TriStateTrue
} else {
out[key] = TriStateFalse
}
continue
}
switch field.Kind() {
case reflect.Float32:
floatVal, err := strconv.ParseFloat(vals[0], 32)

View File

@@ -2,6 +2,7 @@ package api
import (
"encoding/json"
"fmt"
"math"
"testing"
"time"
@@ -72,13 +73,13 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
},
{
"positive duration",
time.Duration(42 * time.Second),
time.Duration(42 * time.Second),
42 * time.Second,
42 * time.Second,
},
{
"another positive duration",
time.Duration(42 * time.Minute),
time.Duration(42 * time.Minute),
42 * time.Minute,
42 * time.Minute,
},
{
"zero duration",
@@ -105,3 +106,101 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
})
}
}
func TestUseMmapParsingFromJSON(t *testing.T) {
tests := []struct {
name string
req string
exp TriState
}{
{
name: "Undefined",
req: `{ }`,
exp: TriStateUndefined,
},
{
name: "True",
req: `{ "use_mmap": true }`,
exp: TriStateTrue,
},
{
name: "False",
req: `{ "use_mmap": false }`,
exp: TriStateFalse,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var oMap map[string]interface{}
err := json.Unmarshal([]byte(test.req), &oMap)
require.NoError(t, err)
opts := DefaultOptions()
err = opts.FromMap(oMap)
require.NoError(t, err)
assert.Equal(t, test.exp, opts.UseMMap)
})
}
}
func TestUseMmapFormatParams(t *testing.T) {
tests := []struct {
name string
req map[string][]string
exp TriState
err error
}{
{
name: "True",
req: map[string][]string{
"use_mmap": []string{"true"},
},
exp: TriStateTrue,
err: nil,
},
{
name: "False",
req: map[string][]string{
"use_mmap": []string{"false"},
},
exp: TriStateFalse,
err: nil,
},
{
name: "Numeric True",
req: map[string][]string{
"use_mmap": []string{"1"},
},
exp: TriStateTrue,
err: nil,
},
{
name: "Numeric False",
req: map[string][]string{
"use_mmap": []string{"0"},
},
exp: TriStateFalse,
err: nil,
},
{
name: "invalid string",
req: map[string][]string{
"use_mmap": []string{"foo"},
},
exp: TriStateUndefined,
err: fmt.Errorf("invalid bool value [foo]"),
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
resp, err := FormatParams(test.req)
require.Equal(t, err, test.err)
respVal, ok := resp["use_mmap"]
if test.exp != TriStateUndefined {
assert.True(t, ok, "resp: %v", resp)
assert.Equal(t, test.exp, respVal)
}
})
}
}

View File

@@ -5,6 +5,8 @@ import (
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
)
@@ -24,6 +26,7 @@ func InitLogging() {
logFile = os.Stderr
// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
} else {
rotateLogs(AppLogFile)
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -46,3 +49,32 @@ func InitLogging() {
slog.Info("ollama app started")
}
func rotateLogs(logFile string) {
if _, err := os.Stat(logFile); os.IsNotExist(err) {
return
}
index := strings.LastIndex(logFile, ".")
pre := logFile[:index]
post := "." + logFile[index+1:]
for i := LogRotationCount; i > 0; i-- {
older := pre + "-" + strconv.Itoa(i) + post
newer := pre + "-" + strconv.Itoa(i-1) + post
if i == 1 {
newer = pre + post
}
if _, err := os.Stat(newer); err == nil {
if _, err := os.Stat(older); err == nil {
err := os.Remove(older)
if err != nil {
slog.Warn("Failed to remove older log", "older", older, "error", err)
continue
}
}
err := os.Rename(newer, older)
if err != nil {
slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
}
}
}
}

View File

@@ -0,0 +1,44 @@
package lifecycle
import (
"os"
"path/filepath"
"strconv"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestRotateLogs(t *testing.T) {
logDir := t.TempDir()
logFile := filepath.Join(logDir, "testlog.log")
// No log exists
rotateLogs(logFile)
require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
assert.FileExists(t, logFile)
// First rotation
rotateLogs(logFile)
assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
assert.NoFileExists(t, logFile)
// Should be a no-op without a new log
rotateLogs(logFile)
assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
assert.NoFileExists(t, logFile)
for i := 2; i <= LogRotationCount+1; i++ {
require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
assert.FileExists(t, logFile)
rotateLogs(logFile)
assert.NoFileExists(t, logFile)
for j := 1; j < i; j++ {
assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log"))
}
assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log"))
}
}

View File

@@ -16,11 +16,12 @@ var (
AppDir = "/opt/Ollama"
AppDataDir = "/opt/Ollama"
// TODO - should there be a distinct log dir?
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
LogRotationCount = 5
)
func init() {
@@ -69,7 +70,6 @@ func init() {
slog.Error(fmt.Sprintf("create ollama dir %s: %v", AppDataDir, err))
}
}
} else if runtime.GOOS == "darwin" {
// TODO
AppName += ".app"

View File

@@ -15,7 +15,7 @@ import (
)
func getCLIFullPath(command string) string {
cmdPath := ""
var cmdPath string
appExe, err := os.Executable()
if err == nil {
cmdPath = filepath.Join(filepath.Dir(appExe), command)
@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
}
// TODO - rotation
rotateLogs(ServerLogFile)
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)
@@ -65,7 +65,6 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
if err != nil {
if !errors.Is(err, os.ErrNotExist) {
return nil, fmt.Errorf("stat ollama server log dir %s: %v", logDir, err)
}
if err := os.MkdirAll(logDir, 0o755); err != nil {

View File

@@ -24,7 +24,8 @@ func terminate(cmd *exec.Cmd) error {
if err != nil {
return err
}
defer dll.Release() // nolint: errcheck
//nolint:errcheck
defer dll.Release()
pid := cmd.Process.Pid
@@ -73,7 +74,8 @@ func isProcessExited(pid int) (bool, error) {
if err != nil {
return false, fmt.Errorf("failed to open process: %v", err)
}
defer windows.CloseHandle(hProcess) // nolint: errcheck
//nolint:errcheck
defer windows.CloseHandle(hProcess)
var exitCode uint32
err = windows.GetExitCodeProcess(hProcess, &exitCode)

View File

@@ -78,7 +78,7 @@ func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
}
defer resp.Body.Close()
if resp.StatusCode == 204 {
if resp.StatusCode == http.StatusNoContent {
slog.Debug("check update response 204 (current version is up to date)")
return false, updateResp
}
@@ -87,7 +87,7 @@ func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
slog.Warn(fmt.Sprintf("failed to read body response: %s", err))
}
if resp.StatusCode != 200 {
if resp.StatusCode != http.StatusOK {
slog.Info(fmt.Sprintf("check update error %d - %.96s", resp.StatusCode, string(body)))
return false, updateResp
}
@@ -114,7 +114,7 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
if err != nil {
return fmt.Errorf("error checking update: %w", err)
}
if resp.StatusCode != 200 {
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected status attempting to download update %d", resp.StatusCode)
}
resp.Body.Close()

View File

@@ -88,10 +88,15 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\cuda")
Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\oneapi")
Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
#endif

View File

@@ -4,5 +4,5 @@ write-host "Welcome to Ollama!"
write-host ""
write-host "Run your first model:"
write-host ""
write-host "`tollama run llama2"
write-host "`tollama run llama3"
write-host ""

View File

@@ -29,7 +29,6 @@ func GetID() string {
initStore()
}
return store.ID
}
func GetFirstTimeRun() bool {

View File

@@ -47,7 +47,6 @@ func nativeLoop() {
default:
pTranslateMessage.Call(uintptr(unsafe.Pointer(m))) //nolint:errcheck
pDispatchMessage.Call(uintptr(unsafe.Pointer(m))) //nolint:errcheck
}
}
}
@@ -160,8 +159,8 @@ func (t *winTray) wndProc(hWnd windows.Handle, message uint32, wParam, lParam ui
lResult, _, _ = pDefWindowProc.Call(
uintptr(hWnd),
uintptr(message),
uintptr(wParam),
uintptr(lParam),
wParam,
lParam,
)
}
return

View File

@@ -186,7 +186,7 @@ func (t *winTray) initInstance() error {
t.muNID.Lock()
defer t.muNID.Unlock()
t.nid = &notifyIconData{
Wnd: windows.Handle(t.window),
Wnd: t.window,
ID: 100,
Flags: NIF_MESSAGE,
CallbackMessage: t.wmSystrayMessage,
@@ -197,7 +197,6 @@ func (t *winTray) initInstance() error {
}
func (t *winTray) createMenu() error {
menuHandle, _, err := pCreatePopupMenu.Call()
if menuHandle == 0 {
return err
@@ -246,7 +245,7 @@ func (t *winTray) addOrUpdateMenuItem(menuItemId uint32, parentId uint32, title
mi := menuItemInfo{
Mask: MIIM_FTYPE | MIIM_STRING | MIIM_ID | MIIM_STATE,
Type: MFT_STRING,
ID: uint32(menuItemId),
ID: menuItemId,
TypeData: titlePtr,
Cch: uint32(len(title)),
}
@@ -302,11 +301,10 @@ func (t *winTray) addOrUpdateMenuItem(menuItemId uint32, parentId uint32, title
}
func (t *winTray) addSeparatorMenuItem(menuItemId, parentId uint32) error {
mi := menuItemInfo{
Mask: MIIM_FTYPE | MIIM_ID | MIIM_STATE,
Type: MFT_SEPARATOR,
ID: uint32(menuItemId),
ID: menuItemId,
}
mi.Size = uint32(unsafe.Sizeof(mi))
@@ -426,7 +424,6 @@ func iconBytesToFilePath(iconBytes []byte) (string, error) {
// Loads an image from file and shows it in tray.
// Shell_NotifyIcon: https://msdn.microsoft.com/en-us/library/windows/desktop/bb762159(v=vs.85).aspx
func (t *winTray) setIcon(src string) error {
h, err := t.loadIconFrom(src)
if err != nil {
return err
@@ -444,7 +441,6 @@ func (t *winTray) setIcon(src string) error {
// Loads an image from file to be shown in tray or menu item.
// LoadImage: https://msdn.microsoft.com/en-us/library/windows/desktop/ms648045(v=vs.85).aspx
func (t *winTray) loadIconFrom(src string) (windows.Handle, error) {
// Save and reuse handles of loaded images
t.muLoadedImages.RLock()
h, ok := t.loadedImages[src]

View File

@@ -20,6 +20,7 @@ import (
"path/filepath"
"regexp"
"runtime"
"slices"
"strings"
"syscall"
"time"
@@ -29,7 +30,6 @@ import (
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
"golang.org/x/crypto/ssh"
"golang.org/x/exp/slices"
"golang.org/x/term"
"github.com/ollama/ollama/api"
@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
}
defer tempfile.Close()
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
detectContentType := func(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
files = append(files, tks...)
}
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
for _, file := range files {
f, err := os.Open(file)
if err != nil {
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
}
func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
name := args[0]
// check if the model exists on the server
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
}
format, err := cmd.Flags().GetString("format")
@@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
// Fill out the rest of the options based on information about the
// model.
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
return generateInteractive(cmd, opts)
name := args[0]
info, err := func() (*api.ShowResponse, error) {
showReq := &api.ShowRequest{Name: name}
info, err := client.Show(cmd.Context(), showReq)
var se api.StatusError
if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
if err := PullHandler(cmd, []string{name}); err != nil {
return nil, err
}
return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
}
return info, err
}()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
opts.ParentModel = info.Details.ParentModel
opts.Messages = append(opts.Messages, info.Messages...)
if interactive {
return generateInteractive(cmd, opts)
}
return generate(cmd, opts)
}
func errFromUnknownKey(unknownKeyErr error) error {
@@ -525,7 +526,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
var data [][]string
for _, m := range models.Models {
if len(args) == 0 || strings.HasPrefix(m.Name, args[0]) {
if len(args) == 0 || strings.HasPrefix(m.Model, args[0]) {
var procStr string
switch {
case m.SizeVRAM == 0:
@@ -539,7 +540,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
}
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
data = append(data, []string{m.Model, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
}
}
@@ -579,10 +580,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
if len(args) != 1 {
return errors.New("missing model name")
}
license, errLicense := cmd.Flags().GetBool("license")
modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -625,8 +622,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
if flagsSet > 1 {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
} else if flagsSet == 0 {
return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
}
if flagsSet == 1 {
req := api.ShowRequest{Name: args[0]}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return err
}
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
}
return nil
}
req := api.ShowRequest{Name: args[0]}
@@ -635,22 +653,120 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
arch := resp.ModelInfo["general.architecture"].(string)
modelData := [][]string{
{"arch", arch},
{"parameters", resp.Details.ParameterSize},
{"quantization", resp.Details.QuantizationLevel},
{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
}
mainTableData := [][]string{
{"Model"},
{renderSubTable(modelData, false)},
}
if resp.ProjectorInfo != nil {
projectorData := [][]string{
{"arch", "clip"},
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
}
if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
}
projectorData = append(projectorData,
[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
)
mainTableData = append(mainTableData,
[]string{"Projector"},
[]string{renderSubTable(projectorData, false)},
)
}
if resp.Parameters != "" {
mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
}
if resp.System != "" {
mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
}
if resp.License != "" {
mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
}
table := tablewriter.NewWriter(os.Stdout)
table.SetAutoWrapText(false)
table.SetBorder(false)
table.SetAlignment(tablewriter.ALIGN_LEFT)
for _, v := range mainTableData {
table.Append(v)
}
table.Render()
return nil
}
func renderSubTable(data [][]string, file bool) string {
var buf bytes.Buffer
table := tablewriter.NewWriter(&buf)
table.SetAutoWrapText(!file)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
table.SetTablePadding("\t")
table.SetAlignment(tablewriter.ALIGN_LEFT)
for _, v := range data {
table.Append(v)
}
table.Render()
renderedTable := buf.String()
lines := strings.Split(renderedTable, "\n")
for i, line := range lines {
lines[i] = "\t" + line
}
return strings.Join(lines, "\n")
}
func twoLines(s string) [][]string {
lines := strings.Split(s, "\n")
res := [][]string{}
count := 0
for _, line := range lines {
line = strings.TrimSpace(line)
if line != "" {
count++
res = append(res, []string{line})
if count == 2 {
return res
}
}
}
return res
}
func formatParams(s string) string {
lines := strings.Split(s, "\n")
table := [][]string{}
for _, line := range lines {
table = append(table, strings.Fields(line))
}
return renderSubTable(table, false)
}
func CopyHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -746,7 +862,6 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
if wordWrap && termWidth >= 10 {
for _, ch := range content {
if state.lineLength+1 > termWidth-5 {
if runewidth.StringWidth(state.wordBuffer) > termWidth-10 {
fmt.Printf("%s%c", state.wordBuffer, ch)
state.wordBuffer = ""
@@ -755,7 +870,11 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
}
// backtrack the length of the last word and clear to the end of the line
fmt.Printf("\x1b[%dD\x1b[K\n", runewidth.StringWidth(state.wordBuffer))
a := runewidth.StringWidth(state.wordBuffer)
if a > 0 {
fmt.Printf("\x1b[%dD", a)
}
fmt.Printf("\x1b[K\n")
fmt.Printf("%s%c", state.wordBuffer, ch)
chWidth := runewidth.RuneWidth(ch)
@@ -957,17 +1076,11 @@ func generate(cmd *cobra.Command, opts runOptions) error {
}
func RunServer(cmd *cobra.Command, _ []string) error {
// retrieve the OLLAMA_HOST environment variable
ollamaHost, err := api.GetOllamaHost()
if err != nil {
return err
}
if err := initializeKeypair(); err != nil {
return err
}
ln, err := net.Listen("tcp", net.JoinHostPort(ollamaHost.Host, ollamaHost.Port))
ln, err := net.Listen("tcp", net.JoinHostPort(envconfig.Host.Host, envconfig.Host.Port))
if err != nil {
return err
}
@@ -1026,24 +1139,6 @@ func initializeKeypair() error {
return nil
}
//nolint:unused
func waitForServer(ctx context.Context, client *api.Client) error {
// wait for the server to start
timeout := time.After(5 * time.Second)
tick := time.Tick(500 * time.Millisecond)
for {
select {
case <-timeout:
return errors.New("timed out waiting for server to start")
case <-tick:
if err := client.Heartbeat(ctx); err == nil {
return nil // server has started
}
}
}
}
func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -1251,6 +1346,9 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_NOPRUNE"],
envVars["OLLAMA_ORIGINS"],
envVars["OLLAMA_TMPDIR"],
envVars["OLLAMA_FLASH_ATTENTION"],
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_MAX_VRAM"],
})
default:
appendEnvDocs(cmd, envs)

View File

@@ -8,11 +8,11 @@ import (
"os"
"path/filepath"
"regexp"
"slices"
"sort"
"strings"
"github.com/spf13/cobra"
"golang.org/x/exp/slices"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
@@ -31,65 +31,40 @@ const (
)
func loadModel(cmd *cobra.Command, opts *runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
showReq := api.ShowRequest{Name: opts.Model}
showResp, err := client.Show(cmd.Context(), &showReq)
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
}
chatReq := &api.ChatRequest{
Model: opts.Model,
Messages: []api.Message{},
Model: opts.Model,
KeepAlive: opts.KeepAlive,
}
if opts.KeepAlive != nil {
chatReq.KeepAlive = opts.KeepAlive
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
if len(opts.Messages) > 0 {
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
return nil
})
if err != nil {
return err
}
return nil
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = make([]api.Message, 0)
err := loadModel(cmd, &opts)
if err != nil {
return err

View File

@@ -6,6 +6,7 @@ import (
"text/template"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
)
@@ -85,11 +86,11 @@ MESSAGE assistant """Yes it is true, I am half horse, half shark."""
`
tmpl, err := template.New("").Parse(expectedModelfile)
assert.Nil(t, err)
require.NoError(t, err)
var buf bytes.Buffer
err = tmpl.Execute(&buf, opts)
assert.Nil(t, err)
require.NoError(t, err)
assert.Equal(t, buf.String(), mf)
opts.ParentModel = "horseshark"
@@ -107,10 +108,10 @@ MESSAGE assistant """Yes it is true, I am half horse, half shark."""
`
tmpl, err = template.New("").Parse(expectedModelfile)
assert.Nil(t, err)
require.NoError(t, err)
var parentBuf bytes.Buffer
err = tmpl.Execute(&parentBuf, opts)
assert.Nil(t, err)
require.NoError(t, err)
assert.Equal(t, parentBuf.String(), mf)
}

27
cmd/start.go Normal file
View File

@@ -0,0 +1,27 @@
//go:build darwin || windows
package cmd
import (
"context"
"errors"
"time"
"github.com/ollama/ollama/api"
)
func waitForServer(ctx context.Context, client *api.Client) error {
// wait for the server to start
timeout := time.After(5 * time.Second)
tick := time.Tick(500 * time.Millisecond)
for {
select {
case <-timeout:
return errors.New("timed out waiting for server to start")
case <-tick:
if err := client.Heartbeat(ctx); err == nil {
return nil // server has started
}
}
}
}

View File

@@ -189,7 +189,7 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
if params.VocabSize > len(v.Tokens) {
missingTokens := params.VocabSize - len(v.Tokens)
slog.Warn(fmt.Sprintf("vocab is missing %d tokens", missingTokens))
for cnt := 0; cnt < missingTokens; cnt++ {
for cnt := range missingTokens {
v.Tokens = append(v.Tokens, fmt.Sprintf("<dummy%05d>", cnt+1))
v.Scores = append(v.Scores, -1)
v.Types = append(v.Types, tokenTypeUserDefined)

View File

@@ -35,7 +35,6 @@ func addOnes(data []float32, vectorSize int) ([]float32, error) {
f32s = append(f32s, t...)
}
return f32s, nil
}

View File

@@ -119,11 +119,12 @@ func llamaRepack(name string, params *Params, data []float32, shape []uint64) ([
}
var heads int
if strings.HasSuffix(name, "attn_q.weight") {
switch {
case strings.HasSuffix(name, "attn_q.weight"):
heads = params.AttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight") {
case strings.HasSuffix(name, "attn_k.weight"):
heads = cmp.Or(params.KeyValHeads, params.AttentionHeads)
} else {
default:
return nil, fmt.Errorf("unknown tensor name: %s", name)
}

View File

@@ -120,7 +120,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
Name: name,
Kind: kind,
Offset: offset,
Shape: shape[:],
Shape: shape,
}
t.WriterTo = safetensorWriterTo{

View File

@@ -85,11 +85,8 @@ func parseTokens(dirpath string) (pre string, tokens []Token, merges []string, e
sha256sum := sha256.New()
for _, pt := range t.PreTokenizer.PreTokenizers {
switch pt.Type {
case "Split":
if pt.Pattern.Regex != "" {
sha256sum.Write([]byte(pt.Pattern.Regex))
}
if pt.Type == "Split" && pt.Pattern.Regex != "" {
sha256sum.Write([]byte(pt.Pattern.Regex))
}
}

View File

@@ -88,7 +88,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,
Name: ggufName,
Kind: kind,
Offset: offset, // calculate the offset
Shape: shape[:],
Shape: shape,
}
tensor.WriterTo = torchWriterTo{
@@ -104,7 +104,6 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,
}
return tensors, nil
}
func getAltParams(dirpath string) (*Params, error) {

View File

@@ -12,6 +12,7 @@
- [Pull a Model](#pull-a-model)
- [Push a Model](#push-a-model)
- [Generate Embeddings](#generate-embeddings)
- [List Running Models](#list-running-models)
## Conventions
@@ -249,7 +250,7 @@ curl http://localhost:11434/api/generate -d '{
#### Request (Reproducible outputs)
For reproducible outputs, set `temperature` to 0 and `seed` to a number:
For reproducible outputs, set `seed` to a number:
##### Request
@@ -258,8 +259,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "mistral",
"prompt": "Why is the sky blue?",
"options": {
"seed": 123,
"temperature": 0
"seed": 123
}
}'
```
@@ -777,11 +777,12 @@ A single JSON object will be returned.
POST /api/show
```
Show information about a model including details, modelfile, template, parameters, license, and system prompt.
Show information about a model including details, modelfile, template, parameters, license, system prompt.
### Parameters
- `name`: name of the model to show
- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
### Examples
@@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{
```json
{
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
"parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
"template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
"parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
"template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
"details": {
"parent_model": "",
"format": "gguf",
"family": "llama",
"families": ["llama", "clip"],
"parameter_size": "7B",
"families": [
"llama"
],
"parameter_size": "8.0B",
"quantization_level": "Q4_0"
},
"model_info": {
"general.architecture": "llama",
"general.file_type": 2,
"general.parameter_count": 8030261248,
"general.quantization_version": 2,
"llama.attention.head_count": 32,
"llama.attention.head_count_kv": 8,
"llama.attention.layer_norm_rms_epsilon": 0.00001,
"llama.block_count": 32,
"llama.context_length": 8192,
"llama.embedding_length": 4096,
"llama.feed_forward_length": 14336,
"llama.rope.dimension_count": 128,
"llama.rope.freq_base": 500000,
"llama.vocab_size": 128256,
"tokenizer.ggml.bos_token_id": 128000,
"tokenizer.ggml.eos_token_id": 128009,
"tokenizer.ggml.merges": [], // populates if `verbose=true`
"tokenizer.ggml.model": "gpt2",
"tokenizer.ggml.pre": "llama-bpe",
"tokenizer.ggml.token_type": [], // populates if `verbose=true`
"tokenizer.ggml.tokens": [] // populates if `verbose=true`
}
}
```
@@ -1035,3 +1062,46 @@ curl http://localhost:11434/api/embeddings -d '{
]
}
```
## List Running Models
```shell
GET /api/ps
```
List models that are currently loaded into memory.
#### Examples
### Request
```shell
curl http://localhost:11434/api/ps
```
#### Response
A single JSON object will be returned.
```json
{
"models": [
{
"model": "mistral:latest",
"size": 5137025024,
"digest": "2ae6f6dd7a3dd734790bbbf58b8909a606e0e7e97e94b7604e0aa7ae4490e6d8",
"details": {
"parent_model": "",
"format": "gguf",
"family": "llama",
"families": [
"llama"
],
"parameter_size": "7.2B",
"quantization_level": "Q4_0"
},
"expires_at": "2024-06-04T14:38:31.83753-07:00",
"size_vram": 5137025024
}
]
}
```

View File

@@ -114,15 +114,18 @@ If you have Docker available, you can build linux binaries with `./scripts/build
### Windows
Note: The windows build for Ollama is still under development.
Note: The Windows build for Ollama is still under development.
Install required tools:
First, install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- Go version 1.22 or higher
- MinGW (pick one variant) with GCC.
- [MinGW-w64](https://www.mingw-w64.org/)
- [MSYS2](https://www.msys2.org/)
- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
Then, build the `ollama` binary:
```powershell
$env:CGO_ENABLED="1"

View File

@@ -8,7 +8,7 @@ Check your compute compatibility to see if your card is supported:
| Compute Capability | Family | Cards |
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
| 9.0 | NVIDIA | `H100` |
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080` `RTX 4070 Ti` `RTX 4060 Ti` |
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |

View File

@@ -1,170 +1,88 @@
# Import a model
# Import
This guide walks through importing a GGUF, PyTorch or Safetensors model.
GGUF models and select Safetensors models can be imported directly into Ollama.
## Importing (GGUF)
## Import GGUF
### Step 1: Write a `Modelfile`
A binary GGUF file can be imported directly into Ollama through a Modelfile.
Start by creating a `Modelfile`. This file is the blueprint for your model, specifying weights, parameters, prompt templates and more.
```
FROM ./mistral-7b-v0.1.Q4_0.gguf
```dockerfile
FROM /path/to/file.gguf
```
(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
## Import Safetensors
```
FROM ./mistral-7b-v0.1.Q4_0.gguf
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile:
- LlamaForCausalLM
- MistralForCausalLM
- GemmaForCausalLM
```dockerfile
FROM /path/to/safetensors/directory
```
### Step 2: Create the Ollama model
For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf).
Finally, create a model from your `Modelfile`:
## Automatic Quantization
> [!NOTE]
> Automatic quantization requires v0.1.35 or higher.
Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`.
```dockerfile
FROM /path/to/my/gemma/f16/model
```
ollama create example -f Modelfile
```
### Step 3: Run your model
Next, test the model with `ollama run`:
```
ollama run example "What is your favourite condiment?"
```
## Importing (PyTorch & Safetensors)
> Importing from PyTorch and Safetensors is a longer process than importing from GGUF. Improvements that make it easier are a work in progress.
### Setup
First, clone the `ollama/ollama` repo:
```
git clone git@github.com:ollama/ollama.git ollama
cd ollama
```
and then fetch its `llama.cpp` submodule:
```shell
git submodule init
git submodule update llm/llama.cpp
$ ollama create -q Q4_K_M mymodel
transferring model data
quantizing F16 model to Q4_K_M
creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd
creating new layer sha256:0853f0ad24e5865173bbf9ffcc7b0f5d56b66fd690ab1009867e45e7d2c4db0f
writing manifest
success
```
Next, install the Python dependencies:
### Supported Quantizations
```
python3 -m venv llm/llama.cpp/.venv
source llm/llama.cpp/.venv/bin/activate
pip install -r llm/llama.cpp/requirements.txt
- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
#### K-means Quantizations
- `Q3_K_S`
- `Q3_K_M`
- `Q3_K_L`
- `Q4_K_S`
- `Q4_K_M`
- `Q5_K_S`
- `Q5_K_M`
- `Q6_K`
## Template Detection
> [!NOTE]
> Template detection requires v0.1.42 or higher.
Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing.
```dockerfile
FROM /path/to/my/gemma/model
```
Then build the `quantize` tool:
```
make -C llm/llama.cpp quantize
```shell
$ ollama create mymodel
transferring model data
using autodetected template gemma-instruct
creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb
writing manifest
success
```
### Clone the HuggingFace repository (optional)
If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model.
Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage), verify it's installed, and then clone the model's repository:
```
git lfs install
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 model
```
### Convert the model
> Note: some model architectures require using specific convert scripts. For example, Qwen models require running `convert-hf-to-gguf.py` instead of `convert.py`
```
python llm/llama.cpp/convert.py ./model --outtype f16 --outfile converted.bin
```
### Quantize the model
```
llm/llama.cpp/quantize converted.bin quantized.bin q4_0
```
### Step 3: Write a `Modelfile`
Next, create a `Modelfile` for your model:
```
FROM quantized.bin
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
```
### Step 4: Create the Ollama model
Finally, create a model from your `Modelfile`:
```
ollama create example -f Modelfile
```
### Step 5: Run your model
Next, test the model with `ollama run`:
```
ollama run example "What is your favourite condiment?"
```
## Publishing your model (optional early alpha)
Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps:
1. Create [an account](https://ollama.com/signup)
2. Copy your Ollama public key:
- macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy`
- Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
- Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)
Next, copy your model to your username's namespace:
```
ollama cp example <your username>/example
```
> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`.
Then push the model:
```
ollama push <your username>/example
```
After publishing, your model will be available at `https://ollama.com/<your username>/example`.
## Quantization reference
The quantization options are as follow (from highest highest to lowest levels of quantization). Note: some architectures such as Falcon do not support K quants.
- `q2_K`
- `q3_K`
- `q3_K_S`
- `q3_K_M`
- `q3_K_L`
- `q4_0` (recommended)
- `q4_1`
- `q4_K`
- `q4_K_S`
- `q4_K_M`
- `q5_0`
- `q5_1`
- `q5_K`
- `q5_K_S`
- `q5_K_M`
- `q6_K`
- `q8_0`
- `f16`
Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one.

View File

@@ -100,6 +100,16 @@ sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
```
## Installing specific versions
Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases).
For example:
```
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh
```
## Viewing logs
To view logs of Ollama running as a startup service, run:

View File

@@ -104,8 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
#### Notes
- Setting `seed` will always set `temperature` to `0`
- `finish_reason` will always be `stop`
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
## Models

View File

@@ -22,7 +22,7 @@ docker logs <container-name>
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` to view logs
- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
@@ -76,6 +76,7 @@ Make sure you've set up the container runtime first as described in [docker.md](
Sometimes the container runtime can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem
- Is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU.
- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
- Try rebooting

View File

@@ -45,7 +45,7 @@ all_splits = text_splitter.split_documents(data)
```
It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install chromadb`
We also need to pull embedding model: `ollama pull nomic-embed-text`
```python
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
@@ -68,7 +68,8 @@ The next thing is to send the question and the relevant parts of the docs to the
```python
from langchain.chains import RetrievalQA
qachain=RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever())
qachain.invoke({"query": question})
res = qachain.invoke({"query": question})
print(res['result'])
```
The answer received from this chain was:

View File

@@ -39,8 +39,8 @@ server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *app.log* contains most resent logs from the GUI application
- *server.log* contains the most recent server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration

View File

@@ -1,8 +1,10 @@
package envconfig
import (
"errors"
"fmt"
"log/slog"
"net"
"os"
"path/filepath"
"runtime"
@@ -10,6 +12,18 @@ import (
"strings"
)
type OllamaHost struct {
Scheme string
Host string
Port string
}
func (o OllamaHost) String() string {
return fmt.Sprintf("%s://%s:%s", o.Scheme, o.Host, o.Port)
}
var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
var (
// Set via OLLAMA_ORIGINS in the environment
AllowOrigins []string
@@ -17,6 +31,8 @@ var (
Debug bool
// Experimental flash attention
FlashAttention bool
// Set via OLLAMA_HOST in the environment
Host *OllamaHost
// Set via OLLAMA_KEEP_ALIVE in the environment
KeepAlive string
// Set via OLLAMA_LLM_LIBRARY in the environment
@@ -25,6 +41,8 @@ var (
MaxRunners int
// Set via OLLAMA_MAX_QUEUE in the environment
MaxQueuedRequests int
// Set via OLLAMA_MODELS in the environment
ModelsDir string
// Set via OLLAMA_MAX_VRAM in the environment
MaxVRAM uint64
// Set via OLLAMA_NOHISTORY in the environment
@@ -35,8 +53,23 @@ var (
NumParallel int
// Set via OLLAMA_RUNNERS_DIR in the environment
RunnersDir string
// Set via OLLAMA_SCHED_SPREAD in the environment
SchedSpread bool
// Set via OLLAMA_TMPDIR in the environment
TmpDir string
// Set via OLLAMA_INTEL_GPU in the environment
IntelGpu bool
// Set via CUDA_VISIBLE_DEVICES in the environment
CudaVisibleDevices string
// Set via HIP_VISIBLE_DEVICES in the environment
HipVisibleDevices string
// Set via ROCR_VISIBLE_DEVICES in the environment
RocrVisibleDevices string
// Set via GPU_DEVICE_ORDINAL in the environment
GpuDeviceOrdinal string
// Set via HSA_OVERRIDE_GFX_VERSION in the environment
HsaOverrideGfxVersion string
)
type EnvVar struct {
@@ -46,23 +79,33 @@ type EnvVar struct {
}
func AsMap() map[string]EnvVar {
return map[string]EnvVar{
ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
"OLLAMA_HOST": {"OLLAMA_HOST", "", "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_ORIGINS", LLMLibrary, ""},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models (default 1)"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, ""},
"OLLAMA_MODELS": {"OLLAMA_MODELS", "", "The path to the models directory"},
"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, ""},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
}
if runtime.GOOS != "darwin" {
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
}
return ret
}
func Values() map[string]string {
@@ -126,7 +169,7 @@ func LoadConfig() {
var paths []string
for _, root := range []string{filepath.Dir(appExe), cwd} {
paths = append(paths,
filepath.Join(root),
root,
filepath.Join(root, "windows-"+runtime.GOARCH),
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
)
@@ -173,6 +216,15 @@ func LoadConfig() {
NoHistory = true
}
if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
s, err := strconv.ParseBool(spread)
if err == nil {
SchedSpread = s
} else {
SchedSpread = true
}
}
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
NoPrune = true
}
@@ -184,11 +236,17 @@ func LoadConfig() {
AllowOrigins = append(AllowOrigins,
fmt.Sprintf("http://%s", allowOrigin),
fmt.Sprintf("https://%s", allowOrigin),
fmt.Sprintf("http://%s:*", allowOrigin),
fmt.Sprintf("https://%s:*", allowOrigin),
fmt.Sprintf("http://%s", net.JoinHostPort(allowOrigin, "*")),
fmt.Sprintf("https://%s", net.JoinHostPort(allowOrigin, "*")),
)
}
AllowOrigins = append(AllowOrigins,
"app://*",
"file://*",
"tauri://*",
)
maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
if maxRunners != "" {
m, err := strconv.Atoi(maxRunners)
@@ -209,4 +267,80 @@ func LoadConfig() {
}
KeepAlive = clean("OLLAMA_KEEP_ALIVE")
var err error
ModelsDir, err = getModelsDir()
if err != nil {
slog.Error("invalid setting", "OLLAMA_MODELS", ModelsDir, "error", err)
}
Host, err = getOllamaHost()
if err != nil {
slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
}
if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
IntelGpu = set
}
CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
}
func getModelsDir() (string, error) {
if models, exists := os.LookupEnv("OLLAMA_MODELS"); exists {
return models, nil
}
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
return filepath.Join(home, ".ollama", "models"), nil
}
func getOllamaHost() (*OllamaHost, error) {
defaultPort := "11434"
hostVar := os.Getenv("OLLAMA_HOST")
hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
scheme, hostport, ok := strings.Cut(hostVar, "://")
switch {
case !ok:
scheme, hostport = "http", hostVar
case scheme == "http":
defaultPort = "80"
case scheme == "https":
defaultPort = "443"
}
// trim trailing slashes
hostport = strings.TrimRight(hostport, "/")
host, port, err := net.SplitHostPort(hostport)
if err != nil {
host, port = "127.0.0.1", defaultPort
if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil {
host = ip.String()
} else if hostport != "" {
host = hostport
}
}
if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
return &OllamaHost{
Scheme: scheme,
Host: host,
Port: defaultPort,
}, ErrInvalidHostPort
}
return &OllamaHost{
Scheme: scheme,
Host: host,
Port: port,
}, nil
}

View File

@@ -1,8 +1,11 @@
package envconfig
import (
"fmt"
"net"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -21,3 +24,48 @@ func TestConfig(t *testing.T) {
LoadConfig()
require.True(t, FlashAttention)
}
func TestClientFromEnvironment(t *testing.T) {
type testCase struct {
value string
expect string
err error
}
hostTestCases := map[string]*testCase{
"empty": {value: "", expect: "127.0.0.1:11434"},
"only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"},
"only port": {value: ":1234", expect: ":1234"},
"address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
"hostname": {value: "example.com", expect: "example.com:11434"},
"hostname and port": {value: "example.com:1234", expect: "example.com:1234"},
"zero port": {value: ":0", expect: ":0"},
"too large port": {value: ":66000", err: ErrInvalidHostPort},
"too small port": {value: ":-1", err: ErrInvalidHostPort},
"ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"},
"ipv6 world open": {value: "[::]", expect: "[::]:11434"},
"ipv6 no brackets": {value: "::1", expect: "[::1]:11434"},
"ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"},
"extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
"extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
"extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
}
for k, v := range hostTestCases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", v.value)
LoadConfig()
oh, err := getOllamaHost()
if err != v.err {
t.Fatalf("expected %s, got %s", v.err, err)
}
if err == nil {
host := net.JoinHostPort(oh.Host, oh.Port)
assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
}
})
}
}

View File

@@ -77,13 +77,21 @@ LOADER_MAPPING = {
def load_single_document(file_path: str) -> List[Document]:
ext = "." + file_path.rsplit(".", 1)[-1]
if ext in LOADER_MAPPING:
loader_class, loader_args = LOADER_MAPPING[ext]
loader = loader_class(file_path, **loader_args)
return loader.load()
if os.path.getsize(file_path) != 0:
filename, ext = os.path.splitext(file_path)
if ext in LOADER_MAPPING:
loader_class, loader_args = LOADER_MAPPING[ext]
try:
loader = loader_class(file_path, **loader_args)
if loader:
return loader.load()
except:
print(f"Corrupted file {file_path}. Ignoring it.")
else:
print(f"Unsupported file {file_path}. Ignoring it.")
else:
print(f"Empty file {file_path}. Ignoring it.")
raise ValueError(f"Unsupported file extension '{ext}'")
def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
"""
@@ -100,7 +108,8 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum
results = []
with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
results.extend(docs)
if docs:
results.extend(docs)
pbar.update()
return results

View File

@@ -11,4 +11,5 @@ tabulate==0.9.0
pandoc==2.3
pypandoc==1.11
tqdm==4.66.1
sentence_transformers==2.2.2
sentence_transformers==2.2.2
numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability

View File

@@ -5,7 +5,6 @@ import (
)
func TestHumanNumber(t *testing.T) {
type testCase struct {
input uint64
expected string

1
go.mod
View File

@@ -16,6 +16,7 @@ require (
)
require (
github.com/agnivade/levenshtein v1.1.1
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/mattn/go-runewidth v0.0.14
github.com/nlpodyssey/gopickle v0.3.0

6
go.sum
View File

@@ -4,10 +4,14 @@ dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7
gioui.org v0.0.0-20210308172011-57750fc8a0a6/go.mod h1:RSH6KIUZ0p2xy5zHDxgAM4zumjgTw83q2ge/PI+yyw8=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/agnivade/levenshtein v1.1.1 h1:QY8M92nrzkmr798gCo3kmMyqXFzdQVpxLlGPRBij0P8=
github.com/agnivade/levenshtein v1.1.1/go.mod h1:veldBMzWxcCG2ZvUTKD2kJNRdCk5hVbJomOvKkmgYbo=
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6ICHXqG5hm0ZW5IHyeEJXoIJSOZeBLmWPNeIQ=
github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40/go.mod h1:Q7yQnSMnLvcXlZ8RV+jwz/6y1rQTqbX6C82SndT52Zs=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
@@ -36,6 +40,8 @@ github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLc
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=

View File

@@ -13,6 +13,7 @@ import (
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -25,7 +26,16 @@ const (
// Prefix with the node dir
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
// Direct Rendering Manager sysfs location
DRMDeviceDirGlob = "/sys/class/drm/card*/device"
DRMTotalMemoryFile = "mem_info_vram_total"
DRMUsedMemoryFile = "mem_info_vram_used"
// In hex; properties file is in decimal
DRMUniqueIDFile = "unique_id"
DRMVendorFile = "vendor"
DRMDeviceFile = "device"
)
var (
@@ -35,8 +45,8 @@ var (
)
// Gather GPU information from the amdgpu driver if any supported GPUs are detected
func AMDGetGPUInfo() []GpuInfo {
resp := []GpuInfo{}
func AMDGetGPUInfo() []RocmGPUInfo {
resp := []RocmGPUInfo{}
if !AMDDetected() {
return resp
}
@@ -50,9 +60,9 @@ func AMDGetGPUInfo() []GpuInfo {
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
var visibleDevices []string
hipVD := os.Getenv("HIP_VISIBLE_DEVICES") // zero based index only
rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
gpuDO := os.Getenv("GPU_DEVICE_ORDINAL") // zero based index
hipVD := envconfig.HipVisibleDevices // zero based index only
rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
gpuDO := envconfig.GpuDeviceOrdinal // zero based index
switch {
// TODO is this priorty order right?
case hipVD != "":
@@ -65,7 +75,7 @@ func AMDGetGPUInfo() []GpuInfo {
visibleDevices = strings.Split(gpuDO, ",")
}
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
gfxOverride := envconfig.HsaOverrideGfxVersion
var supported []string
libDir := ""
@@ -90,7 +100,7 @@ func AMDGetGPUInfo() []GpuInfo {
scanner := bufio.NewScanner(fp)
isCPU := false
var major, minor, patch uint64
var vendor, device uint64
var vendor, device, uniqueID uint64
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -121,30 +131,43 @@ func AMDGetGPUInfo() []GpuInfo {
} else if strings.HasPrefix(line, "vendor_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed vendor_id", "vendor_id", line)
slog.Debug("malformed", "vendor_id", line)
continue
}
vendor, err = strconv.ParseUint(ver[1], 10, 32)
vendor, err = strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Debug("malformed vendor_id" + line)
slog.Debug("malformed", "vendor_id", line, "error", err)
}
} else if strings.HasPrefix(line, "device_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed device_id", "device_id", line)
slog.Debug("malformed", "device_id", line)
continue
}
device, err = strconv.ParseUint(ver[1], 10, 32)
device, err = strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Debug("malformed device_id" + line)
slog.Debug("malformed", "device_id", line, "error", err)
}
} else if strings.HasPrefix(line, "unique_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed", "unique_id", line)
continue
}
uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Debug("malformed", "unique_id", line, "error", err)
}
}
// TODO - any other properties we want to extract and record?
// vendor_id + device_id -> pci lookup for "Name"
// Other metrics that may help us understand relative performance between multiple GPUs
}
// Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
// into consideration, so we instead map the device over to the DRM driver sysfs nodes which
// do reliably report VRAM usage.
if isCPU {
cpuCount++
continue
@@ -156,7 +179,7 @@ func AMDGetGPUInfo() []GpuInfo {
// Shouldn't happen, but just in case...
if gpuID < 0 {
slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
return []GpuInfo{}
return nil
}
if int(major) < RocmComputeMin {
@@ -167,65 +190,68 @@ func AMDGetGPUInfo() []GpuInfo {
// Look up the memory for the current node
totalMemory := uint64(0)
usedMemory := uint64(0)
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUTotalMemoryFileGlob)
propFiles, err := filepath.Glob(propGlob)
if err != nil {
slog.Warn("error looking up total GPU memory", "glob", propGlob, "error", err)
var usedFile string
mapping := []struct {
id uint64
filename string
}{
{vendor, DRMVendorFile},
{device, DRMDeviceFile},
{uniqueID, DRMUniqueIDFile}, // Not all devices will report this
}
// 1 or more memory banks - sum the values of all of them
for _, propFile := range propFiles {
fp, err := os.Open(propFile)
if err != nil {
slog.Warn("failed to open sysfs node", "file", propFile, "erroir", err)
continue
}
defer fp.Close()
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "size_in_bytes") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Warn("malformed " + line)
continue
}
bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Warn("malformed int " + line)
continue
}
totalMemory += bankSizeInBytes
slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
// Map over to DRM location to find the total/free memory
drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
for _, devDir := range drmMatches {
matched := true
for _, m := range mapping {
if m.id == 0 {
// Null ID means it didn't populate, so we can't use it to match
continue
}
filename := filepath.Join(devDir, m.filename)
buf, err := os.ReadFile(filename)
if err != nil {
slog.Debug("failed to read sysfs node", "file", filename, "error", err)
matched = false
break
}
// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
if err != nil {
slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
matched = false
break
}
if cmp != m.id {
matched = false
break
}
}
}
if totalMemory == 0 {
slog.Warn("amdgpu reports zero total memory", "gpu", gpuID)
continue
}
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUUsedMemoryFileGlob)
usedFiles, err := filepath.Glob(usedGlob)
if err != nil {
slog.Warn("error looking up used GPU memory", "glob", usedGlob, "error", err)
continue
}
for _, usedFile := range usedFiles {
fp, err := os.Open(usedFile)
if err != nil {
slog.Warn("failed to open sysfs node", "file", usedFile, "error", err)
if !matched {
continue
}
defer fp.Close()
data, err := io.ReadAll(fp)
// Found the matching DRM directory
slog.Debug("matched", "amdgpu", match, "drm", devDir)
totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
buf, err := os.ReadFile(totalFile)
if err != nil {
slog.Warn("failed to read sysfs node", "file", usedFile, "error", err)
continue
slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
break
}
used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
if err != nil {
slog.Warn("malformed used memory", "data", string(data), "error", err)
continue
slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
break
}
usedMemory += used
usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
usedMemory, err = getFreeMemory(usedFile)
if err != nil {
slog.Debug("failed to update used memory", "error", err)
}
break
}
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
@@ -241,18 +267,21 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: (totalMemory - usedMemory),
gpuInfo := RocmGPUInfo{
GpuInfo: GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: (totalMemory - usedMemory),
},
ID: strconv.Itoa(gpuID),
Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
},
ID: fmt.Sprintf("%d", gpuID),
Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
usedFilepath: usedFile,
}
// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -276,7 +305,7 @@ func AMDGetGPUInfo() []GpuInfo {
libDir, err = AMDValidateLibDir()
if err != nil {
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
return []GpuInfo{}
return nil
}
}
gpuInfo.DependencyPath = libDir
@@ -287,7 +316,7 @@ func AMDGetGPUInfo() []GpuInfo {
supported, err = GetSupportedGFX(libDir)
if err != nil {
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
return []GpuInfo{}
return nil
}
slog.Debug("rocm supported GPUs", "types", supported)
}
@@ -304,6 +333,11 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
}
// Check for env var workarounds
if name == "1002:687f" { // Vega RX 56
gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
}
// The GPU has passed all the verification steps and is supported
resp = append(resp, gpuInfo)
}
@@ -378,3 +412,31 @@ func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
}
return driverMajor, driverMinor, nil
}
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
if len(gpus) == 0 {
return nil
}
for i := range gpus {
usedMemory, err := getFreeMemory(gpus[i].usedFilepath)
if err != nil {
return err
}
slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(gpus[i].TotalMemory-usedMemory))
gpus[i].FreeMemory = gpus[i].TotalMemory - usedMemory
}
return nil
}
func getFreeMemory(usedFile string) (uint64, error) {
buf, err := os.ReadFile(usedFile)
if err != nil {
return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
}
usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
if err != nil {
slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
}
return usedMemory, nil
}

View File

@@ -7,8 +7,10 @@ import (
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -24,8 +26,8 @@ var (
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
)
func AMDGetGPUInfo() []GpuInfo {
resp := []GpuInfo{}
func AMDGetGPUInfo() []RocmGPUInfo {
resp := []RocmGPUInfo{}
hl, err := NewHipLib()
if err != nil {
slog.Debug(err.Error())
@@ -52,7 +54,7 @@ func AMDGetGPUInfo() []GpuInfo {
}
var supported []string
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
gfxOverride := envconfig.HsaOverrideGfxVersion
if gfxOverride == "" {
supported, err = GetSupportedGFX(libDir)
if err != nil {
@@ -65,7 +67,7 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Debug("detected hip devices", "count", count)
// TODO how to determine the underlying device ID when visible devices is causing this to subset?
for i := 0; i < count; i++ {
for i := range count {
err = hl.HipSetDevice(i)
if err != nil {
slog.Warn("set device", "id", i, "error", err)
@@ -117,21 +119,24 @@ func AMDGetGPUInfo() []GpuInfo {
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: freeMemory,
},
ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir,
MinimumMemory: rocmMinimumMemory,
Name: name,
Compute: gfx,
gpuInfo := RocmGPUInfo{
GpuInfo: GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: freeMemory,
},
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir,
MinimumMemory: rocmMinimumMemory,
Name: name,
Compute: gfx,
// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
// DriverMajor: driverMajor,
// DriverMinor: driverMinor,
// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
// DriverMajor: driverMajor,
// DriverMinor: driverMinor,
},
index: i,
}
resp = append(resp, gpuInfo)
@@ -159,3 +164,30 @@ func AMDValidateLibDir() (string, error) {
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
if len(gpus) == 0 {
return nil
}
hl, err := NewHipLib()
if err != nil {
slog.Debug(err.Error())
return nil
}
defer hl.Release()
for i := range gpus {
err := hl.HipSetDevice(gpus[i].index)
if err != nil {
return err
}
freeMemory, _, err := hl.HipMemGetInfo()
if err != nil {
slog.Warn("get mem info", "id", i, "error", err)
continue
}
slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
gpus[i].FreeMemory = freeMemory
}
return nil
}

View File

@@ -77,20 +77,27 @@ func cleanupTmpDirs() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err == nil {
pid, err := strconv.Atoi(string(raw))
if err == nil {
if proc, err := os.FindProcess(int(pid)); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
// Another running ollama, ignore this tmpdir
continue
}
}
} else {
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
}
err = os.RemoveAll(d)
if err != nil {
slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
slog.Warn("failed to read ollama.pid", "path", d, "error", err)
// No pid, ignore this tmpdir
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("failed to parse pid", "path", d, "error", err)
continue
}
proc, err := os.FindProcess(pid)
if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("found running ollama", "pid", pid, "path", d)
// Another running ollama, ignore this tmpdir
continue
}
if err := os.Remove(d); err != nil {
slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
}
}
}

View File

@@ -1,21 +1,16 @@
package gpu
import (
"log/slog"
"golang.org/x/sys/cpu"
)
func GetCPUVariant() string {
func GetCPUCapability() CPUCapability {
if cpu.X86.HasAVX2 {
slog.Debug("CPU has AVX2")
return "avx2"
return CPUCapabilityAVX2
}
if cpu.X86.HasAVX {
slog.Debug("CPU has AVX")
return "avx"
return CPUCapabilityAVX
}
slog.Debug("CPU does not have vector extensions")
// else LCD
return ""
return CPUCapabilityNone
}

View File

@@ -18,5 +18,4 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids = append(ids, info.ID)
}
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
}

View File

@@ -16,28 +16,45 @@ import (
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"unsafe"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
type handles struct {
type cudaHandles struct {
deviceCount int
cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
nvml *C.nvml_handle_t
}
type oneapiHandles struct {
oneapi *C.oneapi_handle_t
deviceCount int
}
const (
cudaMinimumMemory = 457 * format.MebiByte
rocmMinimumMemory = 457 * format.MebiByte
// TODO OneAPI minimum memory
)
var gpuMutex sync.Mutex
var (
gpuMutex sync.Mutex
bootstrapped bool
cpuCapability CPUCapability
cpus []CPUInfo
cudaGPUs []CudaGPUInfo
nvcudaLibPath string
cudartLibPath string
oneapiLibPath string
nvmlLibPath string
rocmGPUs []RocmGPUInfo
oneapiGPUs []OneapiGPUInfo
)
// With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0}
@@ -47,130 +64,113 @@ var RocmComputeMin = 9
// TODO find a better way to detect iGPU instead of minimum memory
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
var CudartLinuxGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
}
var CudartWindowsGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvcudaLinuxGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var NvcudaWindowsGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
var OneapiWindowsGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}
var OneapiLinuxGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
// Note: gpuMutex must already be held
func initGPUHandles() *handles {
func initCudaHandles() *cudaHandles {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
gpuHandles := &handles{}
var cudartMgmtName string
var cudartMgmtPatterns []string
var nvcudaMgmtName string
var nvcudaMgmtPatterns []string
var oneapiMgmtName string
var oneapiMgmtPatterns []string
tmpDir, _ := PayloadsDir()
switch runtime.GOOS {
case "windows":
cudartMgmtName = "cudart64_*.dll"
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "nvcuda.dll"
nvcudaMgmtPatterns = NvcudaWindowsGlobs
oneapiMgmtName = "ze_intel_gpu64.dll"
oneapiMgmtPatterns = OneapiWindowsGlobs
case "linux":
cudartMgmtName = "libcudart.so*"
if tmpDir != "" {
// TODO - add "payloads" for subprocess
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "libcuda.so*"
nvcudaMgmtPatterns = NvcudaLinuxGlobs
oneapiMgmtName = "libze_intel_gpu.so"
oneapiMgmtPatterns = OneapiLinuxGlobs
default:
return gpuHandles
cHandles := &cudaHandles{}
// Short Circuit if we already know which library to use
if nvmlLibPath != "" {
cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
return cHandles
}
if nvcudaLibPath != "" {
cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
return cHandles
}
if cudartLibPath != "" {
cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
return cHandles
}
slog.Debug("Detecting GPUs")
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
slog.Debug("searching for GPU discovery libraries for NVIDIA")
var cudartMgmtPatterns []string
// Aligned with driver, we can't carry as payloads
nvcudaMgmtPatterns := NvcudaGlobs
if runtime.GOOS == "windows" {
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
}
tmpDir, _ := PayloadsDir()
if tmpDir != "" {
// TODO - add "payloads" for subprocess
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
if len(NvmlGlobs) > 0 {
nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
if len(nvmlLibPaths) > 0 {
nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
if nvml != nil {
slog.Debug("nvidia-ml loaded", "library", libPath)
cHandles.nvml = nvml
nvmlLibPath = libPath
}
}
}
nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
if len(nvcudaLibPaths) > 0 {
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if nvcuda != nil {
slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
gpuHandles.nvcuda = nvcuda
gpuHandles.deviceCount = deviceCount
return gpuHandles
cHandles.nvcuda = nvcuda
cHandles.deviceCount = deviceCount
nvcudaLibPath = libPath
return cHandles
}
}
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
if len(cudartLibPaths) > 0 {
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
if cudart != nil {
slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
gpuHandles.cudart = cudart
gpuHandles.deviceCount = deviceCount
return gpuHandles
cHandles.cudart = cudart
cHandles.deviceCount = deviceCount
cudartLibPath = libPath
return cHandles
}
}
oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
return cHandles
}
// Note: gpuMutex must already be held
func initOneAPIHandles() *oneapiHandles {
oHandles := &oneapiHandles{}
// Short Circuit if we already know which library to use
if oneapiLibPath != "" {
oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
return oHandles
}
oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
if len(oneapiLibPaths) > 0 {
deviceCount, oneapi, libPath := LoadOneapiMgmt(oneapiLibPaths)
if oneapi != nil {
slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
gpuHandles.oneapi = oneapi
gpuHandles.deviceCount = deviceCount
return gpuHandles
}
oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
}
return gpuHandles
return oHandles
}
func GetCPUInfo() GpuInfoList {
gpuMutex.Lock()
if !bootstrapped {
gpuMutex.Unlock()
GetGPUInfo()
} else {
gpuMutex.Unlock()
}
return GpuInfoList{cpus[0].GpuInfo}
}
func GetGPUInfo() GpuInfoList {
@@ -178,124 +178,255 @@ func GetGPUInfo() GpuInfoList {
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
gpuHandles := initGPUHandles()
needRefresh := true
var cHandles *cudaHandles
var oHandles *oneapiHandles
defer func() {
if gpuHandles.cudart != nil {
C.cudart_release(*gpuHandles.cudart)
if cHandles != nil {
if cHandles.cudart != nil {
C.cudart_release(*cHandles.cudart)
}
if cHandles.nvcuda != nil {
C.nvcuda_release(*cHandles.nvcuda)
}
if cHandles.nvml != nil {
C.nvml_release(*cHandles.nvml)
}
}
if gpuHandles.nvcuda != nil {
C.nvcuda_release(*gpuHandles.nvcuda)
if oHandles != nil {
if oHandles.oneapi != nil {
// TODO - is this needed?
C.oneapi_release(*oHandles.oneapi)
}
}
}()
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
cpuVariant := GetCPUVariant()
if cpuVariant == "" && runtime.GOARCH == "amd64" {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
if !bootstrapped {
slog.Debug("Detecting GPUs")
needRefresh = false
cpuCapability = GetCPUCapability()
var memInfo C.mem_info_t
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Dir(envconfig.RunnersDir)
}
var memInfo C.mem_info_t
resp := []GpuInfo{}
// NVIDIA first
for i := 0; i < gpuHandles.deviceCount; i++ {
// TODO once we support CPU compilation variants of GPU libraries refine this...
if cpuVariant == "" && runtime.GOARCH == "amd64" {
continue
mem, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
gpuInfo := GpuInfo{
Library: "cuda",
cpus = []CPUInfo{CPUInfo{
GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu",
Variant: cpuCapability,
ID: "0",
},
}}
// Fallback to CPU mode if we're lacking required vector extensions on x86
if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
bootstrapped = true
// No need to do any GPU discovery, since we can't run on them
return GpuInfoList{cpus[0].GpuInfo}
}
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
}
// Load ALL libraries
cHandles = initCudaHandles()
// NVIDIA
for i := range cHandles.deviceCount {
if cHandles.cudart != nil || cHandles.nvcuda != nil {
gpuInfo := CudaGPUInfo{
GpuInfo: GpuInfo{
Library: "cuda",
},
index: i,
}
var driverMajor int
var driverMinor int
if cHandles.cudart != nil {
C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(cHandles.nvcuda.driver_major)
driverMinor = int(cHandles.nvcuda.driver_minor)
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
}
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
continue
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
cudaGPUs = append(cudaGPUs, gpuInfo)
}
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
}
// Intel
if envconfig.IntelGpu {
oHandles = initOneAPIHandles()
// On windows we bundle the oneapi library one level above the runner dir
depPath = ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
}
for d := range oHandles.oneapi.num_drivers {
if oHandles.oneapi == nil {
// shouldn't happen
slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
continue
}
devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
for i := range devCount {
gpuInfo := OneapiGPUInfo{
GpuInfo: GpuInfo{
Library: "oneapi",
},
driverIndex: int(d),
gpuIndex: int(i),
}
// TODO - split bootstrapping from updating free memory
C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
// TODO - convert this to MinimumMemory based on testing...
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
memInfo.free = C.uint64_t(totalFreeMem)
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DependencyPath = depPath
oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
}
}
rocmGPUs = AMDGetGPUInfo()
bootstrapped = true
}
// For detected GPUs, load library if not loaded
// Refresh free memory usage
if needRefresh {
mem, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
} else {
slog.Debug("updating system memory data",
slog.Group(
"before",
"total", format.HumanBytes2(cpus[0].TotalMemory),
"free", format.HumanBytes2(cpus[0].FreeMemory),
),
slog.Group(
"now",
"total", format.HumanBytes2(mem.TotalMemory),
"free", format.HumanBytes2(mem.FreeMemory),
),
)
cpus[0].FreeMemory = mem.FreeMemory
}
var memInfo C.mem_info_t
if cHandles == nil && len(cudaGPUs) > 0 {
cHandles = initCudaHandles()
}
for i, gpu := range cudaGPUs {
if cHandles.nvml != nil {
C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
} else if cHandles.cudart != nil {
C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
} else if cHandles.nvcuda != nil {
C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
memInfo.used = memInfo.total - memInfo.free
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
// shouldn't happen
slog.Warn("no valid cuda library loaded to refresh vram usage")
break
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
continue
}
if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
if memInfo.free == 0 {
slog.Warn("error looking up nvidia GPU memory")
continue
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = int(driverMajor)
gpuInfo.DriverMinor = int(driverMinor)
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
slog.Debug("updating cuda memory data",
"gpu", gpu.ID,
"name", gpu.Name,
slog.Group(
"before",
"total", format.HumanBytes2(gpu.TotalMemory),
"free", format.HumanBytes2(gpu.FreeMemory),
),
slog.Group(
"now",
"total", format.HumanBytes2(uint64(memInfo.total)),
"free", format.HumanBytes2(uint64(memInfo.free)),
"used", format.HumanBytes2(uint64(memInfo.used)),
),
)
cudaGPUs[i].FreeMemory = uint64(memInfo.free)
}
if gpuHandles.oneapi != nil {
gpuInfo := GpuInfo{
Library: "oneapi",
if oHandles == nil && len(oneapiGPUs) > 0 {
oHandles = initOneAPIHandles()
}
for i, gpu := range oneapiGPUs {
if oHandles.oneapi == nil {
// shouldn't happen
slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
continue
}
C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
// TODO - convert this to MinimumMemory based on testing...
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
memInfo.free = C.uint64_t(totalFreeMem)
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = strconv.Itoa(i)
resp = append(resp, gpuInfo)
oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
}
err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
if err != nil {
slog.Debug("problem refreshing ROCm free memory", "error", err)
}
}
// Then AMD
resp = append(resp, AMDGetGPUInfo()...)
resp := []GpuInfo{}
for _, gpu := range cudaGPUs {
resp = append(resp, gpu.GpuInfo)
}
for _, gpu := range rocmGPUs {
resp = append(resp, gpu.GpuInfo)
}
for _, gpu := range oneapiGPUs {
resp = append(resp, gpu.GpuInfo)
}
if len(resp) == 0 {
C.cpu_check_ram(&memInfo)
if memInfo.err != nil {
slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
return resp
}
gpuInfo := GpuInfo{
Library: "cpu",
Variant: cpuVariant,
}
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
resp = append(resp, gpuInfo)
resp = append(resp, cpus[0].GpuInfo)
}
return resp
}
func GetCPUMem() (memInfo, error) {
var ret memInfo
var info C.mem_info_t
C.cpu_check_ram(&info)
if info.err != nil {
defer C.free(unsafe.Pointer(info.err))
return ret, fmt.Errorf(C.GoString(info.err))
}
ret.FreeMemory = uint64(info.free)
ret.TotalMemory = uint64(info.total)
return ret, nil
}
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
@@ -326,6 +457,7 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
// Nvidia PhysX known to return bogus results
if strings.Contains(pattern, "PhysX") {
slog.Debug("skipping PhysX cuda library path", "path", pattern)
continue
}
// Ignore glob discovery errors
matches, _ := filepath.Glob(pattern)
@@ -391,8 +523,26 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
return 0, nil, ""
}
func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
var resp C.nvml_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range nvmlLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.nvml_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.ch, libPath
}
}
return nil, ""
}
func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
var resp C.oneapi_init_resp_t
num_devices := 0
resp.oh.verbose = getVerboseState()
for _, libPath := range oneapiLibPaths {
lib := C.CString(libPath)
@@ -402,7 +552,10 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
return int(resp.num_devices), &resp.oh, libPath
for i := range resp.oh.num_drivers {
num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
}
return num_devices, &resp.oh, libPath
}
}
return 0, nil, ""

View File

@@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{
{
Library: "cpu",
Variant: GetCPUVariant(),
Variant: GetCPUCapability(),
memInfo: mem,
},
}
@@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{info}
}
func GetCPUInfo() GpuInfoList {
mem, _ := GetCPUMem()
return []GpuInfo{
{
Library: "cpu",
Variant: GetCPUCapability(),
memInfo: mem,
},
}
}
func GetCPUMem() (memInfo, error) {
return memInfo{
TotalMemory: uint64(C.getPhysicalMemory()),

View File

@@ -47,6 +47,7 @@ typedef struct mem_info {
char gpu_name[GPU_NAME_LEN];
uint64_t total;
uint64_t free;
uint64_t used;
// Compute Capability
int major;
@@ -62,6 +63,7 @@ void cpu_check_ram(mem_info_t *resp);
#include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#include "gpu_info_nvml.h"
#include "gpu_info_oneapi.h"
#endif // __GPU_INFO_H__

View File

@@ -1,45 +0,0 @@
#include "gpu_info.h"
// Fallbacks for CPU mode
#ifdef _WIN32
#include <sysinfoapi.h>
void cpu_check_ram(mem_info_t *resp) {
resp->err = NULL;
MEMORYSTATUSEX info;
info.dwLength = sizeof(info);
if (GlobalMemoryStatusEx(&info) != 0) {
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
} else {
resp->err = LOAD_ERR();
}
return;
}
#elif __linux__
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
void cpu_check_ram(mem_info_t *resp) {
struct sysinfo info;
resp->err = NULL;
if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno));
} else {
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
}
return;
}
#elif __APPLE__
// TODO consider an Apple implementation that does something useful
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else
#error "Unsupported platform"
#endif

View File

@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
@@ -94,7 +94,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
}
void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
cudartMemory_t memInfo = {0,0,0};
cudartReturn_t ret;
@@ -166,9 +166,11 @@ void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
resp->total = memInfo.total;
resp->free = memInfo.free;
resp->used = memInfo.used;
LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
}

View File

@@ -140,7 +140,8 @@ typedef struct cudart_init_resp {
} cudart_init_resp_t;
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
// TODO - if we keep this library longer term, add cudart_get_free
void cudart_release(cudart_handle_t ch);
#endif // __GPU_INFO_CUDART_H__

View File

@@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*l[i].p) {
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
@@ -96,7 +96,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
}
const int buflen = 256;
void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
nvcudaMemory_t memInfo = {0,0};
CUresult ret;
@@ -168,7 +168,7 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
resp->err = strdup(buf);
return;
}
@@ -193,7 +193,42 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release primary device context %d", ret);
LOG(1, "nvcuda failed to release device context %d", ret);
}
}
void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
CUresult ret;
CUcontext ctx = NULL;
CUdevice device = -1;
*free = 0;
*total = 0;
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda device failed to initialize");
return;
}
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to get device context %d", ret);
return;
}
ret = (*h.cuMemGetInfo_v2)(free, total);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda device memory info lookup failure %d", ret);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
return;
}
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release device context %d", ret);
}
}

View File

@@ -67,7 +67,8 @@ typedef struct nvcuda_init_resp {
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free, uint64_t *total);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__

104
gpu/gpu_info_nvml.c Normal file
View File

@@ -0,0 +1,104 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_nvml.h"
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvml_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
}
void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
if (ret != NVML_SUCCESS) {
LOG(1, "unable to get device handle %d: %d", device_id, ret);
*free = 0;
return;
}
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
*free = 0;
return;
}
*free = memInfo.free;
*total = memInfo.total;
*used = memInfo.used;
}
void nvml_release(nvml_handle_t h) {
LOG(h.verbose, "releasing nvml library\n");
nvmlReturn_t ret;
ret = (*h.nvmlShutdown)();
if (ret != NVML_SUCCESS) {
LOG(1, "error during nvmlShutdown %d", ret);
}
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
#endif // __APPLE__

48
gpu/gpu_info_nvml.h Normal file
View File

@@ -0,0 +1,48 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct nvml_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
} nvml_handle_t;
typedef struct nvml_init_resp {
char *err; // If err is non-null handle is invalid
nvml_handle_t ch;
} nvml_init_resp_t;
typedef struct nvml_compute_capability {
char *err;
int major;
int minor;
} nvml_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__

View File

@@ -4,15 +4,17 @@
#include <string.h>
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
{
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
ze_result_t ret;
resp->err = NULL;
resp->oh.devices = NULL;
resp->oh.num_devices = NULL;
resp->oh.drivers = NULL;
resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup
{
int i, d;
struct lookup {
char *s;
void **p;
} l[] = {
@@ -28,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
};
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
if (!resp->oh.handle)
{
if (!resp->oh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Intel GPUs: %s\n",
@@ -44,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
"wiring Level-Zero management library functions in %s\n",
oneapi_lib_path);
for (i = 0; l[i].s != NULL; i++)
{
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!l[i].p)
{
if (!*(l[i].p)) {
resp->oh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -63,23 +62,70 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
}
}
LOG(resp->oh.verbose, "calling zesInit\n");
ret = (*resp->oh.zesInit)(0);
if (ret != ZE_RESULT_SUCCESS)
{
LOG(resp->oh.verbose, "zesInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->oh.handle);
resp->oh.handle = NULL;
snprintf(buf, buflen, "oneapi vram init failure: %d", ret);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
(*resp->oh.zesDriverGet)(&resp->num_devices, NULL);
LOG(resp->oh.verbose, "calling zesDriverGet\n");
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
resp->oh.devices =
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
resp->oh.devices[d] =
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
ret = (*resp->oh.zesDeviceGet)(
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
}
return;
}
void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
{
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp) {
ze_result_t ret;
resp->err = NULL;
uint64_t totalMem = 0;
@@ -88,127 +134,126 @@ void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
char buf[buflen + 1];
int i, d, m;
if (h.handle == NULL)
{
if (h.handle == NULL) {
resp->err = strdup("Level-Zero handle not initialized");
return;
}
uint32_t driversCount = 0;
ret = (*h.zesDriverGet)(&driversCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get driver count: %d", ret);
resp->err = strdup(buf);
if (driver > h.num_drivers || device > h.num_devices[driver]) {
resp->err = strdup("driver of device index out of bounds");
return;
}
LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount);
zes_driver_handle_t *allDrivers =
malloc(driversCount * sizeof(zes_driver_handle_t));
(*h.zesDriverGet)(&driversCount, allDrivers);
resp->total = 0;
resp->free = 0;
for (d = 0; d < driversCount; d++)
{
uint32_t deviceCount = 0;
ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get device count: %d", ret);
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
return;
}
snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
// (this is probably wrong...)
// TODO - the driver isn't included - what if there are multiple drivers?
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
props.modelName);
LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
props.brandName);
LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
props.vendorName);
LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
props.serialNumber);
LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
props.boardNumber);
}
// TODO
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
NULL);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
for (m = 0; m < memCount; m++) {
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get memory state: %x", ret);
resp->err = strdup(buf);
free(allDrivers);
free(mems);
return;
}
LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount);
zes_device_handle_t *devices =
malloc(deviceCount * sizeof(zes_device_handle_t));
(*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices);
for (i = 0; i < deviceCount; i++)
{
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(devices[i], &props);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
return;
}
if (h.verbose)
{
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d] oneAPI device name: %s\n", i,
props.modelName);
LOG(h.verbose, "[%d] oneAPI brand: %s\n", i,
props.brandName);
LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i,
props.vendorName);
LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i,
props.serialNumber);
LOG(h.verbose, "[%d] oneAPI board number: %s\n", i,
props.boardNumber);
}
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen,
"unable to enumerate Level-Zero memory modules: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems);
for (m = 0; m < memCount; m++)
{
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get memory state: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
free(mems);
return;
}
resp->total += state.size;
resp->free += state.free;
}
free(mems);
}
free(devices);
resp->total += state.size;
resp->free += state.free;
}
free(allDrivers);
free(mems);
}
void oneapi_release(oneapi_handle_t h) {
int d;
LOG(h.verbose, "releasing oneapi library\n");
for (d = 0; d < h.num_drivers; d++) {
if (h.devices != NULL && h.devices[d] != NULL) {
free(h.devices[d]);
}
}
if (h.devices != NULL) {
free(h.devices);
h.devices = NULL;
}
if (h.num_devices != NULL) {
free(h.num_devices);
h.num_devices = NULL;
}
if (h.drivers != NULL) {
free(h.drivers);
h.drivers = NULL;
}
h.num_drivers = 0;
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
if (h.handle == NULL || h.num_devices == NULL) {
return 0;
}
if (driver > h.num_drivers) {
return 0;
}
return (int)h.num_devices[driver];
}
#endif // __APPLE__

View File

@@ -9,8 +9,7 @@
#define ZE_BIT(_i) (1 << _i)
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum ze_result_t
{
typedef enum ze_result_t {
ZE_RESULT_SUCCESS = 0,
// Other values omitted for now...
} ze_result_t;
@@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
typedef struct _zes_device_handle_t *zes_device_handle_t;
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
typedef enum _ze_structure_type_t
{
typedef enum _ze_structure_type_t {
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_structure_type_t;
typedef enum _zes_structure_type_t
{
typedef enum _zes_structure_type_t {
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
@@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_structure_type_t;
typedef enum _zes_mem_type_t
{
typedef enum _zes_mem_type_t {
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_mem_type_t;
typedef enum _zes_mem_loc_t
{
typedef enum _zes_mem_loc_t {
ZES_MEM_LOC_SYSTEM = 0,
ZES_MEM_LOC_DEVICE = 1,
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
} zes_mem_loc_t;
typedef enum _zes_mem_health_t
{
typedef enum _zes_mem_health_t {
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
} zes_mem_health_t;
typedef struct _ze_device_uuid_t
{
typedef struct _ze_device_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} ze_device_uuid_t;
typedef struct _zes_uuid_t
{
typedef struct _zes_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} zes_uuid_t;
typedef enum _ze_device_type_t
{
typedef enum _ze_device_type_t {
ZE_DEVICE_TYPE_GPU = 1,
ZE_DEVICE_TYPE_CPU = 2,
ZE_DEVICE_TYPE_FPGA = 3,
@@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_device_type_t;
typedef enum _zes_device_type_t
{
typedef enum _zes_device_type_t {
ZES_DEVICE_TYPE_GPU = 1,
ZES_DEVICE_TYPE_CPU = 2,
ZES_DEVICE_TYPE_FPGA = 3,
@@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
} zes_device_type_t;
typedef uint32_t ze_device_property_flags_t;
typedef enum _ze_device_property_flag_t
{
typedef enum _ze_device_property_flag_t {
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
} ze_device_property_flag_t;
typedef uint32_t zes_device_property_flags_t;
typedef enum _zes_device_property_flag_t
{
typedef enum _zes_device_property_flag_t {
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} zes_device_property_flag_t;
typedef struct _ze_device_properties_t
{
typedef struct _ze_device_properties_t {
ze_structure_type_t stype;
void *pNext;
ze_device_type_t type;
@@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
char name[ZE_MAX_DEVICE_NAME];
} ze_device_properties_t;
typedef struct _zes_device_properties_t
{
typedef struct _zes_device_properties_t {
zes_structure_type_t stype;
void *pNext;
ze_device_properties_t core;
@@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
char driverVersion[ZES_STRING_PROPERTY_SIZE];
} zes_device_properties_t;
typedef struct _zes_device_ext_properties_t
{
typedef struct _zes_device_ext_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_uuid_t uuid;
@@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
zes_device_property_flags_t flags;
} zes_device_ext_properties_t;
typedef struct _zes_mem_properties_t
{
typedef struct _zes_mem_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_mem_type_t type;
@@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
int32_t numChannels;
} zes_mem_properties_t;
typedef struct _zes_mem_state_t
{
typedef struct _zes_mem_state_t {
zes_structure_type_t stype;
const void *pNext;
zes_mem_health_t health;
@@ -171,10 +154,19 @@ typedef struct _zes_mem_state_t
uint64_t size;
} zes_mem_state_t;
typedef struct oneapi_handle
{
typedef struct oneapi_handle {
void *handle;
uint16_t verbose;
uint32_t num_drivers;
zes_driver_handle_t *drivers;
uint32_t *num_devices;
zes_device_handle_t **devices;
// TODO Driver major, minor information
// int driver_major;
// int driver_minor;
ze_result_t (*zesInit)(int);
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
@@ -191,21 +183,21 @@ typedef struct oneapi_handle
} oneapi_handle_t;
typedef struct oneapi_init_resp
{
typedef struct oneapi_init_resp {
char *err; // If err is non-null handle is invalid
int num_devices;
oneapi_handle_t oh;
} oneapi_init_resp_t;
typedef struct oneapi_version_resp
{
typedef struct oneapi_version_resp {
ze_result_t status;
char *str; // Contains version or error string if status != 0
} oneapi_version_resp_t;
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp);
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp);
void oneapi_release(oneapi_handle_t h);
int oneapi_get_device_count(oneapi_handle_t h, int driver);
#endif // __GPU_INFO_INTEL_H__
#endif // __APPLE__

89
gpu/gpu_linux.go Normal file
View File

@@ -0,0 +1,89 @@
package gpu
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/ollama/ollama/format"
)
var CudartGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
}
var NvmlGlobs = []string{}
var NvcudaGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var OneapiGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}
var CudartMgmtName = "libcudart.so*"
var NvcudaMgmtName = "libcuda.so*"
var NvmlMgmtName = "" // not currently wired on linux
var OneapiMgmtName = "libze_intel_gpu.so"
func GetCPUMem() (memInfo, error) {
var mem memInfo
var total, available, free, buffers, cached uint64
f, err := os.Open("/proc/meminfo")
if err != nil {
return mem, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
line := s.Text()
switch {
case strings.HasPrefix(line, "MemTotal:"):
_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
case strings.HasPrefix(line, "MemAvailable:"):
_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
case strings.HasPrefix(line, "MemFree:"):
_, err = fmt.Sscanf(line, "MemFree:%d", &free)
case strings.HasPrefix(line, "Buffers:"):
_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
case strings.HasPrefix(line, "Cached:"):
_, err = fmt.Sscanf(line, "Cached:%d", &cached)
default:
continue
}
if err != nil {
return mem, err
}
if total > 0 && available > 0 {
mem.TotalMemory = total * format.KibiByte
mem.FreeMemory = available * format.KibiByte
return mem, nil
}
}
mem.TotalMemory = total * format.KibiByte
mem.FreeMemory = (free + buffers + cached) * format.KibiByte
return mem, nil
}

View File

@@ -5,11 +5,12 @@ import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.Greater(t, len(info), 0)
assert.NotEmpty(t, len(info))
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
if info[0].Library != "cpu" {
assert.Greater(t, info[0].TotalMemory, uint64(0))
@@ -19,7 +20,7 @@ func TestBasicGetGPUInfo(t *testing.T) {
func TestCPUMemInfo(t *testing.T) {
info, err := GetCPUMem()
assert.NoError(t, err)
require.NoError(t, err)
switch runtime.GOOS {
case "darwin":
t.Skip("CPU memory not populated on darwin")

55
gpu/gpu_windows.go Normal file
View File

@@ -0,0 +1,55 @@
package gpu
import (
"fmt"
"syscall"
"unsafe"
)
type MEMORYSTATUSEX struct {
length uint32
MemoryLoad uint32
TotalPhys uint64
AvailPhys uint64
TotalPageFile uint64
AvailPageFile uint64
TotalVirtual uint64
AvailVirtual uint64
AvailExtendedVirtual uint64
}
var (
k32 = syscall.NewLazyDLL("kernel32.dll")
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
)
var CudartGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvmlGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var NvcudaGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
var OneapiGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}
var CudartMgmtName = "cudart64_*.dll"
var NvcudaMgmtName = "nvcuda.dll"
var NvmlMgmtName = "nvml.dll"
var OneapiMgmtName = "ze_intel_gpu64.dll"
func GetCPUMem() (memInfo, error) {
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
if r1 == 0 {
return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
}
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
}

View File

@@ -18,7 +18,7 @@ type GpuInfo struct {
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`
Variant CPUCapability `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
@@ -26,6 +26,9 @@ type GpuInfo struct {
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key,value]
EnvWorkarounds [][2]string `json:"envs,omitempty"`
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
@@ -38,6 +41,30 @@ type GpuInfo struct {
// TODO other performance capability info to help in scheduling decisions
}
type CPUInfo struct {
GpuInfo
}
type CudaGPUInfo struct {
GpuInfo
index int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo
type RocmGPUInfo struct {
GpuInfo
usedFilepath string //nolint:unused,nolintlint
index int //nolint:unused,nolintlint
}
type RocmGPUInfoList []RocmGPUInfo
type OneapiGPUInfo struct {
GpuInfo
driverIndex int //nolint:unused,nolintlint
gpuIndex int //nolint:unused,nolintlint
}
type OneapiGPUInfoList []OneapiGPUInfo
type GpuInfoList []GpuInfo
// Split up the set of gpu info's by Library and variant
@@ -47,8 +74,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
for _, info := range l {
found := false
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
if info.Variant != CPUCapabilityNone {
requested += "_" + info.Variant.String()
}
for i, lib := range libs {
if lib == requested {
@@ -86,3 +113,26 @@ type ByFreeMemory []GpuInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
type CPUCapability uint32
// Override at build time when building base GPU runners
var GPURunnerCPUCapability = CPUCapabilityAVX
const (
CPUCapabilityNone CPUCapability = iota
CPUCapabilityAVX
CPUCapabilityAVX2
// TODO AVX512
)
func (c CPUCapability) String() string {
switch c {
case CPUCapabilityAVX:
return "avx"
case CPUCapabilityAVX2:
return "avx2"
default:
return "no vector extensions"
}
}

View File

@@ -19,17 +19,19 @@ func TestMultiModelConcurrency(t *testing.T) {
var (
req = [2]api.GenerateRequest{
{
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "tinydolphin",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
Model: "tinydolphin",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
@@ -38,42 +40,64 @@ func TestMultiModelConcurrency(t *testing.T) {
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims"},
[]string{"england", "english", "massachusetts", "pilgrims", "british"},
}
)
var wg sync.WaitGroup
wg.Add(len(req))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
for i := 0; i < len(req); i++ {
require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
}
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
GenerateTestHelper(ctx, t, req[i], resp[i])
DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
}(i)
}
wg.Wait()
}
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
req, resp := GenerateRequests()
reqLimit := len(req)
iterLimit := 5
vram := os.Getenv("OLLAMA_MAX_VRAM")
if vram != "" {
max, err := strconv.ParseUint(vram, 10, 64)
require.NoError(t, err)
// Don't hammer on small VRAM cards...
if max < 4*1024*1024*1024 {
reqLimit = min(reqLimit, 2)
iterLimit = 2
}
}
ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req, resp := GenerateRequests()
// Get the server running (if applicable) warm the model up with a single initial request
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
var wg sync.WaitGroup
wg.Add(len(req))
for i := 0; i < len(req); i++ {
wg.Add(reqLimit)
for i := 0; i < reqLimit; i++ {
go func(i int) {
defer wg.Done()
for j := 0; j < 5; j++ {
for j := 0; j < iterLimit; j++ {
slog.Info("Starting", "req", i, "iter", j)
// On slower GPUs it can take a while to process the 4 concurrent requests
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
}
}(i)
}
@@ -221,5 +245,23 @@ func TestMultiModelStress(t *testing.T) {
}
}(i)
}
go func() {
for {
time.Sleep(2 * time.Second)
select {
case <-ctx.Done():
return
default:
models, err := client.ListRunning(ctx)
if err != nil {
slog.Warn("failed to list running models", "error", err)
continue
}
for _, m := range models.Models {
slog.Info("loaded model snapshot", "model", m)
}
}
}
}()
wg.Wait()
}

View File

@@ -11,7 +11,8 @@ import (
)
func TestContextExhaustion(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
// Longer needed for small footprint GPUs
ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{

View File

@@ -32,7 +32,11 @@ func TestIntegrationMultimodal(t *testing.T) {
resp := "the ollam"
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
GenerateTestHelper(ctx, t, req, []string{resp})
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
require.NoError(t, PullIfMissing(ctx, client, req.Model))
// llava models on CPU can be quite slow to start,
DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
}
const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb

View File

@@ -140,7 +140,7 @@ func PullIfMissing(ctx context.Context, client *api.Client, modelName string) er
showCtx, cancel := context.WithDeadlineCause(
ctx,
time.Now().Add(5*time.Second),
time.Now().Add(10*time.Second),
fmt.Errorf("show for existing model %s took too long", modelName),
)
defer cancel()
@@ -287,41 +287,46 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
func GenerateRequests() ([]api.GenerateRequest, [][]string) {
return []api.GenerateRequest{
{
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "why is the color of dirt brown?",
Stream: &stream,
Model: "orca-mini",
Prompt: "why is the color of dirt brown?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
Model: "orca-mini",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "what is the origin of independence day?",
Stream: &stream,
Model: "orca-mini",
Prompt: "what is the origin of independence day?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "what is the composition of air?",
Stream: &stream,
Model: "orca-mini",
Prompt: "what is the composition of air?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
@@ -331,7 +336,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
[][]string{
[]string{"sunlight"},
[]string{"soil", "organic", "earth", "black", "tan"},
[]string{"england", "english", "massachusetts", "pilgrims"},
[]string{"england", "english", "massachusetts", "pilgrims", "british"},
[]string{"fourth", "july", "declaration", "independence"},
[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
}

View File

@@ -56,7 +56,6 @@ struct server_params {
std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
std::string chat_template = "";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
@@ -140,7 +139,6 @@ struct server_slot {
std::vector<llama_token> cache_tokens;
std::vector<completion_token_output> generated_token_probs;
bool infill = false;
bool embedding = false;
bool has_next_token = true;
bool truncated = false;
@@ -187,7 +185,6 @@ struct server_slot {
n_past = 0;
n_sent_text = 0;
n_sent_token_probs = 0;
infill = false;
ga_i = 0;
n_past_se = 0;
@@ -361,7 +358,6 @@ struct llama_server_context
// slots / clients
std::vector<server_slot> slots;
json default_generation_settings_for_props;
llama_server_queue queue_tasks;
llama_server_response queue_results;
@@ -430,16 +426,6 @@ struct llama_server_context
return true;
}
void validate_model_chat_template(server_params & sparams) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) {
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
sparams.chat_template = "chatml";
}
}
void initialize() {
// create slots
all_slots_are_idle = true;
@@ -485,9 +471,6 @@ struct llama_server_context
slots.push_back(slot);
}
default_generation_settings_for_props = get_formated_generation(slots.front());
default_generation_settings_for_props["seed"] = -1;
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
}
@@ -586,7 +569,7 @@ struct llama_server_context
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.seed = json_value(data, "seed", default_params.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
@@ -600,16 +583,6 @@ struct llama_server_context
slot->params.n_predict = slot->n_predict;
}
// infill
if (data.count("input_prefix") != 0)
{
slot->params.input_prefix = data["input_prefix"];
}
else
{
slot->params.input_prefix = "";
}
if (data.count("input_suffix") != 0)
{
slot->params.input_suffix = data["input_suffix"];
@@ -823,7 +796,6 @@ struct llama_server_context
llama_sampling_free(slot->ctx_sampling);
}
slot->ctx_sampling = llama_sampling_init(slot->sparams);
llama_set_rng_seed(ctx, slot->params.seed);
slot->command = LOAD_PROMPT;
all_slots_are_idle = false;
@@ -847,7 +819,7 @@ struct llama_server_context
system_tokens.clear();
if (!system_prompt.empty()) {
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
llama_batch_clear(batch);
@@ -897,15 +869,6 @@ struct llama_server_context
system_need_update = true;
}
void system_prompt_process(const json &sys_props) {
system_prompt = sys_props.value("prompt", "");
name_user = sys_props.value("anti_prompt", "");
name_assistant = sys_props.value("assistant_name", "");
system_prompt_notify();
}
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
const stop_type type, server_slot &slot)
{
@@ -1263,13 +1226,12 @@ struct llama_server_context
queue_results.send(res);
}
void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
void request_completion(int task_id, json data, bool embedding, int multitask_id)
{
task_server task;
task.id = task_id;
task.target_id = 0;
task.data = std::move(data);
task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = TASK_TYPE_COMPLETION;
task.multitask_id = multitask_id;
@@ -1415,8 +1377,8 @@ struct llama_server_context
json subtask_data = multiprompt_task.data;
subtask_data["prompt"] = subtask_data["prompt"][i];
// subtasks inherit everything else (infill mode, embedding mode, etc.)
request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
// subtasks inherit everything else (embedding mode, etc.)
request_completion(subtask_ids[i], subtask_data, multiprompt_task.embedding_mode, multitask_id);
}
}
@@ -1434,26 +1396,8 @@ struct llama_server_context
break;
}
if (task.data.contains("system_prompt"))
{
if (!all_slots_are_idle) {
send_error(task, "system prompt can only be updated when all slots are idle");
break;
}
system_prompt_process(task.data["system_prompt"]);
// reset cache_tokens for all slots
for (server_slot &slot : slots)
{
slot.cache_tokens.clear();
slot.n_past = 0;
slot.n_past_se = 0;
}
}
slot->reset();
slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id;
slot->multitask_id = task.multitask_id;
@@ -1679,8 +1623,7 @@ struct llama_server_context
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
// empty prompt passed -> release the slot and send empty response
// note: infill mode allows empty prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
{
slot.release();
slot.print_timings();
@@ -1697,33 +1640,7 @@ struct llama_server_context
slot.t_start_process_prompt = ggml_time_us();
slot.t_start_genereration = 0;
if (slot.infill)
{
bool suff_rm_leading_spc = true;
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
{
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
auto prefix_tokens = tokenize(slot.params.input_prefix, false);
auto suffix_tokens = tokenize(slot.params.input_suffix, false);
const int space_token = 29871; // TODO: this should not be hardcoded
if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
suffix_tokens.erase(suffix_tokens.begin());
}
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(model));
prompt_tokens = prefix_tokens;
}
else
{
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
}
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
slot.n_prompt_tokens = prompt_tokens.size();
@@ -2130,8 +2047,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf("\n");
}
static void server_params_parse(int argc, char **argv, server_params &sparams,
gpt_params &params, llama_server_context& llama)
static void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
{
gpt_params default_params;
server_params default_sparams;
@@ -2408,9 +2324,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
#ifndef GGML_USE_CUDA
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUDA
}
else if (arg == "--tensor-split" || arg == "-ts")
{
@@ -2419,7 +2335,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@@ -2440,8 +2356,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
}
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS
LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUDA
}
else if (arg == "--main-gpu" || arg == "-mg")
{
@@ -2450,7 +2366,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
@@ -2546,27 +2462,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.n_predict = std::stoi(argv[i]);
}
else if (arg == "-spf" || arg == "--system-prompt-file")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::string systm_content;
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(systm_content)
);
llama.system_prompt_process(json::parse(systm_content));
}
else if (arg == "-ctk" || arg == "--cache-type-k") {
params.cache_type_k = argv[++i];
}
@@ -2629,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
sparams.chat_template = argv[i];
}
else if (arg == "--override-kv")
{
@@ -2818,7 +2712,7 @@ int main(int argc, char **argv) {
// struct that contains llama context and inference
llama_server_context llama;
server_params_parse(argc, argv, sparams, params, llama);
server_params_parse(argc, argv, sparams, params);
if (params.model_alias == "unknown")
{
@@ -3102,11 +2996,6 @@ int main(int argc, char **argv) {
}
const auto model_meta = llama.model_meta();
if (sparams.chat_template.empty()) { // custom chat template is not supplied
// check if the template comes with the model is supported by us
llama.validate_model_chat_template(sparams);
}
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
@@ -3150,7 +3039,7 @@ int main(int argc, char **argv) {
json data = json::parse(req.body);
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, data, false, false, -1);
llama.request_completion(task_id, data, false, -1);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.queue_results.recv(task_id);
@@ -3272,7 +3161,7 @@ int main(int argc, char **argv) {
// create and queue the task
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, true, -1);
// get the result
task_result result = llama.queue_results.recv(task_id);

View File

@@ -18,7 +18,7 @@ sign() {
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
case "${GOARCH}" in
"amd64")
@@ -27,65 +27,68 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
if [ -z "$OLLAMA_SKIP_CPU_GENERATE" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
fi
;;
"arm64")
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
fi
;;
*)
echo "GOARCH must be set"

View File

@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
echo "Building custom CUDA GPU"
else
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
fi
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
@@ -211,7 +211,7 @@ if [ -z "${ONEAPI_ROOT}" ]; then
ONEAPI_ROOT=/opt/intel/oneapi
fi
if [ -d "${ONEAPI_ROOT}" ]; then
if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
echo "OneAPI libraries detected - building dynamic OneAPI library"
init_vars
source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI

View File

@@ -39,7 +39,8 @@ function init_vars {
}
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off"
"-DLLAMA_NATIVE=off",
"-DLLAMA_OPENMP=off"
)
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -122,8 +123,13 @@ function build {
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($cmakeDefs -contains "-G") {
$extra=@("-j8")
} else {
$extra= @("--", "/p:CL_MPcount=8")
}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
@@ -203,7 +209,8 @@ function build_static() {
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
"-DLLAMA_FMA=off",
"-DLLAMA_OPENMP=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
@@ -270,7 +277,15 @@ function build_cuda() {
init_vars
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
$script:cmakeDefs += @(
"-A", "x64",
"-DLLAMA_CUDA=ON",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
"-DCMAKE_CUDA_FLAGS=-t8",
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
)
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
@@ -280,17 +295,19 @@ function build_cuda() {
sign
install
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
} else {
write-host "Skipping CUDA generation step"
}
}
function build_oneapi() {
if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
# Get oneAPI version
$script:ONEAPI_VERSION = icpx --version
$script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
@@ -317,16 +334,18 @@ function build_oneapi() {
sign
install
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
} else {
Write-Host "Skipping oneAPI generation step"
}

View File

@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
return llm.tensors
}
func (llm *ggla) decode(rs io.ReadSeeker) error {
func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
for {
var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return err
}
defer func() {
if errors.Is(retErr, io.EOF) {
retErr = io.ErrUnexpectedEOF
}
}()
var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
return err
}
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
return err
}

View File

@@ -6,6 +6,8 @@ import (
"fmt"
"io"
"strings"
"github.com/ollama/ollama/util/bufioutil"
)
type GGML struct {
@@ -69,6 +71,30 @@ func (kv KV) HeadCountKV() uint64 {
return 1
}
func (kv KV) EmbeddingHeadCount() uint64 {
if heads := kv.HeadCount(); heads > 0 {
return kv.EmbeddingLength() / kv.HeadCount()
}
return 0
}
func (kv KV) EmbeddingHeadCountK() uint64 {
if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
return k
}
return kv.EmbeddingHeadCount()
}
func (kv KV) EmbeddingHeadCountV() uint64 {
if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
return v
}
return kv.EmbeddingHeadCount()
}
func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV()
}
@@ -81,6 +107,11 @@ func (kv KV) ContextLength() uint64 {
return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
}
func (kv KV) ChatTemplate() string {
s, _ := kv["tokenizer.chat_template"].(string)
return s
}
type Tensors []*Tensor
func (ts Tensors) Layers() map[string]Layer {
@@ -249,7 +280,18 @@ func DetectGGMLType(b []byte) string {
}
}
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
// DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err
@@ -262,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
case FILE_MAGIC_GGLA:
c = &containerGGLA{}
case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian}
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian}
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default:
return nil, 0, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if errors.Is(err, io.EOF) {
// noop
} else if err != nil {
if err != nil {
return nil, 0, err
}
@@ -292,7 +332,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
embeddingHeads := llm.KV().EmbeddingHeadCount()
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
layers := llm.Tensors().Layers()
@@ -302,7 +345,8 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
partialOffload = 4 * batch * embedding
partialOffload += max(
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)
@@ -310,21 +354,30 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max(
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
)
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
// mixtral 8x7b
ffnGateWeight1 := ffnGateWeight.Shape[1]
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
partialOffload = max(
4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
)
}
case "gemma":
fullOffload = 4 * batch * (embedding + vocab)
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
case "gemma", "gemma2":
fullOffload = max(
4*batch*(embedding+vocab),
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
)
partialOffload = max(
4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
4*embeddingHeadsK*context*8+
embedding*embeddingHeadsK*heads*9/16,
)
case "command-r":
fullOffload = max(
4*batch*(embedding+vocab),
@@ -361,6 +414,16 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(vocab+2*embedding),
fullOffload,
)
case "deepseek2":
fullOffload = max(
4*batch*(3*embedding+vocab),
4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
)
partialOffload = max(
4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
)
}
return

1
llm/ggml_test.go Normal file
View File

@@ -0,0 +1 @@
package llm

View File

@@ -3,11 +3,10 @@ package llm
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"strings"
"log/slog"
)
type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
NumTensor uint64
NumKV uint64
}
maxArraySize int
}
func (c *containerGGUF) canCollectArray(size int) bool {
return c.maxArraySize < 0 || size <= c.maxArraySize
}
func (c *containerGGUF) Name() string {
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
}
model := newGGUF(c)
slog.Debug(fmt.Sprintf("model = %#v", model))
if err := model.Decode(rs); err != nil {
return nil, err
}
@@ -85,6 +89,8 @@ type gguf struct {
tensors []*Tensor
parameters uint64
scratch [16 << 10]byte
}
func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
}
// decode tensors
for i := 0; uint64(i) < llm.numTensor(); i++ {
for range llm.numTensor() {
name, err := readGGUFString(llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor name: %w", err)
}
// dims is the number of dimensions in the tensor
dims, err := readGGUF[uint32](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor dimensions: %w", err)
}
shape := [4]uint64{1, 1, 1, 1}
for i := 0; uint32(i) < dims; i++ {
shape[i], err = readGGUF[uint64](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor shape: %w", err)
}
}
kind, err := readGGUF[uint32](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor kind: %w", err)
}
offset, err := readGGUF[uint64](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor offset: %w", err)
}
tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
alignment = 32
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
for _, tensor := range llm.tensors {
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return err
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return fmt.Errorf("failed to get current offset: %w", err)
}
padding := llm.padding(int64(tensor.Size()), int64(alignment))
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
return fmt.Errorf("failed to seek to init padding: %w", err)
}
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return fmt.Errorf("failed to seek to tensor: %w", err)
}
}
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
return b.String(), nil
}
func discardGGUFString(llm *gguf, r io.Reader) error {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return err
}
size := int(llm.ByteOrder.Uint64(buf))
for size > 0 {
n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
if err != nil {
return err
}
size -= n
}
return nil
}
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
if llm.Version == 1 {
return readGGUFV1String(llm, r)
}
var length uint64
if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return "", err
}
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(length)); err != nil {
length := int(llm.ByteOrder.Uint64(buf))
if length > len(llm.scratch) {
buf = make([]byte, length)
} else {
buf = llm.scratch[:length]
}
clear(buf)
_, err = io.ReadFull(r, buf)
if err != nil {
return "", err
}
return b.String(), nil
return string(buf), nil
}
func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
return err
}
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
type array struct {
size int
values []any
}
func (a *array) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
for i := 0; uint32(i) < n; i++ {
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, 0, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a = append(a, e)
if a.values != nil {
a.values[i] = e
}
}
return
return a, nil
}
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 {
return readGGUFV1Array(llm, r)
}
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
for i := 0; uint64(i) < n; i++ {
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
e, err = readGGUFString(llm, r)
if a.values != nil {
e, err = readGGUFString(llm, r)
} else {
err = discardGGUFString(llm, r)
}
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a = append(a, e)
if a.values != nil {
a.values[i] = e
}
}
return
return a, nil
}
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
@@ -592,8 +646,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
return err
}
dims := 0
for cnt := 0; cnt < len(tensor.Shape); cnt++ {
var dims int
for cnt := range len(tensor.Shape) {
if tensor.Shape[cnt] > 0 {
dims++
}
@@ -603,8 +657,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
return err
}
for i := 0; i < dims; i++ {
if err := binary.Write(ws, llm.ByteOrder, uint64(tensor.Shape[dims-1-i])); err != nil {
for i := range dims {
if err := binary.Write(ws, llm.ByteOrder, tensor.Shape[dims-1-i]); err != nil {
return err
}
}
@@ -618,22 +672,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
}
}
offset, err := ws.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
var alignment int64 = 32
padding := llm.padding(offset, alignment)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
return err
}
for _, tensor := range tensors {
if _, err := tensor.WriteTo(ws); err != nil {
return err
}
offset, err := ws.Seek(0, io.SeekCurrent)
if err != nil {
return err
@@ -643,6 +683,10 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
return err
}
if _, err := tensor.WriteTo(ws); err != nil {
return err
}
}
return nil

View File

@@ -3,11 +3,12 @@ package llm
import (
"fmt"
"log/slog"
"strconv"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/envconfig"
)
// This algorithm looks for a complete fit to determine if we need to unload other models
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
if opts.NumGPU < 0 {
if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
return true, estimatedVRAM
@@ -30,24 +32,76 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
return false, estimatedVRAM
}
type MemoryEstimate struct {
// How many layers we predict we can load
Layers int
// The size of the graph which occupies the main GPU
Graph uint64
// How much VRAM will be allocated given the number of layers we predict
VRAMSize uint64
// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit string
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
// internal fields for logging purposes
inferenceLibrary string
layersRequested int
layersModel int
availableList []string
kv uint64
allocationsList []string
memoryWeights uint64
memoryLayerOutput uint64
graphFullOffload uint64
graphPartialOffload uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
var memoryAvailable uint64
for _, info := range gpus {
memoryAvailable += info.FreeMemory
}
if envconfig.MaxVRAM > 0 {
memoryAvailable = envconfig.MaxVRAM
}
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
// Graph size when all layers are offloaded, applies to all GPUs
var graphFullOffload uint64
// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
memoryMinimum := gpus[0].MinimumMemory
// Final graph offload once we know full or partial
var graphOffload uint64
// Projectors loaded into GPU0 only
var projectorSize uint64
// Conditional output size on GPU 0
var memoryLayerOutput uint64
// The sizes of a layer
var layerSize uint64
// The sum of all the layer sizes (just for logging)
var memoryWeights uint64
// True if all the layers are loaded
var fullyLoaded bool
// Overflow that didn't fit into the GPU
var overflow uint64
availableList := make([]string, len(gpus))
for i, gpu := range gpus {
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
}
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors {
memoryMinimum += projectorMemoryRequirements(projector)
projectorSize += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,127 +110,246 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
layers := ggml.Tensors().Layers()
// add one layer worth of memory as a buffer
if blk0, ok := layers["blk.0"]; ok {
memoryMinimum += blk0.size()
layerSize = blk0.size()
} else {
slog.Warn("model missing blk.0 layer size")
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
if graphPartialOffload == 0 {
graphPartialOffload = ggml.KV().GQA() * kv / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
}
graphFullOffload *= uint64(len(gpus))
graphPartialOffload *= uint64(len(gpus))
// on metal there's no partial offload overhead
if gpus[0].Library == "metal" {
graphPartialOffload = graphFullOffload
} else if len(gpus) > 1 {
// multigpu should always use the partial graph size
graphFullOffload = graphPartialOffload
}
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
memoryRequiredTotal := memoryMinimum + graphFullOffload
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
memoryRequiredPartial := memoryMinimum + graphPartialOffload
var memoryLayerOutput uint64
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.size()
}
if layer, ok := layers["output"]; ok {
memoryLayerOutput += layer.size()
} else if layer, ok := layers["token_embd"]; ok {
memoryLayerOutput += layer.size()
}
if gpus[0].Library == "metal" && opts.UseMMap {
// memory is preallocated for output tensors
memoryRequiredTotal += memoryLayerOutput
memoryRequiredPartial += memoryLayerOutput
// Output layer handled at the end if we have space
gpuZeroOverhead := projectorSize
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
layerCounts := make([]int, len(gpus))
gpuAllocations := make([]uint64, len(gpus))
type gs struct {
i int
g *gpu.GpuInfo
}
gpusWithSpace := []gs{}
for i := range gpus {
var gzo uint64
if len(gpusWithSpace) == 0 {
gzo = gpuZeroOverhead
}
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
continue
}
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
}
var layerCount int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
var gpuZeroID int
if len(gpusWithSpace) > 0 {
gpuZeroID = gpusWithSpace[0].i
gpuAllocations[gpuZeroID] += gpuZeroOverhead
}
// For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
memoryLayer := blk.size()
layerSize = blk.size()
layerSize += kv / ggml.KV().BlockCount()
}
memoryWeights += layerSize
// KV is proportional to the number of layers
memoryLayer += kv / ggml.KV().BlockCount()
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU
continue
}
memoryRequiredTotal += memoryLayer
if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
memoryRequiredPartial += memoryLayer
// distribute the layers across the GPU(s) that have space
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[i%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > used+layerSize {
gpuAllocations[g.i] += layerSize
layerCounts[g.i]++
layerCount++
break
} else {
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
}
}
}
if gpus[0].Library != "metal" || !opts.UseMMap {
// memory was not preallocated for output tensors
memoryRequiredTotal += memoryLayerOutput
if layerCount >= int(ggml.KV().BlockCount()) {
fullyLoaded = true
} else {
for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
overflow += layerSize
}
}
if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal
// Determine if we need to consider output then find where it fits
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > used+memoryLayerOutput {
gpuAllocations[g.i] += memoryLayerOutput
layerCounts[g.i]++
layerCount++
break
}
}
if layerCount < int(ggml.KV().BlockCount())+1 {
fullyLoaded = false
overflow += memoryLayerOutput
}
}
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
// Add the applicable (full or partial) graph allocations
for i := range gpus {
if layerCounts[i] <= 0 {
continue
}
if fullyLoaded {
gpuAllocations[i] += graphFullOffload
} else {
gpuAllocations[i] += graphPartialOffload
}
}
if fullyLoaded {
graphOffload = graphFullOffload
} else {
graphOffload = graphPartialOffload
}
// Summaries for the log
var memoryRequiredPartial, memoryRequiredTotal uint64
for i := range gpuAllocations {
memoryRequiredPartial += gpuAllocations[i]
}
memoryRequiredTotal = memoryRequiredPartial + overflow
tensorSplit := ""
if len(gpus) > 1 {
splits := make([]string, len(gpus))
for i, count := range layerCounts {
splits[i] = strconv.Itoa(count)
}
tensorSplit = strings.Join(splits, ",")
}
allocationsList := []string{}
for _, a := range gpuAllocations {
allocationsList = append(allocationsList, format.HumanBytes2(a))
}
estimate := MemoryEstimate{
TotalSize: memoryRequiredTotal,
Layers: 0,
Graph: 0,
VRAMSize: 0,
GPUSizes: []uint64{},
inferenceLibrary: gpus[0].Library,
layersRequested: opts.NumGPU,
layersModel: int(ggml.KV().BlockCount()) + 1,
availableList: availableList,
kv: kv,
allocationsList: allocationsList,
memoryWeights: memoryWeights,
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
}
if gpus[0].Library == "cpu" {
return estimate
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return estimate
}
estimate.Layers = layerCount
estimate.Graph = graphOffload
estimate.VRAMSize = memoryRequiredPartial
estimate.TotalSize = memoryRequiredTotal
estimate.TensorSplit = tensorSplit
estimate.GPUSizes = gpuAllocations
return estimate
}
func (m MemoryEstimate) log() {
slog.Info(
"offload to gpu",
"offload to "+m.inferenceLibrary,
slog.Group(
"layers",
// requested number of layers to offload
"requested", opts.NumGPU,
"requested", m.layersRequested,
// The number of layers the model has (including output)
"model", m.layersModel,
// estimated number of layers that can be offloaded
"real", layerCount,
"offload", m.Layers,
// multi-gpu split for tensors
"split", m.TensorSplit,
),
slog.Group(
"memory",
// memory available for offloading
"available", format.HumanBytes2(memoryAvailable),
// memory available by GPU for offloading
"available", m.availableList,
slog.Group(
"required",
// memory required for full offloading
"full", format.HumanBytes2(memoryRequiredTotal),
"full", format.HumanBytes2(m.TotalSize),
// memory required to offload layers.estimate layers
"partial", format.HumanBytes2(memoryRequiredPartial),
"partial", format.HumanBytes2(m.VRAMSize),
// memory of KV cache
"kv", format.HumanBytes2(kv),
"kv", format.HumanBytes2(m.kv),
// Allocations across the GPUs
"allocations", m.allocationsList,
),
slog.Group(
"weights",
// memory of the weights
"total", format.HumanBytes2(memoryWeights),
"total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
// memory of non-repeating layers
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
"full", format.HumanBytes2(graphFullOffload),
"full", format.HumanBytes2(m.graphFullOffload),
// memory of graph when not fully offloaded
"partial", format.HumanBytes2(graphPartialOffload),
"partial", format.HumanBytes2(m.graphPartialOffload),
),
),
)
if gpus[0].Library == "cpu" {
return 0, 0, memoryRequiredTotal
}
if memoryRequiredPartial > memoryAvailable {
slog.Debug("insufficient VRAM to load any model layers")
return 0, 0, memoryRequiredTotal
}
return layerCount, memoryRequiredPartial, memoryRequiredTotal
}

130
llm/memory_test.go Normal file
View File

@@ -0,0 +1,130 @@
package llm
import (
"bytes"
"encoding/binary"
"fmt"
"os"
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestEstimateGPULayers(t *testing.T) {
envconfig.Debug = true
modelName := "dummy"
f, err := os.CreateTemp(t.TempDir(), modelName)
require.NoError(t, err)
defer f.Close()
gguf := NewGGUFV3(binary.LittleEndian)
inputLayerCount := 5
tensors := []Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}
assert.Len(t, tensors, inputLayerCount+1)
err = gguf.Encode(f, KV{
"general.architecture": "llama",
"general.name": "name",
"llama.context_length": uint32(32),
"llama.embedding_length": uint32(4096),
"llama.block_count": uint32(inputLayerCount),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(32),
"tokenizer.ggml.tokens": []string{" "},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, tensors)
require.NoError(t, err)
ggml, err := LoadModel(f.Name(), 0)
if err != nil {
t.Fatal(err)
}
// Simple CPU scenario
gpus := []gpu.GpuInfo{
{
Library: "cpu",
},
}
projectors := []string{}
opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) {
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
assert.Equal(t, 0, estimate.Layers)
assert.Equal(t, uint64(0), estimate.Graph)
})
// derived from the dummy ggml file above
graphPartialOffload := uint64(202377216)
graphFullOffload := uint64(171968512)
layerSize := uint64(33554436)
projectorSize := uint64(0)
memoryLayerOutput := uint64(4)
// Dual CUDA scenario with assymetry
gpuMinimumMemory := uint64(2048)
gpus = []gpu.GpuInfo{
{
Library: "cuda",
MinimumMemory: gpuMinimumMemory,
},
{
Library: "cuda",
MinimumMemory: gpuMinimumMemory,
},
}
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
for i, s := range []struct {
layer0, layer1 uint64
expect0, expect1 uint64
}{
{1, 1, 1, 1},
{2, 1, 2, 1},
{2, 2, 2, 2},
{1, 2, 1, 2},
{3, 3, 3, 3},
{4, 4, 3, 3},
{6, 6, 3, 3},
{0, 3, 0, 3},
} {
t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
gpus[0].FreeMemory = 0
gpus[1].FreeMemory = 0
gpus[0].FreeMemory += projectorSize
if s.layer0 > 0 {
gpus[0].FreeMemory += memoryLayerOutput
} else {
gpus[1].FreeMemory += memoryLayerOutput
}
gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
var layerSums uint64
for _, b := range estimate.GPUSizes {
layerSums += b
}
if estimate.Layers < inputLayerCount+1 {
assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
} else {
assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
}
})
}
}

View File

@@ -1,8 +1,8 @@
diff --git a/common/common.cpp b/common/common.cpp
index ba1ecf0e..cead57cc 100644
index 73ff0e85..6adb1a92 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
index d80344f2..71e84834 100644
index 58ed72f4..0bb2605e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -174,6 +174,13 @@ struct gpt_params {
// multimodal models (see examples/llava)
@@ -180,6 +180,13 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
};
void gpt_params_handle_model_default(gpt_params & params);
+
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds

View File

@@ -1,35 +1,32 @@
From d02a06f3f45a09255ace8684a66590e06ce44605 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 23 May 2024 11:33:20 -0700
Subject: [PATCH] default pretokenizer on unrecognized type
---
llama.cpp | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 15c66077..af1aede3 100644
index 61948751..4b72a293 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4504,9 +4504,6 @@ static void llm_load_vocab(
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
LLAMA_LOG_WARN("%s: \n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (
- tokenizer_pre == "default") {
@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
- if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
@@ -4553,7 +4550,7 @@ static void llm_load_vocab(
tokenizer_pre == "dbrx") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--
2.45.1

13
llm/patches/06-qwen2.diff Normal file
View File

@@ -0,0 +1,13 @@
diff --git a/llama.cpp b/llama.cpp
index 40d2ec2c..f34eb79a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);

305
llm/patches/07-gemma.diff Normal file
View File

@@ -0,0 +1,305 @@
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support
---
llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 1 deletion(-)
diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GEMMA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1941,6 +1963,8 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
+ MODEL_9B,
+ MODEL_27B,
};
static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
struct ggml_tensor * attn_out_norm_b;
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
+ struct ggml_tensor * attn_post_norm;
// attention
struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
}
} break;
case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: model.type = e_model::MODEL_9B; break;
+ case 28: model.type = e_model::MODEL_27B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ const int64_t n_ff = hparams.n_ff;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+ }
+ } break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_gemma2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ result = llm.build_gemma2();
+ } break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
--
2.45.2

View File

@@ -10,9 +10,9 @@ import (
"os"
"path/filepath"
"runtime"
"slices"
"strings"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
@@ -58,7 +58,7 @@ func availableServers() map[string]string {
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*")
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(file)] = file
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := availableServers()
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
if info.Variant != gpu.CPUCapabilityNone {
requested += "_" + info.Variant.String()
}
servers := []string{}
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUVariant()
variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
@@ -146,11 +146,11 @@ func serverForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
variant := gpu.GetCPUVariant()
variant := gpu.GetCPUCapability()
availableServers := availableServers()
if variant != "" {
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
if cmp == "cpu_"+variant.String() {
return cmp
}
}

View File

@@ -37,8 +37,9 @@ type LlamaServer interface {
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
EstimatedVRAM() uint64
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
}
// llmServer is an instance of the llama.cpp server
@@ -49,18 +50,22 @@ type llmServer struct {
status *StatusWriter
options api.Options
// TODO - this should be broken down by GPU
estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
estimatedTotal uint64 // Total size of model
totalLayers uint64
gpuCount int
loadDuration time.Duration // Record how long it took the model to load
loadProgress float32
estimate MemoryEstimate
totalLayers uint64
// gpuCount int
gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
loadDuration time.Duration // Record how long it took the model to load
loadProgress float32
sem *semaphore.Weighted
}
func LoadModel(model string) (*GGML, error) {
// LoadModel will load a model from disk. The model must be in the GGML format.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*GGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
ggml, _, err := DecodeGGML(f, maxArraySize)
return ggml, err
}
@@ -80,45 +85,47 @@ func LoadModel(model string) (*GGML, error) {
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
var err error
var cpuRunner string
var estimatedVRAM uint64
var estimatedTotal uint64
var systemMemory uint64
gpuCount := len(gpus)
if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
var estimate MemoryEstimate
var systemTotalMemory uint64
var systemFreeMemory uint64
// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
cpuRunner = serverForCpu()
gpuCount = 0
_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
systemMemInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
if gpus[0].Library == "metal" {
memInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemMemory = memInfo.TotalMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
}
}
var layers int
layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
systemTotalMemory = systemMemInfo.TotalMemory
systemFreeMemory = systemMemInfo.FreeMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
}
if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 {
gpus = gpu.GetCPUInfo()
}
if len(gpus) == 1 && gpus[0].Library == "cpu" {
cpuRunner = serverForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else {
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
opts.NumGPU = 0
} else if gpus[0].Library != "metal" && layers == 0 {
case gpus[0].Library != "metal" && estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
cpuRunner = serverForCpu()
gpuCount = 0
} else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
opts.NumGPU = layers
gpus = gpu.GetCPUInfo()
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
opts.NumGPU = estimate.Layers
}
}
estimate.log()
// Loop through potential servers
finalErr := fmt.Errorf("no suitable llama servers found")
finalErr := errors.New("no suitable llama servers found")
if len(adapters) > 1 {
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
@@ -189,35 +196,42 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--memory-f32")
}
flashAttnEnabled := envconfig.FlashAttention
for _, g := range gpus {
// only cuda (compute capability 7+) and metal support flash attention
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
flashAttnEnabled = false
}
// mmap has issues with partial offloading on metal
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
opts.UseMMap = api.TriStateFalse
}
}
if flashAttnEnabled {
params = append(params, "--flash-attn")
}
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}
if opts.UseMLock {
params = append(params, "--mlock")
}
if !opts.UseMMap {
params = append(params, "--no-mmap")
}
if opts.UseNUMA {
params = append(params, "--numa")
}
flashAttnEnabled := envconfig.FlashAttention
// partial offloading does not support flash attention
if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
flashAttnEnabled = false
}
// only cuda (compute capability 7+) and metal support flash attention
for _, g := range gpus {
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
flashAttnEnabled = false
}
}
if flashAttnEnabled {
params = append(params, "--flash-attn")
}
numParallel := envconfig.NumParallel
// TODO (jmorganca): multimodal models don't support parallel yet
@@ -229,7 +243,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
for i := 0; i < len(servers); i++ {
if estimate.TensorSplit != "" {
params = append(params, "--tensor-split", estimate.TensorSplit)
}
if estimate.TensorSplit != "" {
params = append(params, "--tensor-split", estimate.TensorSplit)
}
for i := range len(servers) {
dir := availableServers[servers[i]]
if dir == "" {
// Shouldn't happen
@@ -239,8 +261,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
if strings.HasPrefix(servers[i], "cpu") {
// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
gpuCount = 0
gpus = gpu.GetCPUInfo()
}
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -262,8 +283,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// prepend the server directory to LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
libraryPaths := []string{dir, filepath.Dir(dir)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
@@ -281,7 +302,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
server := filepath.Join(dir, "ollama_llama_server")
if runtime.GOOS == "windows" {
server = server + ".exe"
server += ".exe"
}
// Detect tmp cleaners wiping out the file
@@ -296,23 +317,26 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
s := &llmServer{
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
estimatedVRAM: estimatedVRAM,
estimatedTotal: estimatedTotal,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1,
gpuCount: gpuCount,
done: make(chan error, 1),
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
estimate: estimate,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1,
gpus: gpus,
done: make(chan error, 1),
}
s.cmd.Env = os.Environ()
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
}
visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path and visible devices variable with our adjusted version
@@ -326,6 +350,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
devicesNeeded = false
} else if len(envWorkarounds) != 0 {
for _, kv := range envWorkarounds {
if strings.EqualFold(cmp[0], kv[0]) {
s.cmd.Env[i] = kv[0] + "=" + kv[1]
}
}
}
}
if pathNeeded {
@@ -387,7 +417,7 @@ func projectorMemoryRequirements(filename string) uint64 {
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
ggml, _, err := DecodeGGML(file, 0)
if err != nil {
return 0
}
@@ -456,7 +486,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
resp, err := http.DefaultClient.Do(req)
if err != nil {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, fmt.Errorf("server not responding")
return ServerStatusNotResponding, errors.New("server not responding")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
@@ -603,7 +633,7 @@ array ::=
string ::=
"\"" (
[^"\\] |
[^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
@@ -1001,11 +1031,20 @@ func (s *llmServer) Close() error {
}
func (s *llmServer) EstimatedVRAM() uint64 {
return s.estimatedVRAM
return s.estimate.VRAMSize
}
func (s *llmServer) EstimatedTotal() uint64 {
return s.estimatedTotal
return s.estimate.TotalSize
}
func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
for i, gpu := range s.gpus {
if gpu.ID == gpuID {
return s.estimate.GPUSizes[i]
}
}
return 0
}
func parseDurationMs(ms float64) time.Duration {

View File

@@ -178,9 +178,6 @@ func fromRequest(r ChatCompletionRequest) api.ChatRequest {
if r.Seed != nil {
options["seed"] = *r.Seed
// temperature=0 is required for reproducible outputs
options["temperature"] = 0.0
}
if r.FrequencyPenalty != nil {
@@ -245,7 +242,6 @@ func (w *writer) writeResponse(data []byte) (int, error) {
d, err := json.Marshal(toChunk(w.id, chatResponse))
if err != nil {
return 0, err
}
w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")

View File

@@ -8,7 +8,9 @@ import (
"io"
"strconv"
"strings"
"unicode"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)
type File struct {
@@ -69,14 +71,11 @@ func ParseFile(r io.Reader) (*File, error) {
var b bytes.Buffer
var role string
var lineCount int
var linePos int
var utf16 bool
var f File
br := bufio.NewReader(r)
tr := unicode.BOMOverride(unicode.UTF8.NewDecoder())
br := bufio.NewReader(transform.NewReader(r, tr))
for {
r, _, err := br.ReadRune()
if errors.Is(err, io.EOF) {
@@ -85,17 +84,6 @@ func ParseFile(r io.Reader) (*File, error) {
return nil, err
}
// the utf16 byte order mark will be read as "unreadable" by ReadRune()
if isUnreadable(r) && lineCount == 0 && linePos == 0 {
utf16 = true
continue
}
// skip the second byte if we're reading utf16
if utf16 && r == 0 {
continue
}
next, r, err := parseRuneForState(r, curr)
if errors.Is(err, io.ErrUnexpectedEOF) {
return nil, fmt.Errorf("%w: %s", err, b.String())
@@ -103,13 +91,6 @@ func ParseFile(r io.Reader) (*File, error) {
return nil, err
}
if isNewline(r) {
lineCount++
linePos = 0
} else {
linePos++
}
// process the state transition, some transitions need to be intercepted and redirected
if next != curr {
switch curr {
@@ -309,10 +290,6 @@ func isNewline(r rune) bool {
return r == '\r' || r == '\n'
}
func isUnreadable(r rune) bool {
return r == unicode.ReplacementChar
}
func isValidMessageRole(role string) bool {
return role == "system" || role == "user" || role == "assistant"
}

View File

@@ -10,6 +10,9 @@ import (
"unicode/utf16"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
)
func TestParseFileFile(t *testing.T) {
@@ -25,7 +28,7 @@ TEMPLATE template1
reader := strings.NewReader(input)
modelfile, err := ParseFile(reader)
assert.NoError(t, err)
require.NoError(t, err)
expectedCommands := []Command{
{Name: "model", Args: "model1"},
@@ -88,7 +91,7 @@ func TestParseFileFrom(t *testing.T) {
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c.input))
assert.ErrorIs(t, err, c.err)
require.ErrorIs(t, err, c.err)
if modelfile != nil {
assert.Equal(t, c.expected, modelfile.Commands)
}
@@ -105,7 +108,7 @@ PARAMETER param1
reader := strings.NewReader(input)
_, err := ParseFile(reader)
assert.ErrorIs(t, err, io.ErrUnexpectedEOF)
require.ErrorIs(t, err, io.ErrUnexpectedEOF)
}
func TestParseFileBadCommand(t *testing.T) {
@@ -114,8 +117,7 @@ FROM foo
BADCOMMAND param1 value1
`
_, err := ParseFile(strings.NewReader(input))
assert.ErrorIs(t, err, errInvalidCommand)
require.ErrorIs(t, err, errInvalidCommand)
}
func TestParseFileMessages(t *testing.T) {
@@ -201,7 +203,7 @@ MESSAGE system`,
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c.input))
assert.ErrorIs(t, err, c.err)
require.ErrorIs(t, err, c.err)
if modelfile != nil {
assert.Equal(t, c.expected, modelfile.Commands)
}
@@ -355,7 +357,7 @@ TEMPLATE """
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c.multiline))
assert.ErrorIs(t, err, c.err)
require.ErrorIs(t, err, c.err)
if modelfile != nil {
assert.Equal(t, c.expected, modelfile.Commands)
}
@@ -413,7 +415,7 @@ func TestParseFileParameters(t *testing.T) {
fmt.Fprintln(&b, "FROM foo")
fmt.Fprintln(&b, "PARAMETER", k)
modelfile, err := ParseFile(&b)
assert.NoError(t, err)
require.NoError(t, err)
assert.Equal(t, []Command{
{Name: "model", Args: "foo"},
@@ -442,7 +444,7 @@ FROM foo
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c.input))
assert.NoError(t, err)
require.NoError(t, err)
assert.Equal(t, c.expected, modelfile.Commands)
})
}
@@ -501,15 +503,14 @@ SYSTEM ""
for _, c := range cases {
t.Run("", func(t *testing.T) {
modelfile, err := ParseFile(strings.NewReader(c))
assert.NoError(t, err)
require.NoError(t, err)
modelfile2, err := ParseFile(strings.NewReader(modelfile.String()))
assert.NoError(t, err)
require.NoError(t, err)
assert.Equal(t, modelfile, modelfile2)
})
}
}
func TestParseFileUTF16ParseFile(t *testing.T) {
@@ -518,14 +519,6 @@ PARAMETER param1 1
PARAMETER param2 4096
SYSTEM You are a utf16 file.
`
// simulate a utf16 le file
utf16File := utf16.Encode(append([]rune{'\ufffe'}, []rune(data)...))
buf := new(bytes.Buffer)
err := binary.Write(buf, binary.LittleEndian, utf16File)
assert.NoError(t, err)
actual, err := ParseFile(buf)
assert.NoError(t, err)
expected := []Command{
{Name: "model", Args: "bob"},
@@ -534,14 +527,52 @@ SYSTEM You are a utf16 file.
{Name: "system", Args: "You are a utf16 file."},
}
assert.Equal(t, expected, actual.Commands)
t.Run("le", func(t *testing.T) {
var b bytes.Buffer
require.NoError(t, binary.Write(&b, binary.LittleEndian, []byte{0xff, 0xfe}))
require.NoError(t, binary.Write(&b, binary.LittleEndian, utf16.Encode([]rune(data))))
// simulate a utf16 be file
buf = new(bytes.Buffer)
err = binary.Write(buf, binary.BigEndian, utf16File)
assert.NoError(t, err)
actual, err := ParseFile(&b)
require.NoError(t, err)
actual, err = ParseFile(buf)
assert.NoError(t, err)
assert.Equal(t, expected, actual.Commands)
assert.Equal(t, expected, actual.Commands)
})
t.Run("be", func(t *testing.T) {
var b bytes.Buffer
require.NoError(t, binary.Write(&b, binary.BigEndian, []byte{0xfe, 0xff}))
require.NoError(t, binary.Write(&b, binary.BigEndian, utf16.Encode([]rune(data))))
actual, err := ParseFile(&b)
require.NoError(t, err)
assert.Equal(t, expected, actual.Commands)
})
}
func TestParseMultiByte(t *testing.T) {
input := `FROM test
SYSTEM 你好👋`
expect := []Command{
{Name: "model", Args: "test"},
{Name: "system", Args: "你好👋"},
}
encodings := []encoding.Encoding{
unicode.UTF8,
unicode.UTF16(unicode.LittleEndian, unicode.UseBOM),
unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
}
for _, encoding := range encodings {
t.Run(fmt.Sprintf("%s", encoding), func(t *testing.T) {
s, err := encoding.NewEncoder().String(input)
require.NoError(t, err)
actual, err := ParseFile(strings.NewReader(s))
require.NoError(t, err)
assert.Equal(t, expect, actual.Commands)
})
}
}

View File

@@ -59,7 +59,7 @@ func (p *Progress) StopAndClear() bool {
stopped := p.stop()
if stopped {
// clear all progress lines
for i := 0; i < p.pos; i++ {
for i := range p.pos {
if i > 0 {
fmt.Fprint(p.w, "\033[A")
}
@@ -85,7 +85,7 @@ func (p *Progress) render() {
defer fmt.Fprint(p.w, "\033[?25h")
// clear already rendered progress lines
for i := 0; i < p.pos; i++ {
for i := range p.pos {
if i > 0 {
fmt.Fprint(p.w, "\033[A")
}

View File

@@ -52,7 +52,6 @@ func (b *Buffer) GetLineSpacing(line int) bool {
}
return hasSpace.(bool)
}
func (b *Buffer) MoveLeft() {
@@ -117,15 +116,12 @@ func (b *Buffer) MoveRight() {
if b.DisplayPos%b.LineWidth == 0 {
fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())))
} else if (b.DisplayPos-rLength)%b.LineWidth == b.LineWidth-1 && hasSpace {
fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())+rLength))
b.DisplayPos += 1
} else if b.LineHasSpace.Size() > 0 && b.DisplayPos%b.LineWidth == b.LineWidth-1 && hasSpace {
fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())))
b.DisplayPos += 1
} else {
fmt.Print(cursorRightN(rLength))
}
@@ -154,7 +150,7 @@ func (b *Buffer) MoveToStart() {
if b.Pos > 0 {
currLine := b.DisplayPos / b.LineWidth
if currLine > 0 {
for cnt := 0; cnt < currLine; cnt++ {
for range currLine {
fmt.Print(CursorUp)
}
}
@@ -169,7 +165,7 @@ func (b *Buffer) MoveToEnd() {
currLine := b.DisplayPos / b.LineWidth
totalLines := b.DisplaySize() / b.LineWidth
if currLine < totalLines {
for cnt := 0; cnt < totalLines-currLine; cnt++ {
for range totalLines - currLine {
fmt.Print(CursorDown)
}
remainder := b.DisplaySize() % b.LineWidth
@@ -185,7 +181,7 @@ func (b *Buffer) MoveToEnd() {
func (b *Buffer) DisplaySize() int {
sum := 0
for i := 0; i < b.Buf.Size(); i++ {
for i := range b.Buf.Size() {
if e, ok := b.Buf.Get(i); ok {
if r, ok := e.(rune); ok {
sum += runewidth.RuneWidth(r)
@@ -197,7 +193,6 @@ func (b *Buffer) DisplaySize() int {
}
func (b *Buffer) Add(r rune) {
if b.Pos == b.Buf.Size() {
b.AddChar(r, false)
} else {
@@ -210,7 +205,6 @@ func (b *Buffer) AddChar(r rune, insert bool) {
b.DisplayPos += rLength
if b.Pos > 0 {
if b.DisplayPos%b.LineWidth == 0 {
fmt.Printf("%c", r)
fmt.Printf("\n%s", b.Prompt.AltPrompt)
@@ -235,7 +229,6 @@ func (b *Buffer) AddChar(r rune, insert bool) {
} else {
b.LineHasSpace.Add(true)
}
} else {
fmt.Printf("%c", r)
}
@@ -356,7 +349,6 @@ func (b *Buffer) drawRemaining() {
func (b *Buffer) Remove() {
if b.Buf.Size() > 0 && b.Pos > 0 {
if e, ok := b.Buf.Get(b.Pos - 1); ok {
if r, ok := e.(rune); ok {
rLength := runewidth.RuneWidth(r)
@@ -382,7 +374,6 @@ func (b *Buffer) Remove() {
} else {
fmt.Print(" " + CursorLeft)
}
} else if (b.DisplayPos-rLength)%b.LineWidth == 0 && hasSpace {
fmt.Printf(CursorBOL + ClearToEOL)
fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width))
@@ -391,10 +382,9 @@ func (b *Buffer) Remove() {
b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1)
}
b.DisplayPos -= 1
} else {
fmt.Print(cursorLeftN(rLength))
for i := 0; i < rLength; i++ {
for range rLength {
fmt.Print(" ")
}
fmt.Print(cursorLeftN(rLength))
@@ -451,7 +441,7 @@ func (b *Buffer) DeleteBefore() {
func (b *Buffer) DeleteRemaining() {
if b.DisplaySize() > 0 && b.Pos < b.DisplaySize() {
charsToDel := b.Buf.Size() - b.Pos
for cnt := 0; cnt < charsToDel; cnt++ {
for range charsToDel {
b.Delete()
}
}
@@ -495,7 +485,7 @@ func (b *Buffer) ClearScreen() {
if currPos > 0 {
targetLine := currPos / b.LineWidth
if targetLine > 0 {
for cnt := 0; cnt < targetLine; cnt++ {
for range targetLine {
fmt.Print(CursorDown)
}
}
@@ -525,7 +515,7 @@ func (b *Buffer) Replace(r []rune) {
fmt.Printf(CursorBOL + ClearToEOL)
for i := 0; i < lineNums; i++ {
for range lineNums {
fmt.Print(CursorUp + CursorBOL + ClearToEOL)
}

View File

@@ -91,7 +91,7 @@ func (h *History) Add(l []rune) {
func (h *History) Compact() {
s := h.Buf.Size()
if s > h.Limit {
for cnt := 0; cnt < s-h.Limit; cnt++ {
for range s - h.Limit {
h.Buf.Remove(0)
}
}
@@ -139,7 +139,7 @@ func (h *History) Save() error {
defer f.Close()
buf := bufio.NewWriter(f)
for cnt := 0; cnt < h.Size(); cnt++ {
for cnt := range h.Size() {
v, _ := h.Buf.Get(cnt)
line, _ := v.([]rune)
if _, err := buf.WriteString(string(line) + "\n"); err != nil {

View File

@@ -5,7 +5,6 @@ import (
"fmt"
"io"
"os"
"syscall"
)
type Prompt struct {
@@ -63,7 +62,7 @@ func New(prompt Prompt) (*Instance, error) {
func (i *Instance) Readline() (string, error) {
if !i.Terminal.rawmode {
fd := int(syscall.Stdin)
fd := os.Stdin.Fd()
termios, err := SetRawMode(fd)
if err != nil {
return "", err
@@ -80,8 +79,8 @@ func (i *Instance) Readline() (string, error) {
fmt.Print(prompt)
defer func() {
fd := int(syscall.Stdin)
// nolint: errcheck
fd := os.Stdin.Fd()
//nolint:errcheck
UnsetRawMode(fd, i.Terminal.termios)
i.Terminal.rawmode = false
}()
@@ -136,7 +135,7 @@ func (i *Instance) Readline() (string, error) {
buf.MoveRight()
case CharBracketedPaste:
var code string
for cnt := 0; cnt < 3; cnt++ {
for range 3 {
r, err = i.Terminal.Read()
if err != nil {
return "", io.EOF
@@ -198,7 +197,7 @@ func (i *Instance) Readline() (string, error) {
buf.Remove()
case CharTab:
// todo: convert back to real tabs
for cnt := 0; cnt < 8; cnt++ {
for range 8 {
buf.Add(' ')
}
case CharDelete:
@@ -216,7 +215,7 @@ func (i *Instance) Readline() (string, error) {
case CharCtrlW:
buf.DeleteWord()
case CharCtrlZ:
fd := int(syscall.Stdin)
fd := os.Stdin.Fd()
return handleCharCtrlZ(fd, i.Terminal.termios)
case CharEnter, CharCtrlJ:
output := buf.String()
@@ -248,7 +247,7 @@ func (i *Instance) HistoryDisable() {
}
func NewTerminal() (*Terminal, error) {
fd := int(syscall.Stdin)
fd := os.Stdin.Fd()
termios, err := SetRawMode(fd)
if err != nil {
return nil, err

View File

@@ -6,7 +6,7 @@ import (
"syscall"
)
func handleCharCtrlZ(fd int, termios any) (string, error) {
func handleCharCtrlZ(fd uintptr, termios any) (string, error) {
t := termios.(*Termios)
if err := UnsetRawMode(fd, t); err != nil {
return "", err

View File

@@ -1,6 +1,6 @@
package readline
func handleCharCtrlZ(fd int, state any) (string, error) {
func handleCharCtrlZ(fd uintptr, state any) (string, error) {
// not supported
return "", nil
}

View File

@@ -8,7 +8,7 @@ import (
type Termios syscall.Termios
func SetRawMode(fd int) (*Termios, error) {
func SetRawMode(fd uintptr) (*Termios, error) {
termios, err := getTermios(fd)
if err != nil {
return nil, err
@@ -25,13 +25,13 @@ func SetRawMode(fd int) (*Termios, error) {
return termios, setTermios(fd, &newTermios)
}
func UnsetRawMode(fd int, termios any) error {
func UnsetRawMode(fd uintptr, termios any) error {
t := termios.(*Termios)
return setTermios(fd, t)
}
// IsTerminal returns true if the given file descriptor is a terminal.
func IsTerminal(fd int) bool {
func IsTerminal(fd uintptr) bool {
_, err := getTermios(fd)
return err == nil
}

View File

@@ -7,17 +7,17 @@ import (
"unsafe"
)
func getTermios(fd int) (*Termios, error) {
func getTermios(fd uintptr) (*Termios, error) {
termios := new(Termios)
_, _, err := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(fd), syscall.TIOCGETA, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
_, _, err := syscall.Syscall6(syscall.SYS_IOCTL, fd, syscall.TIOCGETA, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
if err != 0 {
return nil, err
}
return termios, nil
}
func setTermios(fd int, termios *Termios) error {
_, _, err := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(fd), syscall.TIOCSETA, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
func setTermios(fd uintptr, termios *Termios) error {
_, _, err := syscall.Syscall6(syscall.SYS_IOCTL, fd, syscall.TIOCSETA, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
if err != 0 {
return err
}

View File

@@ -10,17 +10,17 @@ import (
const tcgets = 0x5401
const tcsets = 0x5402
func getTermios(fd int) (*Termios, error) {
func getTermios(fd uintptr) (*Termios, error) {
termios := new(Termios)
_, _, err := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(fd), tcgets, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
_, _, err := syscall.Syscall6(syscall.SYS_IOCTL, fd, tcgets, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
if err != 0 {
return nil, err
}
return termios, nil
}
func setTermios(fd int, termios *Termios) error {
_, _, err := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(fd), tcsets, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
func setTermios(fd uintptr, termios *Termios) error {
_, _, err := syscall.Syscall6(syscall.SYS_IOCTL, fd, tcsets, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
if err != 0 {
return err
}

Some files were not shown because too many files have changed in this diff Show More