Compare commits

...

232 Commits

Author SHA1 Message Date
Jeffrey Morgan
2789ed31a7 improve scratch buffer estimates 2024-01-19 13:24:24 -05:00
Jeffrey Morgan
dc88cc3981 use gzip for runner embedding (#2067) 2024-01-19 13:23:03 -05:00
Daniel Hiltgen
62976087c6 Merge pull request #1999 from lainedfles/termux_android_cpu_only
Fix CPU-only build under Android Termux enviornment.
2024-01-18 17:16:53 -08:00
Self Denial
344342abdf Restore dyn_ext_server.c since RTLD_DEEPBIND has been removed 2024-01-18 17:30:42 -07:00
Self Denial
eb76f3e379 Fix CPU-only build under Android Termux enviornment.
Update gpu.go initGPUHandles() to declare gpuHandles variable before
reading it. This resolves an "invalid memory address or nil pointer
dereference" error.

Update dyn_ext_server.c to avoid setting the RTLD_DEEPBIND flag under
__TERMUX__ (Android).
2024-01-18 17:25:33 -07:00
Michael Yang
d017e3d0a6 Merge pull request #2060 from jmorganca/mxyng/fix-show
fix show handler
2024-01-18 16:02:27 -08:00
Michael Yang
aac9ab4db7 fix show handler 2024-01-18 15:36:50 -08:00
Michael Yang
1f5b7ff976 Merge pull request #1932 from jmorganca/mxyng/api-fields
api: add model for all requests
2024-01-18 14:56:51 -08:00
Michael Yang
e299831e2c Merge pull request #1958 from purificant/ci
ci: update setup-go action
2024-01-18 14:53:36 -08:00
Michael Yang
745b5934fa add model to ModelResponse 2024-01-18 14:32:55 -08:00
Michael Yang
a38d88d828 api: add model for all requests
prefer using req.Model and fallback to req.Name
2024-01-18 14:31:37 -08:00
Daniel Hiltgen
abec7f06e5 Merge pull request #2056 from dhiltgen/slog
Mechanical switch from log to slog
2024-01-18 14:27:24 -08:00
Michael Yang
e5da190bac Merge pull request #2020 from jmorganca/mxyng/install-fedora
install: pin fedora to max 37
2024-01-18 14:23:42 -08:00
Daniel Hiltgen
ecbfc0182f Go bump to v1.21 to pick up slog 2024-01-18 14:12:57 -08:00
Daniel Hiltgen
fedd705aea Mechanical switch from log to slog
A few obvious levels were adjusted, but generally everything mapped to "info" level.
2024-01-18 14:12:57 -08:00
Mike Bird
82ee019bfc add open interpreter to list of extensions (#2016) 2024-01-18 13:59:39 -08:00
Sachin Sachdeva
ad9dbc2a04 Haystack Ollama Integration (#2021)
Updated readme with the web link for haystack ollama integration
2024-01-18 13:38:32 -08:00
Daniel Hiltgen
fccdf4c635 Merge pull request #1987 from xyproto/archlinux
Let gpu.go and gen_linux.sh also find CUDA on Arch Linux
2024-01-18 13:32:10 -08:00
Daniel Hiltgen
d450fb1d1e Merge pull request #2055 from dhiltgen/cuda_docs
Refine the linux cuda/rocm developer docs
2024-01-18 12:07:31 -08:00
Daniel Hiltgen
df40b11d03 Merge pull request #2007 from dhiltgen/cpu_fallback
Add multiple CPU variants for Intel Mac
2024-01-18 11:32:29 -08:00
Daniel Hiltgen
9cd20b0ec8 Refine the linux cuda/rocm developer docs 2024-01-18 09:44:44 -08:00
Daniel Hiltgen
b992bf65fc Disable arm64 for test phase
The runners are x86 so we can only run binaries that match.
2024-01-17 19:26:13 -08:00
Daniel Hiltgen
1b249748ab Add multiple CPU variants for Intel Mac
This also refines the build process for the ext_server build.
2024-01-17 15:08:54 -08:00
Alexander F. Rødseth
cbe2adc78a Merge branch 'main' into archlinux 2024-01-17 12:50:11 +01:00
Michael Yang
d5a7353357 Merge pull request #2026 from jmorganca/mxyng/fix-windows
fix: normalize name path before splitting
2024-01-16 16:58:42 -08:00
Michael Yang
96cfb62641 fix: normalize name path before splitting 2024-01-16 16:48:29 -08:00
Daniel Hiltgen
7d00b5d110 Merge pull request #1915 from dhiltgen/bump_llama_with_new_dep
Bump llama.cpp to b1842 and add new cuda lib dep
2024-01-16 13:36:49 -08:00
Daniel Hiltgen
795674dd90 Bump llama.cpp to b1842 and add new cuda lib dep
Upstream llama.cpp has added a new dependency with the
NVIDIA CUDA Driver Libraries (libcuda.so) which is part of the
driver distribution, not the general cuda libraries, and is not
available as an archive, so we can not statically link it.  This may
introduce some additional compatibility challenges which we'll
need to keep an eye on.
2024-01-16 12:53:52 -08:00
Daniel Hiltgen
e282bdccdd Merge pull request #1990 from dhiltgen/ci_mac_cross
Add macos cross-compile CI coverage
2024-01-16 12:31:37 -08:00
Michael Yang
d9bfb2f08f install: pin fedora to max 37
repos for fedora 38 and newer do not exist as of this commit

```
$ dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora38/x86_64/cuda-fedora38.repo
Adding repo from: https://developer.download.nvidia.com/compute/cuda/repos/fedora38/x86_64/cuda-fedora38.repo
Status code: 404 for https://developer.download.nvidia.com/compute/cuda/repos/fedora38/x86_64/cuda-fedora38.repo (IP: 152.195.19.142)
Error: Configuration of repo failed
```
2024-01-16 11:45:21 -08:00
Michael Yang
598d6d5572 Merge pull request #1937 from jmorganca/mxyng/remove-client-py
remove client.py
2024-01-16 11:01:41 -08:00
Bruce MacDonald
a897e833b8 do not cache prompt (#2018)
- prompt cache causes inferance to hang after some time
2024-01-16 13:48:05 -05:00
Patrick Devine
eef50accb4 Fix show parameters (#2017) 2024-01-16 10:34:44 -08:00
Michael Yang
05d53de7a1 Merge pull request #1968 from jmorganca/mxyng/fix-request-retry
fix: request retry with error
2024-01-16 10:33:50 -08:00
Daniel Hiltgen
8795447dad Merge pull request #1966 from fpreiss/fpreiss/gen_linux_cuda_detection
improve cuda detection (rel. issue #1704)
2024-01-14 18:00:11 -08:00
Daniel Hiltgen
b3035112a1 Add macos cross-compile CI coverage 2024-01-14 10:38:59 -08:00
Daniel Hiltgen
95ad9a9fc8 Merge pull request #1988 from dhiltgen/fix_intel_mac
Fix typo in arm mac arch script
2024-01-14 08:45:18 -08:00
Daniel Hiltgen
3ca5f69ce8 Fix typo in arm mac arch script 2024-01-14 08:32:57 -08:00
Daniel Hiltgen
cfa6337960 Merge pull request #1982 from dhiltgen/fix_intel_mac
Fix intel mac build
2024-01-14 08:26:46 -08:00
Alexander F. Rødseth
f4bf1d514f Let gpu.go and gen_linux.sh also find CUDA on Arch Linux 2024-01-14 13:40:36 +01:00
Jeffrey Morgan
557110d0ba Disable mmap with lora layers (#1985) 2024-01-13 23:36:31 -05:00
Daniel Hiltgen
2ecb247276 Fix intel mac build
Make sure we're building an x86 ext_server lib when cross-compiling
2024-01-13 14:46:34 -08:00
Jeffrey Morgan
288ef8ff95 add gcc -lstdc++ flag for linux cpu (#1974) 2024-01-13 03:53:00 -05:00
Jeffrey Morgan
4cf17990f7 use g++ to build libext_server.so on linux (#1972) 2024-01-13 03:12:42 -05:00
Michael Yang
b6c0ef1e70 Merge pull request #1961 from jmorganca/mxyng/rm-double-newline
remove double newlines in /set parameter
2024-01-12 15:18:19 -08:00
Michael Yang
356d178f6e Merge pull request #1971 from jmorganca/mxyng/max-context-length
add max context length check
2024-01-12 15:10:25 -08:00
Michael Yang
eaed6f8c45 add max context length check 2024-01-12 14:54:07 -08:00
purificant
6a5bfc2ed6 update actions/setup-go 2024-01-12 22:27:25 +00:00
Michael Yang
cf29bd2d72 fix: request retry with error
this fixes a subtle bug with makeRequestWithRetry where an HTTP status
error on a retried request will potentially not return the right err
2024-01-12 13:32:27 -08:00
Fabian Preiss
905862e17b improve cuda detection (rel. issue #1704) 2024-01-12 21:59:19 +01:00
Patrick Devine
565f8a3c44 Convert the REPL to use /api/chat for interactive responses (#1936) 2024-01-12 12:05:52 -08:00
Michael Yang
5121b7ac9c remove double newlines in /set parameter 2024-01-12 11:21:15 -08:00
Michael Yang
a70262c6b2 Update README.md
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-01-12 09:43:04 -08:00
Tristram Oaten
40a0a90a88 Add group delete to uninstall instructions (#1924)
After executing the `userdel ollama` command, I saw this message:

```sh
$ sudo userdel ollama
userdel: group ollama not removed because it has other members.
```

Which reminded me that I had to remove the dangling group too. For completeness, the uninstall instructions should do this too.

Thanks!
2024-01-12 00:07:00 -05:00
Michael Yang
cbe20c4375 update readme 2024-01-11 16:24:37 -08:00
Michael Yang
5ffbbea1d7 remove client.py 2024-01-11 15:53:10 -08:00
Daniel Hiltgen
3773fb6465 Merge pull request #1935 from dhiltgen/cpu_fallback
Fix up the CPU fallback selection
2024-01-11 15:52:32 -08:00
Daniel Hiltgen
7427fa1387 Fix up the CPU fallback selection
The memory changes and multi-variant change had some merge
glitches I missed.  This fixes them so we actually get the cpu llm lib
and best variant for the given system.
2024-01-11 15:27:06 -08:00
Michael Yang
f84537e0e0 Merge pull request #1934 from jmorganca/mxyng/fix-slices
fix build and lint
2024-01-11 14:36:20 -08:00
Michael Yang
d2be6387c9 fix typo 2024-01-11 14:25:21 -08:00
Michael Yang
d7af35d3d0 import fmt 2024-01-11 14:22:32 -08:00
Michael Yang
defc1dbd6e use x/exp/slices 2024-01-11 14:20:13 -08:00
Daniel Hiltgen
de2fbdec99 Merge pull request #1819 from dhiltgen/multi_variant
Support multiple LLM libs; ROCm v5 and v6; Rosetta, AVX, and AVX2 compatible CPU builds
2024-01-11 14:00:48 -08:00
Eduard van Valkenburg
f5faf79aa1 Add semantic kernel to Readme (#1931) 2024-01-11 14:40:23 -05:00
Michael Yang
f4f939de28 Merge pull request #1552 from jmorganca/mxyng/lint-test
add lint and test on pull_request
2024-01-11 09:37:45 -08:00
Daniel Hiltgen
39928a42e8 Always dynamically load the llm server library
This switches darwin to dynamic loading, and refactors the code now that no
static linking of the library is used on any platform
2024-01-11 08:42:47 -08:00
Daniel Hiltgen
d88c527be3 Build multiple CPU variants and pick the best
This reduces the built-in linux version to not use any vector extensions
which enables the resulting builds to run under Rosetta on MacOS in
Docker.  Then at runtime it checks for the actual CPU vector
extensions and loads the best CPU library available
2024-01-11 08:42:47 -08:00
Fabian Preiß
3bc8b9832b fix gpu_test.go Error (same type) uint64->uint32 (#1921) 2024-01-11 08:22:23 -05:00
Jeffrey Morgan
ab6be852c7 revisit memory allocation to account for full kv cache on main gpu 2024-01-11 01:45:31 -05:00
Daniel Hiltgen
052b33b81b DRY out the Dockefile.build 2024-01-10 17:27:51 -08:00
Daniel Hiltgen
8da7bef05f Support multiple variants for a given llm lib type
In some cases we may want multiple variants for a given GPU type or CPU.
This adds logic to have an optional Variant which we can use to select
an optimal library, but also allows us to try multiple variants in case
some fail to load.

This can be useful for scenarios such as ROCm v5 vs v6 incompatibility
or potentially CPU features.
2024-01-10 17:27:51 -08:00
Jeffrey Morgan
b24e8d17b2 Increase minimum CUDA memory allocation overhead and fix minimum overhead for multi-gpu (#1896)
* increase minimum cuda overhead and fix minimum overhead for multi-gpu

* fix multi gpu overhead

* limit overhead to 10% of all gpus

* better wording

* allocate fixed amount before layers

* fixed only includes graph alloc
2024-01-10 19:08:51 -05:00
Jeffrey Morgan
f83881390f revert submodule back to 328b83de23b33240e28f4e74900d1d06726f5eb1 2024-01-10 18:42:39 -05:00
Daniel Hiltgen
ac70ab6761 Merge pull request #1914 from dhiltgen/smarter_cuda_detection
Smarter GPU Management library detection
2024-01-10 15:21:56 -08:00
Daniel Hiltgen
3c49c3ab0d Harden GPU mgmt library lookup
When there are multiple management libraries installed on a system
not every one will be compatible with the current driver.  This change
improves our management library algorithm to build up a set of discovered
libraries based on glob patterns, and then try all of them until we're able to
load one without error.
2024-01-10 15:06:41 -08:00
Daniel Hiltgen
9754ae4c89 Support optional override of the target archictures
This can help speed up incremental builds when you're only testing one
archicture, like amd64.  E.g.
BUILD_ARCH=amd64 ./scripts/build_linux.sh && scp ./dist/ollama-linux-amd64 test-system:
2024-01-10 14:43:24 -08:00
Jeffrey Morgan
224fbf2795 update submodule to commit 1fc2f265ff9377a37fd2c61eae9cd813a3491bea until its main branch is fixed 2024-01-10 17:03:15 -05:00
Jeffrey Morgan
2c6e8f5248 Update submodule to 6efb8eb30e7025b168f3fda3ff83b9b386428ad6 (#1885)
* update submodule to `6efb8eb30e7025b168f3fda3ff83b9b386428ad6`
* unblock condition variable in `update_slots` when closing server
2024-01-10 16:48:38 -05:00
Jeffrey Morgan
34344d801c clean up cmake build directory when cross compiling macOS builds 2024-01-09 17:13:56 -05:00
Robin Glauser
e868c8a5c7 Update api.md (#1878)
Fixed assistant in the example response.
2024-01-09 16:21:17 -05:00
Jeffrey Morgan
c336693f07 calculate overhead based number of gpu devices (#1875) 2024-01-09 15:53:33 -05:00
Daniel Hiltgen
e89dc1d54b Merge pull request #1874 from dhiltgen/correct_cuda_min
Set corret CUDA minimum compute capability version
2024-01-09 11:37:22 -08:00
Daniel Hiltgen
1961a81f03 Set corret CUDA minimum compute capability version
If you attempt to run the current CUDA build on compute capability 5.2
cards, you'll hit the following failure:
cuBLAS error 15 at ggml-cuda.cu:7956: the requested functionality is not supported
2024-01-09 11:28:24 -08:00
Jeffrey Morgan
8a8c7e7f8d only build for metal on arm64 2024-01-09 13:51:08 -05:00
Jeffrey Morgan
6df83e6daa update rough cuda overhead estimate to 15% + 384MiB 2024-01-09 13:51:08 -05:00
Michael Yang
f921e2696e typo 2024-01-09 09:45:42 -08:00
Michael Yang
4a33cede20 remove unused fields and functions 2024-01-09 09:37:40 -08:00
Michael Yang
f95d2f25f3 fix temporary history file permissions 2024-01-09 09:36:58 -08:00
Michael Yang
2b9892a808 fix(windows): modelpath and list 2024-01-09 09:36:58 -08:00
Michael Yang
2bb2bdd5d4 fix lint 2024-01-09 09:36:58 -08:00
Michael Yang
acfc376efd add .golangci.yaml 2024-01-09 09:36:58 -08:00
Michael Yang
997253143f add lint and test on pull_request 2024-01-09 09:36:58 -08:00
Michael Yang
62023177f6 Merge pull request #1614 from jmorganca/mxyng/fix-set-template
fix: set template without triple quotes
2024-01-09 09:36:24 -08:00
Jeffrey Morgan
6164f378f2 revert cuda overhead to 20% 2024-01-09 00:54:29 -05:00
Jeffrey Morgan
f387e9631b use runner if cuda alloc won't fit 2024-01-09 00:44:34 -05:00
Jeffrey Morgan
6566387ae3 add TODO for cuda overhead 2024-01-09 00:28:03 -05:00
Jeffrey Morgan
37708931fb update cuda overhead to 20% to fix crashes when switching between models and large context sizes 2024-01-09 00:05:23 -05:00
Jeffrey Morgan
f6cb0a553c update cuda overhead to 15% or 400MiB 2024-01-08 23:45:45 -05:00
Jeffrey Morgan
2680078c13 fix build on linux 2024-01-08 23:44:13 -05:00
Jeffrey Morgan
f1b7e5f560 update overhead to 15% 2024-01-08 23:37:45 -05:00
Jeffrey Morgan
cb534e6ac2 use 10% vram overhead for cuda 2024-01-08 23:17:44 -05:00
Jeffrey Morgan
58ce2d8273 better estimate scratch buffer size 2024-01-08 21:32:44 -05:00
Jeffrey Morgan
18ddf6d57d fix windows build 2024-01-08 20:04:01 -05:00
Michael Yang
61e6502449 Merge pull request #1818 from jmorganca/mxyng/fix-alt-prompt
fix(cmd): history in alt prompt
2024-01-08 13:48:34 -08:00
Jeffrey Morgan
08f1e18965 Offload layers to GPU based on new model size estimates (#1850)
* select layers based on estimated model memory usage

* always account for scratch vram

* dont load +1 layers

* better estmation for graph alloc

* Update gpu/gpu_darwin.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update llm/llm.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update llm/llm.go

* add overhead for cuda memory

* Update llm/llm.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* fix build error on linux

* address comments

---------

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2024-01-08 16:42:00 -05:00
Bruce MacDonald
7e8f7c8358 remove ggml automatic re-pull (#1856) 2024-01-08 14:41:01 -05:00
Bruce MacDonald
3f3eb19a3b document response in modelfile template variables (#1428) 2024-01-08 14:38:51 -05:00
Daniel Hiltgen
059ae4585e Merge pull request #1834 from dhiltgen/old_cuda
Detect very old CUDA GPUs and fall back to CPU
2024-01-07 10:39:49 -08:00
Daniel Hiltgen
6347f501ca Merge pull request #1828 from dhiltgen/fix_llava
Accept windows paths for image processing
2024-01-07 09:05:46 -08:00
Jeffrey Morgan
5feec959ad dont use -Wall in static build (#1833) 2024-01-07 10:39:19 -05:00
Jeffrey Morgan
dbdd50b283 add -DCMAKE_SYSTEM_NAME=Darwin cmake flag (#1832) 2024-01-07 00:46:17 -05:00
Daniel Hiltgen
d74ce6bd4f Detect very old CUDA GPUs and fall back to CPU
If we try to load the CUDA library on an old GPU, it panics and crashes
the server.  This checks the compute capability before we load the
library so we can gracefully fall back to CPU mode.
2024-01-06 21:40:29 -08:00
Guilherme Baptista
57942b4676 Update README.md - Community Integrations - Ollama for Ruby (#1830) 2024-01-06 22:31:39 -05:00
Daniel Hiltgen
e0d05b0f1e Accept windows paths for image processing
This enhances our regex to support windows style paths.  The regex will
match invalid path specifications, but we'll still validate file
existence and filter out mismatches
2024-01-06 10:50:27 -08:00
Daniel Hiltgen
2d9dd14f27 Merge pull request #1697 from dhiltgen/win_docs
Add windows native build instructions
2024-01-05 19:34:20 -08:00
Jeffrey Morgan
1caa56128f add cuda lib path for nvidia container toolkit 2024-01-05 21:10:37 -05:00
Michael Yang
0101e76dbe Merge pull request #1797 from sublimator/nd-allow-extension-origins-still-needs-explicit-listing-2024-01-05
fix: allow extension origins (still needs explicit listing), fixes #1686
2024-01-05 17:20:09 -08:00
Michael Yang
2ef9352b94 fix(cmd): history in alt mode 2024-01-05 16:20:02 -08:00
Michael Yang
5580ae2472 fix: set template without triple quotes 2024-01-05 15:51:33 -08:00
Bruce MacDonald
3a9f447141 only pull gguf model if already exists (#1817) 2024-01-05 18:50:00 -05:00
Patrick Devine
9c2941e61b switch api for ShowRequest to use the name field (#1816) 2024-01-05 15:06:43 -08:00
Patrick Devine
238ac5e765 Add unit tests for Parser (#1815) 2024-01-05 14:04:31 -08:00
Bruce MacDonald
4f4980b66b simplify ggml update logic (#1814)
- additional information is now available in show response, use this to pull gguf before running
- make gguf updates cancellable
2024-01-05 15:22:32 -05:00
Patrick Devine
22e93efa41 add show info command and fix the modelfile 2024-01-05 12:20:05 -08:00
Patrick Devine
2909dce894 split up interactive generation 2024-01-05 12:20:05 -08:00
Jeffrey Morgan
df32537312 gpu: read memory info from all cuda devices (#1802)
* gpu: read memory info from all cuda devices

* add `LOOKUP_SIZE` constant

* better constant name

* address comments
2024-01-05 11:25:58 -05:00
Bruce MacDonald
3367b5f3df remove unused generate patches (#1810) 2024-01-05 11:25:45 -05:00
Matt Williams
46edbbc518 Merge pull request #1801 from jmorganca/mattw/correctdockerlink 2024-01-04 19:20:45 -08:00
Michael Yang
d2ff18cd6b Merge pull request #1791 from jmorganca/mxyng/update-build
update Dockerfile.build
2024-01-04 19:13:44 -08:00
Matt Williams
df086d3c8c fix docker doc to point to hub
Signed-off-by: Matt Williams <m@technovangelist.com>
2024-01-04 18:42:23 -08:00
Nicholas Dudfield
8baaaa39c0 Allow extension origins (still needs explicit listing), fixes #1686 2024-01-05 09:06:47 +07:00
Michael Yang
f9961c70ae update build 2024-01-04 17:34:38 -08:00
Daniel Hiltgen
cd8fad3398 Merge pull request #1790 from dhiltgen/llm_code_shuffle
Cleaup stale submodule
2024-01-04 13:47:25 -08:00
Daniel Hiltgen
9983fa5f4e Cleaup stale submodule
If the tree has a stale submodule, make sure we clean it up first
2024-01-04 13:40:16 -08:00
Daniel Hiltgen
dfda91c2ee Merge pull request #1788 from dhiltgen/llm_code_shuffle
Revamp code layout for the llm directory and llama.cpp submodule
2024-01-04 13:14:28 -08:00
Daniel Hiltgen
fac9060da5 Init submodule with new path 2024-01-04 13:00:13 -08:00
Daniel Hiltgen
a554616f8e remove old llama.cpp submodule path 2024-01-04 12:12:21 -08:00
Daniel Hiltgen
77d96da94b Code shuffle to clean up the llm dir 2024-01-04 12:12:05 -08:00
Brian Murray
0d6e3565ae Add embeddings to API (#1773) 2024-01-04 15:00:52 -05:00
Daniel Hiltgen
b5939008a1 Merge pull request #1785 from dhiltgen/win_native_cli
Load dynamic cpu lib on windows
2024-01-04 08:55:01 -08:00
Daniel Hiltgen
e9ce91e9a6 Load dynamic cpu lib on windows
On linux, we link the CPU library in to the Go app and fall back to it
when no GPU match is found. On windows we do not link in the CPU library
so that we can better control our dependencies for the CLI.  This fixes
the logic so we correctly fallback to the dynamic CPU library
on windows.
2024-01-04 08:41:41 -08:00
Bruce MacDonald
4ad6c9b11f fix: pull either original model or from model on create (#1774) 2024-01-04 01:34:38 -05:00
Jeffrey Morgan
c0285158a9 tweak memory requirements error text 2024-01-03 19:47:18 -05:00
Jeffrey Morgan
77a66df72c add macOS memory check for 47B models 2024-01-03 19:46:16 -05:00
Jeffrey Morgan
5b4837f881 remove unused filetype check 2024-01-03 19:45:39 -05:00
Jeffrey Morgan
29340c2e62 update cmake flags for amd64 macOS (#1780)
* update cmake flags for intel macOS

* remove `LLAMA_K_QUANTS`

* put back `CMAKE_OSX_DEPLOYMENT_TARGET` and disable `LLAMA_F16C`
2024-01-03 19:22:15 -05:00
Daniel Hiltgen
d5ec730354 Merge pull request #1779 from dhiltgen/refined_amd_gpu_list
Improve maintainability of Radeon card list
2024-01-03 16:18:57 -08:00
Daniel Hiltgen
8bed487aba Merge pull request #1778 from dhiltgen/wsl1
Fail fast on WSL1 while allowing on WSL2
2024-01-03 16:18:41 -08:00
Daniel Hiltgen
c1a10a6e9b Merge pull request #1781 from dhiltgen/cpu_only_build
Fix CPU only builds
2024-01-03 16:18:25 -08:00
Daniel Hiltgen
ddbfa6fe31 Fix CPU only builds
Go embed doesn't like when there's no matching files, so put
a dummy placeholder in to allow building without any GPU support
If no "server" library is found, it's safely ignored at runtime.
2024-01-03 16:08:34 -08:00
Daniel Hiltgen
2fcd41ef81 Fail fast on WSL1 while allowing on WSL2
This prevents users from accidentally installing on WSL1 with instructions
guiding how to upgrade their WSL instance to version 2.  Once running WSL2
if you have an NVIDIA card, you can follow their instructions to set up
GPU passthrough and run models on the GPU.  This is not possible on WSL1.
2024-01-03 16:02:32 -08:00
Daniel Hiltgen
16f4603b67 Improve maintainability of Radeon card list
This moves the list of AMD GPUs to an easier to maintain list which
should make it easier to update over time.
2024-01-03 15:16:56 -08:00
Daniel Hiltgen
1184686649 Merge pull request #1776 from dhiltgen/render_group
Add ollama user to render group for Radeon support
2024-01-03 13:07:54 -08:00
Daniel Hiltgen
2588cb2daa Add ollama user to render group for Radeon support
For the ROCm libraries to access the driver, we need to add the ollama user
to the render group.
2024-01-03 12:56:31 -08:00
Jeffrey Morgan
c7ea8f237e set num_gpu to 1 only by default on darwin arm64 (#1771) 2024-01-03 14:10:29 -05:00
Bruce MacDonald
0b3118e0af fix: relay request opts to loaded llm prediction (#1761) 2024-01-03 12:01:42 -05:00
Daniel Hiltgen
05face44ef Merge pull request #1683 from dhiltgen/fix_windows_test
Fix windows system memory lookup
2024-01-03 09:00:39 -08:00
Daniel Hiltgen
a2ad952440 Fix windows system memory lookup
This refines the gpu package error handling and fixes a bug with the
system memory lookup on windows.
2024-01-03 08:50:01 -08:00
Daniel Hiltgen
5fea4410be Merge pull request #1680 from dhiltgen/better_patching
Refactor how we augment llama.cpp and refine windows native build
2024-01-03 08:10:17 -08:00
Bruce MacDonald
b846eb64d0 Fix template api doc description (#1661) 2024-01-03 11:00:59 -05:00
Cole Gillespie
3c5dd9ed1d Update README.md (#1766) 2024-01-03 10:44:22 -05:00
Jeffrey Morgan
b17ccd0542 Update import.md 2024-01-02 22:28:18 -05:00
Patrick Devine
d0409f772f keyboard shortcut help (#1764) 2024-01-02 18:04:12 -08:00
Jeffrey Morgan
ec261422af use docker build in build scripts 2024-01-02 19:32:54 -05:00
Daniel Hiltgen
0498f7ce56 Get rid of one-line llama.log
This one log line was triggering a single line llama.log to be generated
in the pwd of the server
2024-01-02 15:36:16 -08:00
Daniel Hiltgen
738a8d12eb Rename the ollama cmakefile 2024-01-02 15:36:16 -08:00
Daniel Hiltgen
d966b730ac Switch windows build to fully dynamic
Refactor where we store build outputs, and support a fully dynamic loading
model on windows so the base executable has no special dependencies thus
doesn't require a special PATH.
2024-01-02 15:36:16 -08:00
Daniel Hiltgen
9a70aecccb Refactor how we augment llama.cpp
This changes the model for llama.cpp inclusion so we're not applying a patch,
but instead have the C++ code directly in the ollama tree, which should make it
easier to refine and update over time.
2024-01-02 15:35:55 -08:00
Karim ElGhandour
22cd5eaab6 Added Ollama-SwiftUI to integrations (#1747) 2024-01-02 09:47:50 -05:00
Dane Madsen
304a8799ca Update README.md (#1757) 2024-01-02 09:47:08 -05:00
Jeffrey Morgan
2a2fa3c329 api.md cleanup & formatting 2023-12-27 14:32:35 -05:00
Jeffrey Morgan
55978c1dc9 clean up cache api option 2023-12-27 14:27:45 -05:00
Jeffrey Morgan
d4ebdadbe7 enable cache_prompt by default 2023-12-27 14:23:42 -05:00
Daniel Hiltgen
e201efa14b Add windows native build instructions 2023-12-25 08:31:34 -08:00
Icelain
c5f21f73a4 follow best practices by adding resp.Body.Close() (#1708) 2023-12-25 09:01:37 -05:00
Jeffrey Morgan
371bc73531 Update README.md 2023-12-24 11:54:08 -05:00
Jeffrey Morgan
c651d8b824 Update README.md 2023-12-23 11:18:12 -05:00
Daniel Hiltgen
cf50ef5b51 Merge pull request #1684 from dhiltgen/tag_integration_tests
Guard integration tests with a tag
2023-12-22 16:43:41 -08:00
Daniel Hiltgen
697bea6939 Guard integration tests with a tag
This should help CI avoid running the integration test logic in a
container where it's not currently possible.
2023-12-22 16:33:27 -08:00
K0IN
10da41d677 Add Cache flag to api (#1642) 2023-12-22 17:16:20 -05:00
Bruce MacDonald
db356c8519 post-response templating (#1427) 2023-12-22 17:07:05 -05:00
Jeffrey Morgan
b80081022f cache docker builds in build_linux.sh 2023-12-22 16:01:20 -05:00
Matt Williams
790457398a Merge pull request #1677 from jmorganca/mattw/docrunupdate
update where are models stored q
2023-12-22 09:56:27 -08:00
Matt Williams
511069a2a5 update where are models stored q
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-12-22 09:48:44 -08:00
Matt Williams
5a85070c22 Update readmes, requirements, packagejsons, etc for all examples (#1452)
Most of the examples needed updates of Readmes to show how to run them. Some of the requirements.txt files had extra content that wasn't needed, or missing altogether. Apparently some folks like to run npm start
to run typescript, so a script was added to all typescript examples which
hadn't been done before.

Basically just a lot of cleanup.

Signed-off-by: Matt Williams <m@technovangelist.com>
2023-12-22 09:10:41 -08:00
Matt Williams
291700c92d Clean up documentation (#1506)
* Clean up documentation

Will probably need to update with PRs for new release.

Signed-off-by: Matt Williams <m@technovangelist.com>

* Correcting to fit in 0.1.15 changes

Signed-off-by: Matt Williams <m@technovangelist.com>

* Update README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* addressing comments

Signed-off-by: Matt Williams <m@technovangelist.com>

* more api cleanup

Signed-off-by: Matt Williams <m@technovangelist.com>

* its llava not llama

Signed-off-by: Matt Williams <m@technovangelist.com>

* Update docs/troubleshooting.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Updated hosting to server and documented all env vars

Signed-off-by: Matt Williams <m@technovangelist.com>

* remove last of the cli descriptions

Signed-off-by: Matt Williams <m@technovangelist.com>

* Update README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* update further per conversation with jeff earlier today

Signed-off-by: Matt Williams <m@technovangelist.com>

* cleanup the doc readme

Signed-off-by: Matt Williams <m@technovangelist.com>

* move upgrade to faq

Signed-off-by: Matt Williams <m@technovangelist.com>

* first change

Signed-off-by: Matt Williams <m@technovangelist.com>

* updated

Signed-off-by: Matt Williams <m@technovangelist.com>

* Update docs/faq.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* examples in parent

Signed-off-by: Matt Williams <m@technovangelist.com>

* add exapmle for create model.

Signed-off-by: Matt Williams <m@technovangelist.com>

* update faq

Signed-off-by: Matt Williams <m@technovangelist.com>

* update create model api

Signed-off-by: Matt Williams <m@technovangelist.com>

* Update docs/api.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/faq.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/troubleshooting.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* update the readme in docs

Signed-off-by: Matt Williams <m@technovangelist.com>

* update a few more things

Signed-off-by: Matt Williams <m@technovangelist.com>

* Update docs/troubleshooting.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/faq.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update README.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/modelfile.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* Update docs/troubleshooting.md

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

---------

Signed-off-by: Matt Williams <m@technovangelist.com>
Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2023-12-22 09:10:01 -08:00
Daniel Hiltgen
9db28af84e Merge pull request #1675 from dhiltgen/less_verbose
Quiet down llama.cpp logging by default
2023-12-22 08:57:17 -08:00
Daniel Hiltgen
e5202eb687 Quiet down llama.cpp logging by default
By default builds will now produce non-debug and non-verbose binaries.
To enable verbose logs in llama.cpp and debug symbols in the
native code, set `CGO_CFLAGS=-g`
2023-12-22 08:47:18 -08:00
Daniel Hiltgen
96fb441abd Merge pull request #1146 from dhiltgen/ext_server_cgo
Add cgo implementation for llama.cpp
2023-12-22 08:16:31 -08:00
Daniel Hiltgen
495c06e4a6 Fix doc glitch 2023-12-21 18:21:31 -08:00
Daniel Hiltgen
fa24e73b82 Remove CPU build, fixup linux build script 2023-12-21 18:21:31 -08:00
Daniel Hiltgen
325d74985b Fix CPU performance on hyperthreaded systems
The default thread count logic was broken and resulted in 2x the number
of threads as it should on a hyperthreading CPU
resulting in thrashing and poor performance.
2023-12-21 16:23:36 -08:00
Bruce MacDonald
fabf2f3467 allow for starting llava queries with filepath (#1549) 2023-12-21 13:20:59 -05:00
Daniel Hiltgen
d9cd3d9667 Revive windows build
The windows native setup still needs some more work, but this gets it building
again and if you set the PATH properly, you can run the resulting exe on a cuda system.
2023-12-20 17:21:54 -08:00
Patrick Devine
a607d922f0 add FAQ for slow networking in WSL2 (#1646) 2023-12-20 16:27:24 -08:00
Daniel Hiltgen
7555ea44f8 Revamp the dynamic library shim
This switches the default llama.cpp to be CPU based, and builds the GPU variants
as dynamically loaded libraries which we can select at runtime.

This also bumps the ROCm library to version 6 given 5.7 builds don't work
on the latest ROCm library that just shipped.
2023-12-20 14:45:57 -08:00
Jeffrey Morgan
df06812494 Update api.md 2023-12-20 08:47:53 -05:00
Daniel Hiltgen
1d1eb1688c Additional nvidial-ml path to check 2023-12-19 15:52:34 -08:00
Michael Yang
23dc179350 Merge pull request #1619 from jmorganca/mxyng/fix-version-test
fix(test): use real version string for comparison
2023-12-19 15:48:52 -08:00
Michael Yang
63aac0edc5 fix(test): use real version string for comparison 2023-12-19 15:03:02 -08:00
Daniel Hiltgen
6558f94ed0 Fix darwin intel build 2023-12-19 13:32:24 -08:00
Erick Ghaumez
1ca484f67e Add Langchain Dart library (#1564)
* Add Langchain Dart

* Update README.md

---------

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2023-12-19 14:04:52 -05:00
Jeffrey Morgan
72b0c32fe9 Update README.md 2023-12-19 12:59:22 -05:00
Jeffrey Morgan
68c28224f8 Update README.md 2023-12-19 12:59:03 -05:00
Daniel Hiltgen
54dbfa4c4a Carry ggml-metal.metal as payload 2023-12-19 09:05:46 -08:00
Daniel Hiltgen
5646826a79 Add WSL2 path to nvidia-ml.so library 2023-12-19 09:05:46 -08:00
Daniel Hiltgen
3269535a4c Refine handling of shim presence
This allows the CPU only builds to work on systems with Radeon cards
2023-12-19 09:05:46 -08:00
Daniel Hiltgen
1b991d0ba9 Refine build to support CPU only
If someone checks out the ollama repo and doesn't install the CUDA
library, this will ensure they can build a CPU only version
2023-12-19 09:05:46 -08:00
Daniel Hiltgen
51082535e1 Add automated test for multimodal
A simple test case that verifies llava:7b can read text in an image
2023-12-19 09:05:46 -08:00
Daniel Hiltgen
9adca7f711 Bump llama.cpp to b1662 and set n_parallel=1 2023-12-19 09:05:46 -08:00
Daniel Hiltgen
89bbaafa64 Build linux using ubuntu 20.04
This changes the container-based linux build to use an older Ubuntu
distro to improve our compatibility matrix for older user machines
2023-12-19 09:05:46 -08:00
Daniel Hiltgen
35934b2e05 Adapted rocm support to cgo based llama.cpp 2023-12-19 09:05:46 -08:00
65a
f8ef4439e9 Use build tags to generate accelerated binaries for CUDA and ROCm on Linux.
The build tags rocm or cuda must be specified to both go generate and go build.
ROCm builds should have both ROCM_PATH set (and the ROCM SDK present) as well
as CLBlast installed (for GGML) and CLBlast_DIR set in the environment to the
CLBlast cmake directory (likely /usr/lib/cmake/CLBlast). Build tags are also
used to switch VRAM detection between cuda and rocm implementations, using
added "accelerator_foo.go" files which contain architecture specific functions
and variables. accelerator_none is used when no tags are set, and a helper
function addRunner will ignore it if it is the chosen accelerator. Fix go
generate commands, thanks @deadmeu for testing.
2023-12-19 09:05:46 -08:00
Daniel Hiltgen
d4cd695759 Add cgo implementation for llama.cpp
Run the server.cpp directly inside the Go runtime via cgo
while retaining the LLM Go abstractions.
2023-12-19 09:05:46 -08:00
Bruce MacDonald
5e7fd6906f Update images.go 2023-12-19 09:05:46 -08:00
Bruce MacDonald
811b1f03c8 deprecate ggml
- remove ggml runner
- automatically pull gguf models when ggml detected
- tell users to update to gguf in the case automatic pull fails

Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com>
2023-12-19 09:05:46 -08:00
Matt Williams
ed195f3562 Merge pull request #1595 from pgibler/main
Added cmdh to community section in README
2023-12-18 20:55:18 -08:00
Matt Williams
e0d0072ef1 Merge pull request #1592 from jmorganca/mattw/examplepruning
Lets get rid of these old modelfile examples
2023-12-18 20:29:48 -08:00
pgibler
620a2ffcfb Added cmdh to community section in README 2023-12-18 22:04:40 -05:00
Matt Williams
d287013f24 Lets get rid of these old modelfile examples
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-12-18 17:47:33 -08:00
Jeffrey Morgan
6b5bdfa6c9 update runner submodule 2023-12-18 17:33:46 -05:00
Jeffrey Morgan
c063ee4af0 update runner submodule to fix hipblas build 2023-12-18 15:41:13 -05:00
Bruce MacDonald
d99fa6ce0a send empty messages on last chat response (#1530) 2023-12-18 14:23:38 -05:00
Patrick Devine
3948c6ea06 add magic header for unit tests (#1558) 2023-12-18 10:41:02 -08:00
Jeffrey Morgan
b85982eb91 update runner submodule 2023-12-18 12:43:31 -05:00
Patrick Devine
86b0dd4b16 add API create/copy handlers (#1541) 2023-12-15 11:59:18 -08:00
Augustinas Malinauskas
f728738427 README with Enchanted iOS App (#1529)
* feat(docs): README with Enchanted iOS app

* Update README.md

---------

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2023-12-15 14:37:29 -05:00
Ian Purton
115048a0d8 Added Bionic GPT as a front end. (#1463)
* Added Bionic GPT as a front end.

* Update README.md

---------

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
2023-12-15 14:33:04 -05:00
Bruce MacDonald
1b417a7836 use exp slices for go 1.20 compatibility (#1544) 2023-12-15 14:15:56 -05:00
Patrick Devine
0174665d0e add API tests for list handler (#1535) 2023-12-14 18:18:25 -08:00
Patrick Devine
630518f0d9 Add unit test of API routes (#1528) 2023-12-14 16:47:40 -08:00
Bruce MacDonald
6e16098a60 remove sample_count from docs (#1527)
this info has not been returned from these endpoints in some time
2023-12-14 17:49:00 -05:00
137 changed files with 6833 additions and 3025 deletions

View File

@@ -2,8 +2,7 @@
ollama
app
dist
scripts
llm/llama.cpp/ggml
llm/llama.cpp/gguf
llm/llama.cpp
.env
.cache
test_data

106
.github/workflows/test.yaml vendored Normal file
View File

@@ -0,0 +1,106 @@
name: test
on:
pull_request:
jobs:
generate:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
arch: arm64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: '1.21'
cache: true
- if: ${{ startsWith(matrix.os, 'windows-') }}
shell: pwsh
run: |
$path = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
if ($path) {
$path = join-path $path 'Common7\Tools\vsdevcmd.bat'
if (test-path $path) {
cmd /s /c """$path"" $args && set" | where { $_ -match '(\w+)=(.*)' } | foreach {
echo "$($Matches[1])=$($Matches[2])" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
}
}
}
echo "C:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
- run: go get ./...
- run: go generate -x ./...
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: |
llm/llama.cpp/build/**/lib/*
lint:
needs: generate
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
arch: arm64
- os: macos-latest
arch: amd64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: '1.21'
cache: false
- uses: actions/download-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build
- uses: golangci/golangci-lint-action@v3
test:
needs: generate
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-latest
arch: arm64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: '1.21'
cache: true
- run: go get
- uses: actions/download-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build
- run: go build
- run: go test -v ./...

3
.gitignore vendored
View File

@@ -8,4 +8,5 @@ ollama
ggml-metal.metal
.cache
*.exe
.idea
.idea
test_data

14
.gitmodules vendored
View File

@@ -1,10 +1,4 @@
[submodule "llm/llama.cpp/ggml"]
path = llm/llama.cpp/ggml
url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty
shallow = true
[submodule "llm/llama.cpp/gguf"]
path = llm/llama.cpp/gguf
url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty
shallow = true
[submodule "llama.cpp"]
path = llm/llama.cpp
url = https://github.com/ggerganov/llama.cpp.git
shallow = true

27
.golangci.yaml Normal file
View File

@@ -0,0 +1,27 @@
run:
timeout: 5m
linters:
enable:
- asasalint
- bidichk
- bodyclose
- containedctx
- contextcheck
- exportloopref
- gocheckcompilerdirectives
# FIXME: for some reason this errors on windows
# - gofmt
# - goimports
- misspell
- nilerr
- unused
linters-settings:
errcheck:
# exclude the following functions since we don't generally
# need to be concerned with the returned errors
exclude-functions:
- encoding/binary.Read
- (*os.File).Seek
- (*bufio.Writer).WriteString
- (*github.com/spf13/pflag.FlagSet).Set
- (*github.com/jmorganca/ollama/llm.readSeekOffset).Seek

View File

@@ -1,31 +1,99 @@
# centos7 amd64 dependencies
FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
yum update -y && \
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
ARG GOLANG_VERSION=1.21.3
ARG CMAKE_VERSION=3.22.1
ARG CUDA_VERSION=11.3.1
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
COPY .git .git
COPY .gitmodules .gitmodules
COPY llm llm
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
ARG CMAKE_VERSION
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
# centos8 arm64 dependencies
FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y git cmake
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
ARG CMAKE_VERSION
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM base-${TARGETARCH}
ARG TARGETARCH
ARG GOFLAGS="'-ldflags -w -s'"
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
ARG CMAKE_VERSION
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
# install go
ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
ARG CMAKE_VERSION
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
# build the final binary
FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN sh gen_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1
ARG GOFLAGS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
RUN go build .
ENV GOOS=linux
ENV GOARCH=$TARGETARCH
ENV GOFLAGS=$GOFLAGS
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
ARG GOFLAGS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
RUN go build .
RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build .
FROM build-$TARGETARCH

View File

@@ -17,7 +17,7 @@ Get up and running with large language models locally.
### Windows
Coming soon!
Coming soon! For now, you can install Ollama on Windows via WSL2.
### Linux & WSL2
@@ -47,10 +47,12 @@ Here are some example open-source models that can be downloaded:
| Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 2 | 7B | 3.8GB | `ollama run llama2` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Dolphin Phi | 2.7B | 1.6GB | `ollama run dolphin-phi` |
| Phi-2 | 2.7B | 1.7GB | `ollama run phi` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Llama 2 | 7B | 3.8GB | `ollama run llama2` |
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
| Llama 2 13B | 13B | 7.3GB | `ollama run llama2:13b` |
@@ -59,9 +61,9 @@ Here are some example open-source models that can be downloaded:
| Vicuna | 7B | 3.8GB | `ollama run vicuna` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
> Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.
> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
## Customize your own model
## Customize a model
### Import from GGUF
@@ -128,6 +130,10 @@ For more examples, see the [examples](examples) directory. For more information
`ollama create` is used to create a model from a Modelfile.
```
ollama create mymodel -f ./Modelfile
```
### Pull a model
```
@@ -191,13 +197,19 @@ Install `cmake` and `go`:
brew install cmake go
```
Then generate dependencies and build:
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```
More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
### Running local builds
Next, start the server:
```
@@ -236,10 +248,14 @@ curl http://localhost:11434/api/chat -d '{
See the [API documentation](./docs/api.md) for all endpoints.
## Integrations
- [ollama-python](https://github.com/jmorganca/ollama-python)
## Community Integrations
### Web & Desktop
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
@@ -250,6 +266,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
- [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
### Terminal
@@ -261,6 +279,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ogpt.nvim](https://github.com/huynle/ogpt.nvim)
- [gptel Emacs client](https://github.com/karthink/gptel)
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
- [cmdh](https://github.com/pgibler/cmdh)
### Database
@@ -277,16 +296,22 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
- [Ollama for Dart](https://github.com/breitburg/dart-ollama)
- [Ollama for Laravel](https://github.com/cloudstudio/ollama-laravel)
- [LangChainDart](https://github.com/davidmigloz/langchain_dart)
- [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
### Mobile
- [Maid](https://github.com/danemadsen/Maid) (Mobile Artificial Intelligence Distribution)
- [Enchanted](https://github.com/AugustDev/enchanted)
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
### Extensions & Plugins
@@ -302,3 +327,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)

View File

@@ -309,6 +309,13 @@ func (c *Client) Heartbeat(ctx context.Context) error {
}
return nil
}
func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
var resp EmbeddingResponse
if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
return nil, err
}
return &resp, nil
}
func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {

View File

@@ -1,284 +0,0 @@
import os
import json
import requests
import os
import hashlib
import json
from pathlib import Path
BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
# The final response object will include statistics and additional data from the request. Use the callback function to override
# the default handler.
def generate(model_name, prompt, system=None, template=None, format="", context=None, options=None, callback=None):
try:
url = f"{BASE_URL}/api/generate"
payload = {
"model": model_name,
"prompt": prompt,
"system": system,
"template": template,
"context": context,
"options": options,
"format": format,
}
# Remove keys with None values
payload = {k: v for k, v in payload.items() if v is not None}
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Creating a variable to hold the context history of the final chunk
final_context = None
# Variable to hold concatenated response strings if no callback is provided
full_response = ""
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# If this is not the last chunk, add the "response" field value to full_response and print it
if not chunk.get("done"):
response_piece = chunk.get("response", "")
full_response += response_piece
print(response_piece, end="", flush=True)
# Check if it's the last chunk (done is true)
if chunk.get("done"):
final_context = chunk.get("context")
# Return the full response and the final context
return full_response, final_context
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None, None
# Create a blob file on the server if it doesn't exist.
def create_blob(digest, file_path):
url = f"{BASE_URL}/api/blobs/{digest}"
# Check if the blob exists
response = requests.head(url)
if response.status_code != 404:
return # Blob already exists, no need to upload
response.raise_for_status()
# Upload the blob
with open(file_path, 'rb') as file_data:
requests.post(url, data=file_data)
# Create a model from a Modelfile. Use the callback function to override the default handler.
def create(model_name, filename, callback=None):
try:
file_path = Path(filename).expanduser().resolve()
processed_lines = []
# Read and process the modelfile
with open(file_path, 'r') as f:
for line in f:
# Skip empty or whitespace-only lines
if not line.strip():
continue
command, args = line.split(maxsplit=1)
if command.upper() in ["FROM", "ADAPTER"]:
path = Path(args.strip()).expanduser()
# Check if path is relative and resolve it
if not path.is_absolute():
path = (file_path.parent / path)
# Skip if file does not exist for "model", this is handled by the server
if not path.exists():
processed_lines.append(line)
continue
# Calculate SHA-256 hash
with open(path, 'rb') as bin_file:
hash = hashlib.sha256()
hash.update(bin_file.read())
blob = f"sha256:{hash.hexdigest()}"
# Add the file to the remote server
create_blob(blob, path)
# Replace path with digest in the line
line = f"{command} @{blob}\n"
processed_lines.append(line)
# Combine processed lines back into a single string
modelfile_content = '\n'.join(processed_lines)
url = f"{BASE_URL}/api/create"
payload = {"name": model_name, "modelfile": modelfile_content}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the status
for line in response.iter_lines():
if line:
chunk = json.loads(line)
if callback:
callback(chunk)
else:
print(f"Status: {chunk.get('status')}")
except Exception as e:
print(f"An error occurred: {e}")
# Pull a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple
# calls to will share the same download progress. Use the callback function to override the default handler.
def pull(model_name, insecure=False, callback=None):
try:
url = f"{BASE_URL}/api/pull"
payload = {
"name": model_name,
"insecure": insecure
}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# Print the status message directly to the console
print(chunk.get('status', ''), end='', flush=True)
# If there's layer data, you might also want to print that (adjust as necessary)
if 'digest' in chunk:
print(f" - Digest: {chunk['digest']}", end='', flush=True)
print(f" - Total: {chunk['total']}", end='', flush=True)
print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
else:
print()
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
# Push a model to the model registry. Use the callback function to override the default handler.
def push(model_name, insecure=False, callback=None):
try:
url = f"{BASE_URL}/api/push"
payload = {
"name": model_name,
"insecure": insecure
}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# Print the status message directly to the console
print(chunk.get('status', ''), end='', flush=True)
# If there's layer data, you might also want to print that (adjust as necessary)
if 'digest' in chunk:
print(f" - Digest: {chunk['digest']}", end='', flush=True)
print(f" - Total: {chunk['total']}", end='', flush=True)
print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
else:
print()
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
# List models that are available locally.
def list():
try:
response = requests.get(f"{BASE_URL}/api/tags")
response.raise_for_status()
data = response.json()
models = data.get('models', [])
return models
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Copy a model. Creates a model with another name from an existing model.
def copy(source, destination):
try:
# Create the JSON payload
payload = {
"source": source,
"destination": destination
}
response = requests.post(f"{BASE_URL}/api/copy", json=payload)
response.raise_for_status()
# If the request was successful, return a message indicating that the copy was successful
return "Copy successful"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Delete a model and its data.
def delete(model_name):
try:
url = f"{BASE_URL}/api/delete"
payload = {"name": model_name}
response = requests.delete(url, json=payload)
response.raise_for_status()
return "Delete successful"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Show info about a model.
def show(model_name):
try:
url = f"{BASE_URL}/api/show"
payload = {"name": model_name}
response = requests.post(url, json=payload)
response.raise_for_status()
# Parse the JSON response and return it
data = response.json()
return data
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
def heartbeat():
try:
url = f"{BASE_URL}/"
response = requests.head(url)
response.raise_for_status()
return "Ollama is running"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return "Ollama is not running"

View File

@@ -59,13 +59,13 @@ type ChatRequest struct {
type Message struct {
Role string `json:"role"` // one of ["system", "user", "assistant"]
Content string `json:"content"`
Images []ImageData `json:"images, omitempty"`
Images []ImageData `json:"images,omitempty"`
}
type ChatResponse struct {
Model string `json:"model"`
CreatedAt time.Time `json:"created_at"`
Message *Message `json:"message,omitempty"`
Message Message `json:"message"`
Done bool `json:"done"`
@@ -137,17 +137,30 @@ type EmbeddingResponse struct {
}
type CreateRequest struct {
Name string `json:"name"`
Model string `json:"model"`
Path string `json:"path"`
Modelfile string `json:"modelfile"`
Stream *bool `json:"stream,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
}
type DeleteRequest struct {
Model string `json:"model"`
// Name is deprecated, see Model
Name string `json:"name"`
}
type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
Template string `json:"template"`
Options map[string]interface{} `json:"options"`
// Name is deprecated, see Model
Name string `json:"name"`
}
@@ -166,11 +179,14 @@ type CopyRequest struct {
}
type PullRequest struct {
Name string `json:"name"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
}
type ProgressResponse struct {
@@ -181,11 +197,14 @@ type ProgressResponse struct {
}
type PushRequest struct {
Name string `json:"name"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
}
type ListResponse struct {
@@ -194,6 +213,7 @@ type ListResponse struct {
type ModelResponse struct {
Name string `json:"name"`
Model string `json:"model"`
ModifiedAt time.Time `json:"modified_at"`
Size int64 `json:"size"`
Digest string `json:"digest"`

View File

@@ -17,9 +17,7 @@ import (
"os/exec"
"os/signal"
"path/filepath"
"regexp"
"runtime"
"slices"
"strings"
"syscall"
"time"
@@ -33,13 +31,10 @@ import (
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/readline"
"github.com/jmorganca/ollama/server"
"github.com/jmorganca/ollama/version"
)
type ImageData []byte
func CreateHandler(cmd *cobra.Command, args []string) error {
filename, _ := cmd.Flags().GetString("file")
filename, err := filepath.Abs(filename)
@@ -156,7 +151,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, args); err != nil {
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
case err != nil:
@@ -418,11 +413,10 @@ func PullHandler(cmd *cobra.Command, args []string) error {
func RunGenerate(cmd *cobra.Command, args []string) error {
interactive := true
opts := generateOptions{
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
Images: []ImageData{},
}
format, err := cmd.Flags().GetString("format")
@@ -463,18 +457,135 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
type generateContextKey string
type generateOptions struct {
type runOptions struct {
Model string
Prompt string
Messages []api.Message
WordWrap bool
Format string
System string
Template string
Images []ImageData
Images []api.ImageData
Options map[string]interface{}
}
func generate(cmd *cobra.Command, opts generateOptions) error {
type displayResponseState struct {
lineLength int
wordBuffer string
}
func displayResponse(content string, wordWrap bool, state *displayResponseState) {
termWidth, _, _ := term.GetSize(int(os.Stdout.Fd()))
if wordWrap && termWidth >= 10 {
for _, ch := range content {
if state.lineLength+1 > termWidth-5 {
if len(state.wordBuffer) > termWidth-10 {
fmt.Printf("%s%c", state.wordBuffer, ch)
state.wordBuffer = ""
state.lineLength = 0
continue
}
// backtrack the length of the last word and clear to the end of the line
fmt.Printf("\x1b[%dD\x1b[K\n", len(state.wordBuffer))
fmt.Printf("%s%c", state.wordBuffer, ch)
state.lineLength = len(state.wordBuffer) + 1
} else {
fmt.Print(string(ch))
state.lineLength += 1
switch ch {
case ' ':
state.wordBuffer = ""
case '\n':
state.lineLength = 0
default:
state.wordBuffer += string(ch)
}
}
}
} else {
fmt.Printf("%s%s", state.wordBuffer, content)
if len(state.wordBuffer) > 0 {
state.wordBuffer = ""
}
}
}
func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
client, err := api.ClientFromEnvironment()
if err != nil {
return nil, err
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
cancelCtx, cancel := context.WithCancel(cmd.Context())
defer cancel()
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT)
go func() {
<-sigChan
cancel()
}()
var state *displayResponseState = &displayResponseState{}
var latest api.ChatResponse
var fullResponse strings.Builder
var role string
fn := func(response api.ChatResponse) error {
p.StopAndClear()
latest = response
role = response.Message.Role
content := response.Message.Content
fullResponse.WriteString(content)
displayResponse(content, opts.WordWrap, state)
return nil
}
req := &api.ChatRequest{
Model: opts.Model,
Messages: opts.Messages,
Format: opts.Format,
Options: opts.Options,
}
if err := client.Chat(cancelCtx, req, fn); err != nil {
if errors.Is(err, context.Canceled) {
return nil, nil
}
return nil, err
}
if len(opts.Messages) > 0 {
fmt.Println()
fmt.Println()
}
verbose, err := cmd.Flags().GetBool("verbose")
if err != nil {
return nil, err
}
if verbose {
latest.Summary()
}
return &api.Message{Role: role, Content: fullResponse.String()}, nil
}
func generate(cmd *cobra.Command, opts runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
@@ -493,11 +604,6 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
generateContext = []int{}
}
termWidth, _, err := term.GetSize(int(os.Stdout.Fd()))
if err != nil {
opts.WordWrap = false
}
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()
@@ -509,57 +615,19 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
cancel()
}()
var currentLineLength int
var wordBuffer string
var state *displayResponseState = &displayResponseState{}
fn := func(response api.GenerateResponse) error {
p.StopAndClear()
latest = response
content := response.Response
termWidth, _, _ = term.GetSize(int(os.Stdout.Fd()))
if opts.WordWrap && termWidth >= 10 {
for _, ch := range response.Response {
if currentLineLength+1 > termWidth-5 {
if len(wordBuffer) > termWidth-10 {
fmt.Printf("%s%c", wordBuffer, ch)
wordBuffer = ""
currentLineLength = 0
continue
}
// backtrack the length of the last word and clear to the end of the line
fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
fmt.Printf("%s%c", wordBuffer, ch)
currentLineLength = len(wordBuffer) + 1
} else {
fmt.Print(string(ch))
currentLineLength += 1
switch ch {
case ' ':
wordBuffer = ""
case '\n':
currentLineLength = 0
default:
wordBuffer += string(ch)
}
}
}
} else {
fmt.Printf("%s%s", wordBuffer, response.Response)
if len(wordBuffer) > 0 {
wordBuffer = ""
}
}
displayResponse(content, opts.WordWrap, state)
return nil
}
images := make([]api.ImageData, 0)
for _, i := range opts.Images {
images = append(images, api.ImageData(i))
}
request := api.GenerateRequest{
Model: opts.Model,
Prompt: opts.Prompt,
@@ -568,7 +636,6 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
System: opts.System,
Template: opts.Template,
Options: opts.Options,
Images: images,
}
if err := client.Generate(ctx, &request, fn); err != nil {
@@ -577,6 +644,7 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
}
return err
}
if opts.Prompt != "" {
fmt.Println()
fmt.Println()
@@ -601,422 +669,6 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
return nil
}
type MultilineState int
const (
MultilineNone MultilineState = iota
MultilinePrompt
MultilineSystem
MultilineTemplate
)
func modelIsMultiModal(cmd *cobra.Command, name string) bool {
// get model details
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return false
}
req := api.ShowRequest{Name: name}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return false
}
return slices.Contains(resp.Details.Families, "clip")
}
func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
multiModal := modelIsMultiModal(cmd, opts.Model)
// load the model
loadOpts := generateOptions{
Model: opts.Model,
Prompt: "",
Images: []ImageData{},
}
if err := generate(cmd, loadOpts); err != nil {
return err
}
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
fmt.Fprintln(os.Stderr, "")
}
usageSet := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
fmt.Fprintln(os.Stderr, " /set history Enable history")
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
fmt.Fprintln(os.Stderr, "")
}
usageShow := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /show license Show model license")
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
fmt.Fprintln(os.Stderr, " /show system Show system message")
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
fmt.Fprintln(os.Stderr, "")
}
// only list out the most common parameters
usageParameters := func() {
fmt.Fprintln(os.Stderr, "Available Parameters:")
fmt.Fprintln(os.Stderr, " /set parameter seed <int> Random number seed")
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
fmt.Fprintln(os.Stderr, "")
}
scanner, err := readline.New(readline.Prompt{
Prompt: ">>> ",
AltPrompt: "... ",
Placeholder: "Send a message (/? for help)",
AltPlaceholder: `Use """ to end multi-line input`,
})
if err != nil {
return err
}
fmt.Print(readline.StartBracketedPaste)
defer fmt.Printf(readline.EndBracketedPaste)
var multiline MultilineState
var prompt string
for {
line, err := scanner.Readline()
switch {
case errors.Is(err, io.EOF):
fmt.Println()
return nil
case errors.Is(err, readline.ErrInterrupt):
if line == "" {
fmt.Println("\nUse Ctrl-D or /bye to exit.")
}
scanner.Prompt.UseAlt = false
prompt = ""
continue
case err != nil:
return err
}
switch {
case strings.HasPrefix(prompt, `"""`):
// if the prompt so far starts with """ then we're in multiline mode
// and we need to keep reading until we find a line that ends with """
cut, found := strings.CutSuffix(line, `"""`)
prompt += cut
if !found {
prompt += "\n"
continue
}
prompt = strings.TrimPrefix(prompt, `"""`)
scanner.Prompt.UseAlt = false
switch multiline {
case MultilineSystem:
opts.System = prompt
prompt = ""
fmt.Println("Set system message.")
case MultilineTemplate:
opts.Template = prompt
prompt = ""
fmt.Println("Set prompt template.")
}
multiline = MultilineNone
case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
scanner.Prompt.UseAlt = true
multiline = MultilinePrompt
prompt += line + "\n"
continue
case scanner.Pasting:
prompt += line + "\n"
continue
case strings.HasPrefix(line, "/list"):
args := strings.Fields(line)
if err := ListHandler(cmd, args[1:]); err != nil {
return err
}
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "history":
scanner.HistoryEnable()
case "nohistory":
scanner.HistoryDisable()
case "wordwrap":
opts.WordWrap = true
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
opts.WordWrap = false
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
cmd.Flags().Set("verbose", "true")
fmt.Println("Set 'verbose' mode.")
case "quiet":
cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
} else {
opts.Format = args[2]
fmt.Printf("Set format to '%s' mode.\n", args[2])
}
case "noformat":
opts.Format = ""
fmt.Println("Disabled format.")
case "parameter":
if len(args) < 4 {
usageParameters()
continue
}
var params []string
for _, p := range args[3:] {
params = append(params, p)
}
fp, err := api.FormatParams(map[string][]string{args[2]: params})
if err != nil {
fmt.Printf("Couldn't set parameter: %q\n\n", err)
continue
}
fmt.Printf("Set parameter '%s' to '%s'\n\n", args[2], strings.Join(params, ", "))
opts.Options[args[2]] = fp[args[2]]
case "system", "template":
if len(args) < 3 {
usageSet()
continue
}
line := strings.Join(args[2:], " ")
line = strings.TrimPrefix(line, `"""`)
if strings.HasPrefix(args[2], `"""`) {
cut, found := strings.CutSuffix(line, `"""`)
prompt += cut
if found {
if args[1] == "system" {
opts.System = prompt
fmt.Println("Set system message.")
} else {
opts.Template = prompt
fmt.Println("Set prompt template.")
}
prompt = ""
} else {
prompt = `"""` + prompt + "\n"
if args[1] == "system" {
multiline = MultilineSystem
} else {
multiline = MultilineTemplate
}
scanner.Prompt.UseAlt = true
}
} else {
opts.System = line
fmt.Println("Set system message.")
}
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
}
} else {
usageSet()
}
case strings.HasPrefix(line, "/show"):
args := strings.Fields(line)
if len(args) > 1 {
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
resp, err := client.Show(cmd.Context(), &api.ShowRequest{Name: opts.Model})
if err != nil {
fmt.Println("error: couldn't get model")
return err
}
switch args[1] {
case "license":
if resp.License == "" {
fmt.Print("No license was specified for this model.\n\n")
} else {
fmt.Println(resp.License)
}
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
if resp.Parameters == "" {
fmt.Print("No parameters were specified for this model.\n\n")
} else {
if len(opts.Options) > 0 {
fmt.Println("User defined parameters:")
for k, v := range opts.Options {
fmt.Printf("%-*s %v\n", 30, k, v)
}
fmt.Println()
}
fmt.Println("Model defined parameters:")
fmt.Println(resp.Parameters)
}
case "system":
switch {
case opts.System != "":
fmt.Println(opts.System + "\n")
case resp.System != "":
fmt.Println(resp.System + "\n")
default:
fmt.Print("No system message was specified for this model.\n\n")
}
case "template":
switch {
case opts.Template != "":
fmt.Println(opts.Template + "\n")
case resp.Template != "":
fmt.Println(resp.Template)
default:
fmt.Print("No prompt template was specified for this model.\n\n")
}
default:
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
}
} else {
usageShow()
}
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "set", "/set":
usageSet()
case "show", "/show":
usageShow()
}
} else {
usage()
}
case line == "/exit", line == "/bye":
return nil
case strings.HasPrefix(line, "/"):
args := strings.Fields(line)
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
continue
default:
prompt += line
}
if len(prompt) > 0 && multiline == MultilineNone {
opts.Prompt = prompt
if multiModal {
newPrompt, images, err := extractFileNames(prompt)
if err != nil {
return err
}
opts.Prompt = newPrompt
// reset the context if we find another image
if len(images) > 0 {
opts.Images = images
ctx := cmd.Context()
ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
cmd.SetContext(ctx)
}
if len(opts.Images) == 0 {
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
fmt.Println()
prompt = ""
continue
}
}
if err := generate(cmd, opts); err != nil {
return err
}
prompt = ""
}
}
}
func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements
replacements := map[string]string{
"\\ ": " ", // Escaped space
"\\(": "(", // Escaped left parenthesis
"\\)": ")", // Escaped right parenthesis
"\\[": "[", // Escaped left square bracket
"\\]": "]", // Escaped right square bracket
"\\{": "{", // Escaped left curly brace
"\\}": "}", // Escaped right curly brace
"\\$": "$", // Escaped dollar sign
"\\&": "&", // Escaped ampersand
"\\;": ";", // Escaped semicolon
"\\'": "'", // Escaped single quote
"\\\\": "\\", // Escaped backslash
"\\*": "*", // Escaped asterisk
"\\?": "?", // Escaped question mark
}
for escaped, actual := range replacements {
fp = strings.ReplaceAll(fp, escaped, actual)
}
return fp
}
func extractFileNames(input string) (string, []ImageData, error) {
// Regex to match file paths starting with / or ./ and include escaped spaces (\ or %20)
// and followed by more characters and a file extension
regexPattern := `(?:\./|/)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
re := regexp.MustCompile(regexPattern)
filePaths := re.FindAllString(input, -1)
var imgs []ImageData
for _, fp := range filePaths {
nfp := normalizeFilePath(fp)
data, err := getImageData(nfp)
if err != nil {
if os.IsNotExist(err) {
continue
}
fmt.Printf("Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Printf("Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}
return input, imgs, nil
}
func RunServer(cmd *cobra.Command, _ []string) error {
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
if err != nil {
@@ -1035,56 +687,7 @@ func RunServer(cmd *cobra.Command, _ []string) error {
return err
}
var origins []string
if o := os.Getenv("OLLAMA_ORIGINS"); o != "" {
origins = strings.Split(o, ",")
}
return server.Serve(ln, origins)
}
func getImageData(filePath string) ([]byte, error) {
file, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer file.Close()
buf := make([]byte, 512)
_, err = file.Read(buf)
if err != nil {
return nil, err
}
contentType := http.DetectContentType(buf)
allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
if !slices.Contains(allowedTypes, contentType) {
return nil, fmt.Errorf("invalid image type: %s", contentType)
}
info, err := file.Stat()
if err != nil {
return nil, err
}
// Check if the file size exceeds 100MB
var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
if info.Size() > maxSize {
return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
}
buf = make([]byte, info.Size())
_, err = file.Seek(0, 0)
if err != nil {
return nil, err
}
_, err = io.ReadFull(file, buf)
if err != nil {
return nil, err
}
return buf, nil
return server.Serve(ln)
}
func initializeKeypair() error {

555
cmd/interactive.go Normal file
View File

@@ -0,0 +1,555 @@
package cmd
import (
"errors"
"fmt"
"io"
"net/http"
"os"
"regexp"
"strings"
"github.com/spf13/cobra"
"golang.org/x/exp/slices"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/readline"
)
type MultilineState int
const (
MultilineNone MultilineState = iota
MultilinePrompt
MultilineSystem
MultilineTemplate
)
func modelIsMultiModal(cmd *cobra.Command, name string) bool {
// get model details
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return false
}
req := api.ShowRequest{Name: name}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return false
}
return slices.Contains(resp.Details.Families, "clip")
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
multiModal := modelIsMultiModal(cmd, opts.Model)
// load the model
loadOpts := runOptions{
Model: opts.Model,
Prompt: "",
Messages: []api.Message{},
}
if _, err := chat(cmd, loadOpts); err != nil {
return err
}
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
fmt.Fprintln(os.Stderr, "")
}
usageSet := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
fmt.Fprintln(os.Stderr, " /set history Enable history")
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
fmt.Fprintln(os.Stderr, "")
}
usageShortcuts := func() {
fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
fmt.Fprintln(os.Stderr, " Ctrl + a Move to the beginning of the line (Home)")
fmt.Fprintln(os.Stderr, " Ctrl + e Move to the end of the line (End)")
fmt.Fprintln(os.Stderr, " Alt + b Move back (left) one word")
fmt.Fprintln(os.Stderr, " Alt + f Move forward (right) one word")
fmt.Fprintln(os.Stderr, " Ctrl + k Delete the sentence after the cursor")
fmt.Fprintln(os.Stderr, " Ctrl + u Delete the sentence before the cursor")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, " Ctrl + l Clear the screen")
fmt.Fprintln(os.Stderr, " Ctrl + c Stop the model from responding")
fmt.Fprintln(os.Stderr, " Ctrl + d Exit ollama (/bye)")
fmt.Fprintln(os.Stderr, "")
}
usageShow := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /show info Show details for this model")
fmt.Fprintln(os.Stderr, " /show license Show model license")
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
fmt.Fprintln(os.Stderr, " /show system Show system message")
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
fmt.Fprintln(os.Stderr, "")
}
// only list out the most common parameters
usageParameters := func() {
fmt.Fprintln(os.Stderr, "Available Parameters:")
fmt.Fprintln(os.Stderr, " /set parameter seed <int> Random number seed")
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
fmt.Fprintln(os.Stderr, "")
}
scanner, err := readline.New(readline.Prompt{
Prompt: ">>> ",
AltPrompt: "... ",
Placeholder: "Send a message (/? for help)",
AltPlaceholder: `Use """ to end multi-line input`,
})
if err != nil {
return err
}
fmt.Print(readline.StartBracketedPaste)
defer fmt.Printf(readline.EndBracketedPaste)
var sb strings.Builder
var multiline MultilineState
opts.Messages = make([]api.Message, 0)
for {
line, err := scanner.Readline()
switch {
case errors.Is(err, io.EOF):
fmt.Println()
return nil
case errors.Is(err, readline.ErrInterrupt):
if line == "" {
fmt.Println("\nUse Ctrl + d or /bye to exit.")
}
scanner.Prompt.UseAlt = false
sb.Reset()
continue
case err != nil:
return err
}
switch {
case multiline != MultilineNone:
// check if there's a multiline terminating string
before, ok := strings.CutSuffix(line, `"""`)
sb.WriteString(before)
if !ok {
fmt.Fprintln(&sb)
continue
}
switch multiline {
case MultilineSystem:
opts.System = sb.String()
fmt.Println("Set system message.")
sb.Reset()
case MultilineTemplate:
opts.Template = sb.String()
fmt.Println("Set prompt template.")
sb.Reset()
}
multiline = MultilineNone
scanner.Prompt.UseAlt = false
case strings.HasPrefix(line, `"""`):
line := strings.TrimPrefix(line, `"""`)
line, ok := strings.CutSuffix(line, `"""`)
sb.WriteString(line)
if !ok {
// no multiline terminating string; need more input
fmt.Fprintln(&sb)
multiline = MultilinePrompt
scanner.Prompt.UseAlt = true
break
}
case scanner.Pasting:
fmt.Fprintln(&sb, line)
continue
case strings.HasPrefix(line, "/list"):
args := strings.Fields(line)
if err := ListHandler(cmd, args[1:]); err != nil {
return err
}
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "history":
scanner.HistoryEnable()
case "nohistory":
scanner.HistoryDisable()
case "wordwrap":
opts.WordWrap = true
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
opts.WordWrap = false
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
cmd.Flags().Set("verbose", "true")
fmt.Println("Set 'verbose' mode.")
case "quiet":
cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
} else {
opts.Format = args[2]
fmt.Printf("Set format to '%s' mode.\n", args[2])
}
case "noformat":
opts.Format = ""
fmt.Println("Disabled format.")
case "parameter":
if len(args) < 4 {
usageParameters()
continue
}
params := args[3:]
fp, err := api.FormatParams(map[string][]string{args[2]: params})
if err != nil {
fmt.Printf("Couldn't set parameter: %q\n", err)
continue
}
fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", "))
opts.Options[args[2]] = fp[args[2]]
case "system", "template":
if len(args) < 3 {
usageSet()
continue
}
if args[1] == "system" {
multiline = MultilineSystem
} else if args[1] == "template" {
multiline = MultilineTemplate
}
line := strings.Join(args[2:], " ")
line, ok := strings.CutPrefix(line, `"""`)
if !ok {
multiline = MultilineNone
} else {
// only cut suffix if the line is multiline
line, ok = strings.CutSuffix(line, `"""`)
if ok {
multiline = MultilineNone
}
}
sb.WriteString(line)
if multiline != MultilineNone {
scanner.Prompt.UseAlt = true
continue
}
if args[1] == "system" {
opts.System = sb.String()
fmt.Println("Set system message.")
} else if args[1] == "template" {
opts.Template = sb.String()
fmt.Println("Set prompt template.")
}
sb.Reset()
continue
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
}
} else {
usageSet()
}
case strings.HasPrefix(line, "/show"):
args := strings.Fields(line)
if len(args) > 1 {
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
req := &api.ShowRequest{
Name: opts.Model,
System: opts.System,
Template: opts.Template,
Options: opts.Options,
}
resp, err := client.Show(cmd.Context(), req)
if err != nil {
fmt.Println("error: couldn't get model")
return err
}
switch args[1] {
case "info":
fmt.Println("Model details:")
if len(resp.Details.Families) > 0 {
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
} else if resp.Details.Family != "" {
fmt.Printf("Family %s\n", resp.Details.Family)
}
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
fmt.Println("")
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.")
} else {
fmt.Println(resp.License)
}
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
if resp.Parameters == "" {
fmt.Println("No parameters were specified for this model.")
} else {
if len(opts.Options) > 0 {
fmt.Println("User defined parameters:")
for k, v := range opts.Options {
fmt.Printf("%-*s %v\n", 30, k, v)
}
fmt.Println()
}
fmt.Println("Model defined parameters:")
fmt.Println(resp.Parameters)
}
case "system":
switch {
case opts.System != "":
fmt.Println(opts.System + "\n")
case resp.System != "":
fmt.Println(resp.System + "\n")
default:
fmt.Println("No system message was specified for this model.")
}
case "template":
switch {
case opts.Template != "":
fmt.Println(opts.Template + "\n")
case resp.Template != "":
fmt.Println(resp.Template)
default:
fmt.Println("No prompt template was specified for this model.")
}
default:
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
}
} else {
usageShow()
}
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "set", "/set":
usageSet()
case "show", "/show":
usageShow()
case "shortcut", "shortcuts":
usageShortcuts()
}
} else {
usage()
}
case line == "/exit", line == "/bye":
return nil
case strings.HasPrefix(line, "/"):
args := strings.Fields(line)
isFile := false
if multiModal {
for _, f := range extractFileNames(line) {
if strings.HasPrefix(f, args[0]) {
isFile = true
break
}
}
}
if !isFile {
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
continue
}
sb.WriteString(line)
default:
sb.WriteString(line)
}
if sb.Len() > 0 && multiline == MultilineNone {
newMessage := api.Message{Role: "user", Content: sb.String()}
if multiModal {
msg, images, err := extractFileData(sb.String())
if err != nil {
return err
}
newMessage.Content = msg
// reset the context if we find another image
if len(images) > 0 {
newMessage.Images = append(newMessage.Images, images...)
// reset the context for the new image
opts.Messages = []api.Message{}
} else {
if len(opts.Messages) > 1 {
newMessage.Images = append(newMessage.Images, opts.Messages[len(opts.Messages)-2].Images...)
}
}
if len(newMessage.Images) == 0 {
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
fmt.Println()
sb.Reset()
continue
}
}
if opts.System != "" {
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
}
opts.Messages = append(opts.Messages, newMessage)
assistant, err := chat(cmd, opts)
if err != nil {
return err
}
if assistant != nil {
opts.Messages = append(opts.Messages, *assistant)
}
sb.Reset()
}
}
}
func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements
replacements := map[string]string{
"\\ ": " ", // Escaped space
"\\(": "(", // Escaped left parenthesis
"\\)": ")", // Escaped right parenthesis
"\\[": "[", // Escaped left square bracket
"\\]": "]", // Escaped right square bracket
"\\{": "{", // Escaped left curly brace
"\\}": "}", // Escaped right curly brace
"\\$": "$", // Escaped dollar sign
"\\&": "&", // Escaped ampersand
"\\;": ";", // Escaped semicolon
"\\'": "'", // Escaped single quote
"\\\\": "\\", // Escaped backslash
"\\*": "*", // Escaped asterisk
"\\?": "?", // Escaped question mark
}
for escaped, actual := range replacements {
fp = strings.ReplaceAll(fp, escaped, actual)
}
return fp
}
func extractFileNames(input string) []string {
// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
// and followed by more characters and a file extension
// This will capture non filename strings, but we'll check for file existence to remove mismatches
regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
re := regexp.MustCompile(regexPattern)
return re.FindAllString(input, -1)
}
func extractFileData(input string) (string, []api.ImageData, error) {
filePaths := extractFileNames(input)
var imgs []api.ImageData
for _, fp := range filePaths {
nfp := normalizeFilePath(fp)
data, err := getImageData(nfp)
if err != nil {
if os.IsNotExist(err) {
continue
}
fmt.Printf("Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Printf("Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}
return input, imgs, nil
}
func getImageData(filePath string) ([]byte, error) {
file, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer file.Close()
buf := make([]byte, 512)
_, err = file.Read(buf)
if err != nil {
return nil, err
}
contentType := http.DetectContentType(buf)
allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
if !slices.Contains(allowedTypes, contentType) {
return nil, fmt.Errorf("invalid image type: %s", contentType)
}
info, err := file.Stat()
if err != nil {
return nil, err
}
// Check if the file size exceeds 100MB
var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
if info.Size() > maxSize {
return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
}
buf = make([]byte, info.Size())
_, err = file.Seek(0, 0)
if err != nil {
return nil, err
}
_, err = io.ReadFull(file, buf)
if err != nil {
return nil, err
}
return buf, nil
}

51
cmd/interactive_test.go Normal file
View File

@@ -0,0 +1,51 @@
package cmd
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestExtractFilenames(t *testing.T) {
// Unix style paths
input := ` some preamble
./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2
/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg`
res := extractFileNames(input)
assert.Len(t, res, 5)
assert.Contains(t, res[0], "one.png")
assert.Contains(t, res[1], "two.jpg")
assert.Contains(t, res[2], "three.jpeg")
assert.Contains(t, res[3], "four.png")
assert.Contains(t, res[4], "five.svg")
assert.NotContains(t, res[4], '"')
assert.NotContains(t, res, "inbtween")
// Windows style paths
input = ` some preamble
c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2
/absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6
d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending
`
res = extractFileNames(input)
assert.Len(t, res, 10)
assert.NotContains(t, res, "inbtween")
assert.Contains(t, res[0], "one.png")
assert.Contains(t, res[0], "c:")
assert.Contains(t, res[1], "two.jpg")
assert.Contains(t, res[1], "c:")
assert.Contains(t, res[2], "three.jpeg")
assert.Contains(t, res[3], "four.png")
assert.Contains(t, res[4], "five.svg")
assert.Contains(t, res[5], "six.png")
assert.Contains(t, res[6], "seven.svg")
assert.Contains(t, res[6], "d:")
assert.Contains(t, res[7], "eight.png")
assert.Contains(t, res[7], "c:")
assert.Contains(t, res[8], "nine.png")
assert.Contains(t, res[8], "d:")
assert.Contains(t, res[9], "ten.svg")
assert.Contains(t, res[9], "E:")
}

View File

@@ -1,6 +1,25 @@
# Documentation
- [Modelfile](./modelfile.md)
- [How to develop Ollama](./development.md)
- [API](./api.md)
- [Tutorials](./tutorials.md)
To get started, see the project's **[quickstart](../README.md#quickstart)**.
Ollama is a tool for running AI models on your hardware. Many users will choose to use the Command Line Interface (CLI) to work with Ollama. Learn more about all the commands in the CLI in the **[Main Readme](../README.md)**.
Use the RESTful API using any language, including Python, JavaScript, Typescript, Go, Rust, and many more. Learn more about using the API in the **[API Documentation](./api.md)**.
Create new models or modify models already in the library using the Modelfile. Learn more about the Modelfile syntax in the **[Modelfile Documentation](./modelfile.md)**.
Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.
Installing on Linux in most cases is easy using the script on Ollama.ai. To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.
If encountering a problem with Ollama, the best place to start is the logs. Find more information about them here in the **[Troubleshooting Guide](./troubleshooting.md)**.
Finally for all the questions that don't fit anywhere else, there is the **[FAQ](./faq.md)**
[Tutorials](./tutorials.md) apply the documentation to tasks.
For working code examples of using Ollama, see [Examples](../examples).

View File

@@ -17,7 +17,7 @@
### Model names
Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
### Durations
@@ -25,7 +25,7 @@ All durations are returned in nanoseconds.
### Streaming responses
Certain endpoints stream responses as JSON objects.
Certain endpoints stream responses as JSON objects and can optional return non-streamed responses.
## Generate a completion
@@ -39,27 +39,29 @@ Generate a response for a given prompt with a provided model. This is a streamin
- `model`: (required) the [model name](#model-names)
- `prompt`: the prompt to generate a response for
- `images`: a list of base64-encoded images (for multimodal models such as `llava`)
- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
Advanced parameters (optional):
- `format`: the format to return a response in. Currently the only accepted value is `json`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system message to (overrides what is defined in the `Modelfile`)
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.
### JSON mode
#### JSON mode
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as valid JSON. See the JSON mode [example](#request-json-mode) below.
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#generate-request-json-mode) below.
> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.
### Examples
#### Request
#### Generate request (Streaming)
##### Request
```shell
curl http://localhost:11434/api/generate -d '{
@@ -68,7 +70,7 @@ curl http://localhost:11434/api/generate -d '{
}'
```
#### Response
##### Response
A stream of JSON objects is returned:
@@ -85,8 +87,6 @@ The final response in the stream also includes additional data about the generat
- `total_duration`: time spent generating the response
- `load_duration`: time spent in nanoseconds loading the model
- `sample_count`: number of samples generated
- `sample_duration`: time spent generating samples
- `prompt_eval_count`: number of tokens in the prompt
- `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
- `eval_count`: number of tokens the response
@@ -101,22 +101,22 @@ To calculate how fast the response is generated in tokens per second (token/s),
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "",
"context": [1, 2, 3],
"done": true,
"total_duration": 5589157167,
"load_duration": 3013701500,
"sample_count": 114,
"sample_duration": 81442000,
"prompt_eval_count": 46,
"prompt_eval_duration": 1160282000,
"eval_count": 113,
"eval_duration": 1325948000
"context": [1, 2, 3],
"total_duration": 10706818083,
"load_duration": 6338219291,
"prompt_eval_count": 26,
"prompt_eval_duration": 130079000,
"eval_count": 259,
"eval_duration": 4232710000
}
```
#### Request (No streaming)
A response can be recieved in one reply when streaming is off.
##### Request
A response can be received in one reply when streaming is off.
```shell
curl http://localhost:11434/api/generate -d '{
@@ -126,7 +126,7 @@ curl http://localhost:11434/api/generate -d '{
}'
```
#### Response
##### Response
If `stream` is set to `false`, the response will be a single JSON object:
@@ -135,83 +135,23 @@ If `stream` is set to `false`, the response will be a single JSON object:
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"context": [1, 2, 3],
"done": true,
"total_duration": 5589157167,
"load_duration": 3013701500,
"sample_count": 114,
"sample_duration": 81442000,
"prompt_eval_count": 46,
"prompt_eval_duration": 1160282000,
"eval_count": 13,
"eval_duration": 1325948000
}
```
#### Request (with images)
To submit images to multimodal models such as `llava` or `bakllava`, provide a list of base64-encoded `images`:
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llava",
"prompt":"What is in this picture?",
"stream": false,
"images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
}'
```
#### Response
```
{
"model": "llava",
"created_at": "2023-11-03T15:36:02.583064Z",
"response": "A happy cartoon character, which is cute and cheerful.",
"context": [1, 2, 3],
"done": true,
"total_duration": 14648695333,
"load_duration": 3302671417,
"prompt_eval_count": 14,
"prompt_eval_duration": 286243000,
"eval_count": 129,
"eval_duration": 10931424000
}
```
#### Request (Raw Mode)
In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting.
```shell
curl http://localhost:11434/api/generate -d '{
"model": "mistral",
"prompt": "[INST] why is the sky blue? [/INST]",
"raw": true,
"stream": false
}'
```
#### Response
```json
{
"model": "mistral",
"created_at": "2023-11-03T15:36:02.583064Z",
"response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
"context": [1, 2, 3],
"done": true,
"total_duration": 14648695333,
"load_duration": 3302671417,
"prompt_eval_count": 14,
"prompt_eval_duration": 286243000,
"eval_count": 129,
"eval_duration": 10931424000
"total_duration": 5043500667,
"load_duration": 5025959,
"prompt_eval_count": 26,
"prompt_eval_duration": 325953000,
"eval_count": 290,
"eval_duration": 4709213000
}
```
#### Request (JSON mode)
> When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.
##### Request
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama2",
@@ -221,7 +161,7 @@ curl http://localhost:11434/api/generate -d '{
}'
```
#### Response
##### Response
```json
{
@@ -229,12 +169,13 @@ curl http://localhost:11434/api/generate -d '{
"created_at": "2023-11-09T21:07:55.186497Z",
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
"done": true,
"total_duration": 4661289125,
"load_duration": 1714434500,
"context": [1, 2, 3],
"total_duration": 4648158584,
"load_duration": 4071084,
"prompt_eval_count": 36,
"prompt_eval_duration": 264132000,
"eval_count": 75,
"eval_duration": 2112149000
"prompt_eval_duration": 439038000,
"eval_count": 180,
"eval_duration": 4196918000
}
```
@@ -257,10 +198,77 @@ The value of `response` will be a string containing JSON similar to:
}
```
#### Request (With options)
#### Request (with images)
To submit images to multimodal models such as `llava` or `bakllava`, provide a list of base64-encoded `images`:
#### Request
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llava",
"prompt":"What is in this picture?",
"stream": false,
"images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
}'
```
#### Response
```
{
"model": "llava",
"created_at": "2023-11-03T15:36:02.583064Z",
"response": "A happy cartoon character, which is cute and cheerful.",
"done": true,
"context": [1, 2, 3],
"total_duration": 2938432250,
"load_duration": 2559292,
"prompt_eval_count": 1,
"prompt_eval_duration": 2195557000,
"eval_count": 44,
"eval_duration": 736432000
}
```
#### Request (Raw Mode)
In some cases, you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable templating. Also note that raw mode will not return a context.
##### Request
```shell
curl http://localhost:11434/api/generate -d '{
"model": "mistral",
"prompt": "[INST] why is the sky blue? [/INST]",
"raw": true,
"stream": false
}'
```
##### Response
```json
{
"model": "mistral",
"created_at": "2023-11-03T15:36:02.583064Z",
"response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
"done": true,
"total_duration": 8493852375,
"load_duration": 6589624375,
"prompt_eval_count": 14,
"prompt_eval_duration": 119039000,
"eval_count": 110,
"eval_duration": 1779061000
}
```
#### Generate request (With options)
If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override.
##### Request
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama2",
@@ -292,7 +300,6 @@ curl http://localhost:11434/api/generate -d '{
"main_gpu": 0,
"low_vram": false,
"f16_kv": true,
"logits_all": false,
"vocab_only": false,
"use_mmap": true,
"use_mlock": false,
@@ -304,7 +311,7 @@ curl http://localhost:11434/api/generate -d '{
}'
```
#### Response
##### Response
```json
{
@@ -312,14 +319,38 @@ curl http://localhost:11434/api/generate -d '{
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
"total_duration": 5589157167,
"load_duration": 3013701500,
"sample_count": 114,
"sample_duration": 81442000,
"prompt_eval_count": 46,
"prompt_eval_duration": 1160282000,
"eval_count": 13,
"eval_duration": 1325948000
"context": [1, 2, 3],
"total_duration": 4935886791,
"load_duration": 534986708,
"prompt_eval_count": 26,
"prompt_eval_duration": 107345000,
"eval_count": 237,
"eval_duration": 4289432000
}
```
#### Load a model
If an empty prompt is provided, the model will be loaded into memory.
##### Request
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama2"
}'
```
##### Response
A single JSON object is returned:
```json
{
"model": "llama2",
"created_at": "2023-12-18T19:52:07.071755Z",
"response": "",
"done": true
}
```
@@ -329,7 +360,7 @@ curl http://localhost:11434/api/generate -d '{
POST /api/chat
```
Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. Streaming can be disabled using `"stream": false`. The final response object will include statistics and additional data from the request.
### Parameters
@@ -346,12 +377,14 @@ Advanced parameters (optional):
- `format`: the format to return a response in. Currently the only accepted value is `json`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
### Examples
#### Request
#### Chat Request (Streaming)
##### Request
Send a chat message with a streaming response.
@@ -367,7 +400,7 @@ curl http://localhost:11434/api/chat -d '{
}'
```
#### Response
##### Response
A stream of JSON objects is returned:
@@ -376,8 +409,9 @@ A stream of JSON objects is returned:
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assisant",
"content": "The"
"role": "assistant",
"content": "The",
"images": null
},
"done": false
}
@@ -390,20 +424,57 @@ Final response:
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 5589157167,
"load_duration": 3013701500,
"sample_count": 114,
"sample_duration": 81442000,
"prompt_eval_count": 46,
"prompt_eval_duration": 1160282000,
"eval_count": 113,
"eval_duration": 1325948000
"total_duration": 4883583458,
"load_duration": 1334875,
"prompt_eval_count": 26,
"prompt_eval_duration": 342546000,
"eval_count": 282,
"eval_duration": 4535599000
}
```
#### Request (With History)
#### Chat request (No streaming)
Send a chat message with a conversation history.
##### Request
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama2",
"messages": [
{
"role": "user",
"content": "why is the sky blue?"
}
],
"stream": false
}'
```
##### Response
```json
{
"model": "registry.ollama.ai/library/llama2:latest",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
"content": "Hello! How are you today?"
},
"done": true,
"total_duration": 5191566416,
"load_duration": 2154458,
"prompt_eval_count": 26,
"prompt_eval_duration": 383809000,
"eval_count": 298,
"eval_duration": 4799921000
}
```
#### Chat request (With History)
Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
##### Request
```shell
curl http://localhost:11434/api/chat -d '{
@@ -425,7 +496,7 @@ curl http://localhost:11434/api/chat -d '{
}'
```
#### Response
##### Response
A stream of JSON objects is returned:
@@ -434,7 +505,7 @@ A stream of JSON objects is returned:
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assisant",
"role": "assistant",
"content": "The"
},
"done": false
@@ -448,24 +519,24 @@ Final response:
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 5589157167,
"load_duration": 3013701500,
"sample_count": 114,
"sample_duration": 81442000,
"prompt_eval_count": 46,
"prompt_eval_duration": 1160282000,
"eval_count": 113,
"eval_duration": 1325948000
"total_duration": 8113331500,
"load_duration": 6396458,
"prompt_eval_count": 61,
"prompt_eval_duration": 398801000,
"eval_count": 468,
"eval_duration": 7701267000
}
```
#### Request (with images)
#### Chat request (with images)
##### Request
Send a chat message with a conversation history.
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama2",
"model": "llava",
"messages": [
{
"role": "user",
@@ -476,13 +547,34 @@ curl http://localhost:11434/api/chat -d '{
}'
```
##### Response
```json
{
"model": "llava",
"created_at": "2023-12-13T22:42:50.203334Z",
"message": {
"role": "assistant",
"content": " The image features a cute, little pig with an angry facial expression. It's wearing a heart on its shirt and is waving in the air. This scene appears to be part of a drawing or sketching project.",
"images": null
},
"done": true,
"total_duration": 1668506709,
"load_duration": 1986209,
"prompt_eval_count": 26,
"prompt_eval_duration": 359682000,
"eval_count": 83,
"eval_duration": 1303285000
}
```
## Create a Model
```shell
POST /api/create
```
Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `modelfile` to the content of the Modelfile rather than just set `path`. This is a requirement for remote create. Remote model creation should also create any file blobs, fields such as `FROM` and `ADAPTER`, explicitly with the server using [Create a Blob](#create-a-blob) and the value to the path indicated in the response.
Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `modelfile` to the content of the Modelfile rather than just set `path`. This is a requirement for remote create. Remote model creation must also create any file blobs, fields such as `FROM` and `ADAPTER`, explicitly with the server using [Create a Blob](#create-a-blob) and the value to the path indicated in the response.
### Parameters
@@ -493,7 +585,11 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m
### Examples
#### Request
#### Create a new model
Create a new model from a `Modelfile`.
##### Request
```shell
curl http://localhost:11434/api/create -d '{
@@ -502,14 +598,22 @@ curl http://localhost:11434/api/create -d '{
}'
```
#### Response
##### Response
A stream of JSON objects. When finished, `status` is `success`.
A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`.
```json
{
"status": "parsing modelfile"
}
{"status":"reading model metadata"}
{"status":"creating system layer"}
{"status":"using already created layer sha256:22f7f8ef5f4c791c1b03d7eb414399294764d7cc82c7e94aa81a1feb80a983a2"}
{"status":"using already created layer sha256:8c17c2ebb0ea011be9981cc3922db8ca8fa61e828c5d3f44cb6ae342bf80460b"}
{"status":"using already created layer sha256:7c23fb36d80141c4ab8cdbb61ee4790102ebd2bf7aeff414453177d4f2110e5d"}
{"status":"using already created layer sha256:2e0493f67d0c8c9c68a8aeacdf6a38a2151cb3c4c1d42accf296e19810527988"}
{"status":"using already created layer sha256:2759286baa875dc22de5394b4a925701b1896a7e3f8e53275c36f75a877a82c9"}
{"status":"writing layer sha256:df30045fe90f0d750db82a058109cecd6d4de9c90a3d75b19c09e5f64580bb42"}
{"status":"writing layer sha256:f18a68eb09bf925bb1b669490407c1b1251c5db98dc4d3d81f3088498ea55690"}
{"status":"writing manifest"}
{"status":"success"}
```
### Check if a Blob Exists
@@ -518,7 +622,7 @@ A stream of JSON objects. When finished, `status` is `success`.
HEAD /api/blobs/:digest
```
Check if a blob is known to the server.
Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai.
#### Query Parameters
@@ -542,7 +646,7 @@ Return 200 OK if the blob exists, 404 Not Found if it does not.
POST /api/blobs/:digest
```
Create a blob from a file. Returns the server file path.
Create a blob from a file on the server. Returns the server file path.
#### Query Parameters
@@ -558,7 +662,7 @@ curl -T model.bin -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf08
##### Response
Return 201 Created if the blob was successfully created.
Return 201 Created if the blob was successfully created, 400 Bad Request if the digest used is not expected.
## List Local Models
@@ -584,14 +688,30 @@ A single JSON object will be returned.
{
"models": [
{
"name": "llama2",
"modified_at": "2023-08-02T17:02:23.713454393-07:00",
"size": 3791730596
"name": "codellama:13b",
"modified_at": "2023-11-04T14:56:49.277302595-07:00",
"size": 7365960935,
"digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
"details": {
"format": "gguf",
"family": "llama",
"families": null,
"parameter_size": "13B",
"quantization_level": "Q4_0"
}
},
{
"name": "llama2:13b",
"modified_at": "2023-08-08T12:08:38.093596297-07:00",
"size": 7323310500
"name": "llama2:latest",
"modified_at": "2023-12-07T09:32:18.757212583-08:00",
"size": 3825819519,
"digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
"details": {
"format": "gguf",
"family": "llama",
"families": null,
"parameter_size": "7B",
"quantization_level": "Q4_0"
}
}
]
}
@@ -623,12 +743,12 @@ curl http://localhost:11434/api/show -d '{
```json
{
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM mike/llava:latest\nTEMPLATE \"\"\"\nUSER:{{ .Prompt }}\nASSISTANT:\n\"\"\"\nPARAMETER num_ctx 4096",
"parameters": "num_ctx 4096",
"template": "\nUSER:{{ .Prompt }}\nASSISTANT:\n",
"license:": "<license>",
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSSISTANT:\"",
"parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSSISTANT:",
"template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: ",
"details": {
"format": "gguf",
"family": "llama",
"families": ["llama", "clip"],
"parameter_size": "7B",
"quantization_level": "Q4_0"
@@ -657,7 +777,7 @@ curl http://localhost:11434/api/copy -d '{
#### Response
The only response is a 200 OK if successful.
Returns a 200 OK if successful, or a 404 Not Found if the source model doesn't exist.
## Delete a Model
@@ -683,7 +803,7 @@ curl -X DELETE http://localhost:11434/api/delete -d '{
#### Response
If successful, the only response is a 200 OK.
Returns a 200 OK if successful, 404 Not Found if the model to be deleted doesn't exist.
## Pull a Model

View File

@@ -1,20 +1,26 @@
# Development
- Install cmake or (optionally, required tools for GPUs)
- run `go generate ./...`
- run `go build .`
Install required tools:
- cmake version 3.24 or higher
- go version 1.20 or higher
- go version 1.21 or higher
- gcc version 11.4.0 or higher
```bash
brew install go cmake gcc
```
Get the required libraries:
Optionally enable debugging and more verbose logging:
```bash
# At build time
export CGO_CFLAGS="-g"
# At runtime
export OLLAMA_DEBUG=1
```
Get the required libraries and build the native LLM code:
```bash
go generate ./...
@@ -32,8 +38,99 @@ Now you can run `ollama`:
./ollama
```
## Building on Linux with GPU support
### Linux
- Install cmake and nvidia-cuda-toolkit
- run `go generate ./...`
- run `go build .`
#### Linux CUDA (NVIDIA)
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
development and runtime packages.
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler.
Then generate dependencies:
```
go generate ./...
```
Then build the binary:
```
go build .
```
#### Linux ROCm (AMD)
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) developement packages first, as well as `cmake` and `golang`.
Typically the build scripts will auto-detect ROCm, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `ROCM_PATH` to the location of the ROCm
install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
CLBlast install (typically `/usr/lib/cmake/CLBlast`).
```
go generate ./...
```
Then build the binary:
```
go build .
```
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
#### Advanced CPU Settings
By default, running `go generate ./...` will compile a few different variations
of the LLM library based on common CPU families and vector math capabilities,
including a lowest-common-denominator which should run on almost any 64 bit CPU
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
load. If you would like to build a CPU-based build customized for your
processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
go build .
```
#### Containerized Linux Build
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
### Windows
Note: The windows build for Ollama is still under development.
Install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- go version 1.21 or higher
- MinGW (pick one variant) with GCC.
- <https://www.mingw-w64.org/>
- <https://www.msys2.org/>
```powershell
$env:CGO_ENABLED="1"
go generate ./...
go build .
```
#### Windows CUDA (NVIDIA)
In addition to the common Windows development tools described above, install:
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)

View File

@@ -1,138 +1,88 @@
# FAQ
## How can I upgrade Ollama?
To upgrade Ollama, run the installation process again. On the Mac, click the Ollama icon in the menubar and choose the restart option if an update is available.
## How can I view the logs?
On macOS:
Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
```
cat ~/.ollama/logs/server.log
```
## How do I use Ollama server environment variables on Mac
On Linux:
On macOS, Ollama runs in the background and is managed by the menubar app. If adding environment variables, Ollama will need to be run manually.
```
journalctl -u ollama
```
1. Click the menubar icon for Ollama and choose **Quit Ollama**.
2. Open a new terminal window and run the following command (this example uses `OLLAMA_HOST` with an IP address of `123.1.1.1`):
If you're running `ollama serve` directly, the logs will be printed to the console.
```bash
OLLAMA_HOST=123.1.1.1 ollama serve
```
## How do I use Ollama server environment variables on Linux?
If Ollama is installed with the install script, a systemd service was created, running as the Ollama user. To add an environment variable, such as OLLAMA_HOST, follow these steps:
1. Create a `systemd` drop-in directory and add a config file. This is only needed once.
```bash
mkdir -p /etc/systemd/system/ollama.service.d
echo '[Service]' >>/etc/systemd/system/ollama.service.d/environment.conf
```
2. For each environment variable, add it to the config file:
```bash
echo 'Environment="OLLAMA_HOST=0.0.0.0:11434"' >>/etc/systemd/system/ollama.service.d/environment.conf
```
3. Reload `systemd` and restart Ollama:
```bash
systemctl daemon-reload
systemctl restart ollama
```
## How can I expose Ollama on my network?
Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable.
On macOS:
```bash
OLLAMA_HOST=0.0.0.0:11434 ollama serve
```
On Linux:
Create a `systemd` drop-in directory and set `Environment=OLLAMA_HOST`
```bash
mkdir -p /etc/systemd/system/ollama.service.d
echo '[Service]' >>/etc/systemd/system/ollama.service.d/environment.conf
```
```bash
echo 'Environment="OLLAMA_HOST=0.0.0.0:11434"' >>/etc/systemd/system/ollama.service.d/environment.conf
```
Reload `systemd` and restart Ollama:
```bash
systemctl daemon-reload
systemctl restart ollama
```
Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable. Refer to the section above for how to use environment variables on your platform.
## How can I allow additional web origins to access Ollama?
Ollama allows cross origin requests from `127.0.0.1` and `0.0.0.0` by default. Add additional origins with the `OLLAMA_ORIGINS` environment variable:
Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Add additional origins with the `OLLAMA_ORIGINS` environment variable. For example, to add all ports on 192.168.1.1 and https://example.com, use:
On macOS:
```bash
OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
```shell
OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com
```
On Linux:
```bash
echo 'Environment="OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com"' >>/etc/systemd/system/ollama.service.d/environment.conf
```
Reload `systemd` and restart Ollama:
```bash
systemctl daemon-reload
systemctl restart ollama
```
Refer to the section above for how to use environment variables on your platform.
## Where are models stored?
- macOS: Raw model data is stored under `~/.ollama/models`.
- Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`
- macOS: `~/.ollama/models`.
- Linux: `/usr/share/ollama/.ollama/models`
Below the models directory you will find a structure similar to the following:
## How do I set them to a different location?
```shell
.
├── blobs
└── manifests
└── registry.ollama.ai
├── f0rodo
├── library
├── mattw
└── saikatkumardey
```
There is a `manifests/registry.ollama.ai/namespace` path. In example above, the user has downloaded models from the official `library`, `f0rodo`, `mattw`, and `saikatkumardey` namespaces. Within each of those directories, you will find directories for each of the models downloaded. And in there you will find a file name representing each tag. Each tag file is the manifest for the model.
The manifest lists all the layers used in this model. You will see a `media type` for each layer, along with a digest. That digest corresponds with a file in the `models/blobs directory`.
### How can I change where Ollama stores models?
To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory. Refer to the section above for how to use environment variables on your platform.
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
No. Anything you do with Ollama, such as generate a response from the model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.
No, Ollama runs entirely locally, and conversation data will never leave your machine.
## How can I use Ollama in Visual Studio Code?
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. You can see the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.
## How do I use Ollama behind a proxy?
Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values.
When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate.
On macOS:
```bash
HTTPS_PROXY=http://proxy.example.com ollama serve
```
On Linux:
```bash
echo 'Environment="HTTPS_PROXY=https://proxy.example.com"' >>/etc/systemd/system/ollama.service.d/environment.conf
```
Reload `systemd` and restart Ollama:
```bash
systemctl daemon-reload
systemctl restart ollama
```
Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
### How do I use Ollama behind a proxy in Docker?
The Ollama Docker container image can be configured to use a proxy by passing `-e HTTPS_PROXY=https://proxy.example.com` when starting the container.
Alternatively, Docker daemon can be configured to use a proxy. Instructions are available for Docker Desktop on [macOS](https://docs.docker.com/desktop/settings/mac/#proxies), [Windows](https://docs.docker.com/desktop/settings/windows/#proxies), and [Linux](https://docs.docker.com/desktop/settings/linux/#proxies), and Docker [daemon with systemd](https://docs.docker.com/config/daemon/systemd/#httphttps-proxy).
Alternatively, the Docker daemon can be configured to use a proxy. Instructions are available for Docker Desktop on [macOS](https://docs.docker.com/desktop/settings/mac/#proxies), [Windows](https://docs.docker.com/desktop/settings/windows/#proxies), and [Linux](https://docs.docker.com/desktop/settings/linux/#proxies), and Docker [daemon with systemd](https://docs.docker.com/config/daemon/systemd/#httphttps-proxy).
Ensure the certificate is installed as a system certificate when using HTTPS. This may require a new Docker image when using a self-signed certificate.
@@ -154,3 +104,11 @@ docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-
The Ollama Docker container can be configured with GPU acceleration in Linux or Windows (with WSL2). This requires the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit). See [ollama/ollama](https://hub.docker.com/r/ollama/ollama) for more details.
GPU acceleration is not available for Docker Desktop in macOS due to the lack of GPU passthrough and emulation.
## Why is networking slow in WSL2 on Windows 10?
This can impact both installing Ollama, as well as downloading models.
Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernel (WSL)` adapter, right click and select `Properties`.
Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these
properties.

View File

@@ -72,7 +72,7 @@ docker run --rm -v .:/model ollama/quantize -q q4_0 /model
This will output two files into the directory:
- `f16.bin`: the model converted to GGUF
- `q4_0.bin` the model quantized to a 4-bit quantization (we will use this file to create the Ollama model)
- `q4_0.bin` the model quantized to a 4-bit quantization (Ollama will use this file to create the Ollama model)
### Step 3: Write a `Modelfile`
@@ -148,6 +148,7 @@ The quantization options are as follow (from highest highest to lowest levels of
- `q5_K_M`
- `q6_K`
- `q8_0`
- `f16`
## Manually converting & quantizing models

View File

@@ -109,8 +109,9 @@ Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr
sudo rm $(which ollama)
```
Remove the downloaded models and Ollama service user:
Remove the downloaded models and Ollama service user and group:
```bash
sudo rm -r /usr/share/ollama
sudo userdel ollama
sudo groupdel ollama
```

View File

@@ -1,6 +1,6 @@
# Ollama Model File
> Note: this `Modelfile` syntax is in development
> Note: `Modelfile` syntax is in development
A model file is the blueprint to create and share models with Ollama.
@@ -75,7 +75,7 @@ There are two ways to view `Modelfile`s underlying the models in [ollama.ai/libr
3. Scroll down to "Layers"
- Note: if the [`FROM` instruction](#from-required) is not present,
it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` like so:
- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
```bash
> ollama show --modelfile llama2:13b
@@ -156,11 +156,12 @@ PARAMETER <parameter> <parametervalue>
#### Template Variables
| Variable | Description |
| --------------- | ------------------------------------------------------------------------------------------------------------- |
| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
| Variable | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------- |
| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
| `{{ .Response }}` | The response from the LLM, if not specified response is appended to the end of the template. |
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
```modelfile
TEMPLATE """
@@ -206,7 +207,7 @@ LICENSE """
## Notes
- the **`Modelfile` is not case sensitive**. In the examples, we use uppercase for instructions to make it easier to distinguish it from arguments.
- Instructions can be in any order. In the examples, we start with FROM instruction to keep it easily readable.
- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.
- Instructions can be in any order. In the examples, the `FROM` instruction is first to keep it easily readable.
[1]: https://ollama.ai/library

53
docs/troubleshooting.md Normal file
View File

@@ -0,0 +1,53 @@
# How to troubleshoot issues
Sometimes Ollama may not perform as expected. One of the best ways to figure out what happened is to take a look at the logs. Find the logs on Mac by running the command:
```shell
cat ~/.ollama/logs/server.log
```
On Linux systems with systemd, the logs can be found with this command:
```shell
journalctl -u ollama
```
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
## LLM libraries
Ollama includes multiple LLM libraries compiled for different GPUs and CPU
vector features. Ollama tries to pick the best one based on the capabilities of
your system. If this autodetection has problems, or you run into other problems
(e.g. crashes in your GPU) you can workaround this by forcing a specific LLM
library. `cpu_avx2` will perform the best, followed by `cpu_avx` an the slowest
but most compatible is `cpu`. Rosetta emulation under MacOS will work with the
`cpu` library.
In the server log, you will see a message that looks something like this (varies
from release to release):
```
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
```
**Experimental LLM Library Override**
You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to bypass
autodetection, so for example, if you have a CUDA card, but want to force the
CPU LLM library with AVX2 vector support, use:
```
OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
```
You can see what features your CPU has with the following.
```
cat /proc/cpuinfo| grep flags | head -1
```
## Known issues
* N/A

3
examples/.gitignore vendored
View File

@@ -1,7 +1,10 @@
node_modules
bun.lockb
.vscode
# OSX
.DS_STORE
# Models
models/

View File

@@ -18,6 +18,8 @@ func main() {
os.Exit(1)
}
defer resp.Body.Close()
responseData, err := io.ReadAll(resp.Body)
if err != nil {
log.Fatal(err)

View File

@@ -1,15 +1,23 @@
# LangChain Web Summarization
This example summarizes a website
This example summarizes the website, [https://ollama.ai/blog/run-llama2-uncensored-locally](https://ollama.ai/blog/run-llama2-uncensored-locally)
## Setup
## Running the Example
```
pip install -r requirements.txt
```
1. Ensure you have the `llama2` model installed:
## Run
```bash
ollama pull llama2
```
```
python main.py
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python main.py
```

View File

@@ -1,2 +1 @@
langchain==0.0.259
bs4==0.0.1

View File

@@ -2,20 +2,23 @@
This example is a basic "hello world" of using LangChain with Ollama.
## Setup
## Running the Example
```
pip install -r requirements.txt
```
1. Ensure you have the `llama2` model installed:
## Run
```bash
ollama pull llama2
```
```
python main.py
```
2. Install the Python Requirements.
Running this example will print the response for "hello":
```bash
pip install -r requirements.txt
```
```
Hello! It's nice to meet you. hopefully you are having a great day! Is there something I can help you with or would you like to chat?
```
3. Run the example:
```bash
python main.py
```

View File

@@ -1,4 +1,6 @@
from langchain.llms import Ollama
input = input("What is your question?")
llm = Ollama(model="llama2")
res = llm.predict("hello")
res = llm.predict(input)
print (res)

View File

@@ -2,20 +2,22 @@
This example is a basic "hello world" of using LangChain with Ollama using Node.js and Typescript.
## Setup
## Running the Example
```shell
npm install
```
1. Install the prerequisites:
## Run
```bash
npm install
```
```shell
ts-node main.ts
```
2. Ensure the `mistral` model is available:
Running this example will print the response for "hello":
```bash
ollama pull mistral
```
```plaintext
Hello! It's nice to meet you. hopefully you are having a great day! Is there something I can help you with or would you like to chat?
```
3. Run the example:
```bash
npm start
```

View File

@@ -1,15 +1,25 @@
import { Ollama} from 'langchain/llms/ollama';
import { Ollama } from 'langchain/llms/ollama';
import * as readline from "readline";
async function main() {
const ollama = new Ollama({
model: 'mistral'
// other parameters can be found at https://js.langchain.com/docs/api/llms_ollama/classes/Ollama
})
const stream = await ollama.stream("Hello");
});
for await (const chunk of stream) {
process.stdout.write(chunk);
}
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
});
rl.question("What is your question: \n", async (user_input) => {
const stream = await ollama.stream(user_input);
for await (const chunk of stream) {
process.stdout.write(chunk);
}
rl.close();
})
}
main();

View File

@@ -1,5 +1,5 @@
{
"name": "with-langchain-typescript-simplegenerate",
"name": "langchain-typescript-simple",
"lockfileVersion": 3,
"requires": true,
"packages": {

View File

@@ -1,8 +1,13 @@
{
"scripts": {
"start": "tsx main.ts"
},
"devDependencies": {
"typescript": "^5.2.2"
"tsx": "^4.6.2",
"typescript": "^5.3.3"
},
"dependencies": {
"langchain": "^0.0.165"
"langchain": "^0.0.165",
"readline": "^1.3.0"
}
}

View File

@@ -1,7 +0,0 @@
# Modelfile for creating a list of ten tweets from a topic
# Run `ollama create 10tweets -f ./Modelfile` and then `ollama run 10tweets` and enter a topic
FROM llama2
SYSTEM """
You are a content marketer who needs to come up with 10 short but succinct tweets. The answer should be a list of ten tweets. Each tweet can have a maximum of 280 characters and should include hashtags. Each user input will be a subject and you should expand it in ten creative ways. Never stop after just one tweet. Always include ten.
"""

View File

@@ -1,23 +0,0 @@
# Ten Tweets Modelfile
This is a simple modelfile that generates ten tweets based off any topic.
```bash
ollama create tentweets
ollama run tentweets
>>> underwater basketweaving
Great! Here are ten creative tweets about underwater basketweaving:
1. "Just discovered the ultimate stress-reliever: Underwater basketweaving! 🌊🧵 #UnderwaterBasketweaving #StressRelief"
2. "Who needs meditation when you can do underwater basketweaving? 😴👀 #PeacefulDistraction #UnderwaterBasketweaving"
3. "Just spent an hour in the pool and still managed to knot my basket. Goal: untangle it before next session. 💪🏽 #ChallengeAccepted #UnderwaterBasketweaving"
4. "When life gives you lemons, make underwater basketweaving! 🍋🧵 #LemonadeLife #UnderwaterBasketweaving"
5. "Just realized my underwater basketweaving skills could come in handy during a zombie apocalypse. 😂🧡 #SurvivalTips #UnderwaterBasketweaving"
6. "I'm not lazy, I'm just conserving energy for my next underwater basketweaving session. 😴💤 #LazyDay #UnderwaterBasketweaving"
7. "Just found my inner peace while doing underwater basketweaving. It's like meditation, but with knots! 🙏🧵 #Mindfulness #UnderwaterBasketweaving"
8. "Why study for exams when you can do underwater basketweaving and forget all your worries? 😜🧵 #ProcrastinationStation #UnderwaterBasketweaving"
9. "Just had to cut my underwater basketweaving session short due to a sudden urge to breathe. 🤯🌊 #AquaticAdventures #UnderwaterBasketweaving"
10. "I'm not sure what's more impressive: my underwater basketweaving skills or the fact that I didn't drown trying to make this tweet. 😅🧵 #Accomplishment
#UnderwaterBasketweaving"
```

View File

@@ -1,8 +0,0 @@
# Modelfile for creating a devops engineer assistant
# Run `ollama create devops-engineer -f ./Modelfile` and then `ollama run devops-engineer` and enter a topic
FROM mistral
PARAMETER temperature 1
SYSTEM """
You are a senior devops engineer, acting as an assistant. You offer help with cloud technologies like: Terraform, AWS, kubernetes, python. You answer with code examples when possible
"""

View File

@@ -1,40 +0,0 @@
# DevOps Engineer Modelfile
This is a simple modelfile that takes will respond to any questions that a senior DevOps engineer should know how to solve.
````text
ollama create devopsengineer
ollama run devopsengineer
>>> how can i proxy all connections from inside the office through a single server that caches large files. Provide a config file for nginx to achieve this?
Here is an example configuration file for Nginx that can be used to proxy all connections from inside the office network to a centralized cache server:
```perl
http {
upstream cache_server {
server <cache_server_ip>;
server <cache_server_ip>;
server <cache_server_ip>;
}
server {
listen 80;
server_name localhost;
location / {
proxy_pass http://cache_server;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
}
}
```
In this configuration, we define an `upstream` block that lists the IP addresses of the cache servers. We then define a server block that listens on port 80 and routes all
traffic to the `/` location to the cache servers using the `proxy_pass` directive. The `proxy_set_header` directives are used to preserve the source IP address of the client
request when forwarding it to the cache server.
To use this configuration, you would need to replace the placeholder `<cache_server_ip>` with the actual IP addresses of your cache servers. You would also need to make sure
that the cache servers are configured to accept incoming connections from the Nginx server and handle requests for files.
````

View File

@@ -1,11 +0,0 @@
# Modelfile for creating a Midjourney prompts from a topic
# This prompt was adapted from the original at https://www.greataiprompts.com/guide/midjourney/best-chatgpt-prompt-for-midjourney/
# Run `ollama create mj -f ./Modelfile` and then `ollama run mj` and enter a topic
FROM zephyr
PARAMETER temperature 0.8
PARAMETER top_k 500
PARAMETER top_p 0.9
SYSTEM """
Embrace your role as a creative illustrator. Based on a concept provided, you must produce a single paragraph with a multifaceted description of an image, ensuring significant details of the concept and more is represented in your instructions. You do not need to write complete sentences but rather short concepts with the following information: the level of detail that should be represented, an artistic style and maybe a specific name of a painter or illustrator, the ideal color pallete, lighting, mood, perspective, the setting, time of day, weather, the season, the time period, location, materials, the textures, patterns, lines, brushstrokes, techniques, the medium, the genre, the rendering style. Don't include everything and keep the description length under 250 words.
"""

View File

@@ -1,11 +0,0 @@
# Midjourney Prompt Generator Modelfile
This simple modelfile will help create a prompt to feed to Midjourney.
```text
ollama create midjourney
ollama run midjourney
>>> a sports car in the mountains.
A sleek, high-performance automobile cuts through a serpentine mountain landscape. The concept is a classic illustration of speed and power, depicted in the style of pop art by Andy Warhol. The color palette is dominated by bold, primary hues of red, blue, and yellow, with striking accent colors of white, black, and metallic shades. The lighting is bright and focused, casting sharp shadows on the rugged terrain. A sense of excitement and anticipation permeates throughout the scene, as the car navigates a treacherous course through the winding road. The perspective is low, allowing for a full view of the vehicle's sleek lines and intricate details. The setting takes place in the afternoon during a sunny day in autumn, as evidenced by the vibrant foliage on the mountainside. The time period is modern, with nods to classic car design. The materials are primarily digital, allowing for smooth curves and sharp contrasts. The textures are sleek and polished, with meticulously detailed lines and brushstrokes that accentuate the car's aerodynamic design. The patterns consist of geometric shapes and bold stripes, adding to the car's dynamic appeal. The genre is modern realism, with a focus on precision and detail. The rendering style is highly technical, capturing the nuances and subtleties of the vehicle and its surroundings in breathtaking detail.
```

View File

@@ -1,6 +0,0 @@
# Modelfile for creating a recipe from a list of ingredients
# Run `ollama create recipemaker -f ./Modelfile` and then `ollama run recipemaker` and feed it lists of ingredients to create recipes around.
FROM nous-hermes
SYSTEM """
The instruction will be a list of ingredients. You should generate a recipe that can be made in less than an hour. You can also include ingredients that most people will find in their pantry every day. The recipe should be 4 people and you should include a description of what the meal will taste like
"""

View File

@@ -1,20 +0,0 @@
# Recipe Maker Modelfile
Simple modelfile to generate a recipe from a short list of ingredients.
```
ollama create recipemaker
ollama run recipemaker
>>> chilli pepper, white chocolate, kale
Ingredients:
- 1 small chili pepper
- 4 squares of white chocolate
- handful of kale leaves
Instructions:
1. In a blender or food processor, puree the chilies and white chocolate until smooth.
2. Add the chopped kale leaves to the blender and pulse until well combined.
3. Serve immediately as a dip for crackers or use it as an ingredient in your favorite recipe. The mixture of spicy chili pepper with sweet white chocolate and nutritious
kale will make your taste buds dance with delight!
```

View File

@@ -1,28 +0,0 @@
# Modelfile for creating a sentiment analyzer.
# Run `ollama create sentiments -f pathtofile` and then `ollama run sentiments` and enter a topic
FROM orca
TEMPLATE """
{{- if .First }}
### System:
{{ .System }}
{{- end }}
### User:
I hate it when my phone dies
### Response:
NEGATIVE
### User:
He is awesome
### Response:
POSITIVE
### User:
This is the link to the article
### Response:
NEUTRAL
### User:
{{ .Prompt }}
### Response:
"""
SYSTEM """You are a sentiment analyzer. You will receive text and output only one word, either POSITIVE or NEGATIVE or NEUTRAL, depending on the sentiment of the text."""

View File

@@ -1,25 +0,0 @@
# Sentiments Modelfile
This is a simple sentiments analyzer using the Orca model. When you pull Orca from the registry, it has a Template already defined that looks like this:
```Modelfile
{{- if .First }}
### System:
{{ .System }}
{{- end }}
### User:
{{ .Prompt }}
### Response:
```
If we just wanted to have the text:
```Plaintext
You are a sentiment analyzer. You will receive text and output only one word, either POSITIVE or NEGATIVE or NEUTRAL, depending on the sentiment of the text.
```
then we could have put this in a SYSTEM block. But we want to provide examples which require updating the full Template. Any Modelfile you create will inherit all the settings from the source model. But in this example, we are overriding the Template.
When providing examples for the input and output, you should include the way the model usually provides information. Since the Orca model expects a user prompt to appear after ### User: and the response is after ### Response, we should format our examples like that as well. If we were using the Llama 2 model, the format would be a bit different.

View File

@@ -1,7 +0,0 @@
# Modelfile for creating a tweet from a topic
# Run `ollama create tweetwriter -f ./Modelfile` and then `ollama run tweetwriter` and enter a topic
FROM nous-hermes
SYSTEM """
You are a content marketer who needs to come up with a short but succinct tweet. Make sure to include the appropriate hashtags and links. Sometimes when appropriate, describe a meme that can be included as well. All answers should be in the form of a tweet which has a max size of 280 characters. Every instruction will be the topic to create a tweet about.
"""

View File

@@ -0,0 +1,23 @@
# Example Modelfile - Tweetwriter
This simple examples shows what you can do without any code, simply relying on a Modelfile. The file has two instructions:
1. FROM - The From instructions defines the parent model to use for this one. If you choose a model from the library, you can enter just the model name. For all other models, you need to specify the namespace as well. You could also use a local file. Just include the relative path to the converted, quantized model weights file. To learn more about creating that file, see the `import.md` file in the docs folder of this repository.
2. SYSTEM - This defines the system prompt for the model and overrides the system prompt from the parent model.
## Running the Example
1. Create the model:
```bash
ollama create tweetwriter
```
2. Enter a topic to generate a tweet about.
3. Show the Modelfile in the REPL.
```bash
/show modelfile
```
Notice that the FROM and SYSTEM match what was in the file. But there is also a TEMPLATE and PARAMETER. These are inherited from the parent model.

View File

@@ -1,15 +1,31 @@
# DockerIt
DockerIt is a tool to help you build and run your application in a Docker container. It consists of a model that defines the system prompt and model weights to use, along with a python script to then build the container and run the image automatically.
DockerIt is a tool to help you build and run your application in a Docker container. It consists of a model that defines the system prompt and model weights to use, along with a python script to then build the container and run the image automatically.
## Running the Example
1. Ensure you have the `mattw/dockerit` model installed:
```bash
ollama pull mattw/dockerit
```
2. Make sure Docker is running on your machine.
3. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
4. Run the example:
```bash
python dockerit.py "simple postgres server with admin password set to 123"
```
5. Enter the name you would like to use for your container image.
## Caveats
This is an simple example. It's assuming the Dockerfile content generated is going to work. In many cases, even with simple web servers, it fails when trying to copy files that don't exist. It's simply an example of what you could possibly do.
## Example Usage
```bash
> python3 ./dockerit.py "simple postgres server with admin password set to 123"
Enter the name of the image: matttest
Container named happy_keller started with id: 7c201bb6c30f02b356ddbc8e2a5af9d7d7d7b8c228519c9a501d15c0bd9d6b3e
```
This is a simple example. It's assuming the Dockerfile content generated is going to work. In many cases, even with simple web servers, it fails when trying to copy files that don't exist. It's simply an example of what you could possibly do.

View File

@@ -4,6 +4,32 @@
There are two python scripts in this example. `randomaddresses.py` generates random addresses from different countries. `predefinedschema.py` sets a template for the model to fill in.
## Running the Example
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama2
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the Random Addresses example:
```bash
python randomaddresses.py
```
4. Run the Predefined Schema example:
```bash
python predefinedschema.py
```
## Review the Code
Both programs are basically the same, with a different prompt for each, demonstrating two different ideas. The key part of getting JSON out of a model is to state in the prompt or system prompt that it should respond using JSON, and specifying the `format` as `json` in the data body.

View File

@@ -16,12 +16,12 @@ def find_errors_in_log_file():
with open(log_file_path, 'r') as log_file:
log_lines = log_file.readlines()
error_logs = []
for i, line in enumerate(log_lines):
if "error" in line.lower():
start_index = max(0, i - prelines)
end_index = min(len(log_lines), i + postlines + 1)
error_logs.extend(log_lines[start_index:end_index])
error_logs = []
for i, line in enumerate(log_lines):
if "error" in line.lower():
start_index = max(0, i - prelines)
end_index = min(len(log_lines), i + postlines + 1)
error_logs.extend(log_lines[start_index:end_index])
return error_logs
@@ -32,7 +32,6 @@ data = {
"model": "mattw/loganalyzer"
}
response = requests.post("http://localhost:11434/api/generate", json=data, stream=True)
for line in response.iter_lines():
if line:

View File

@@ -2,12 +2,34 @@
![loganalyzer 2023-11-10 08_53_29](https://github.com/jmorganca/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)
This example shows one possible way to create a log file analyzer. To use it, run:
This example shows one possible way to create a log file analyzer. It uses the model **mattw/loganalyzer** which is based on **codebooga**, a 34b parameter model.
To use it, run:
`python loganalysis.py <logfile>`
You can try this with the `logtest.logfile` file included in this directory.
## Running the Example
1. Ensure you have the `mattw/loganalyzer` model installed:
```bash
ollama pull mattw/loganalyzer
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python loganalysis.py logtest.logfile
```
## Review the code
The first part of this example is a Modelfile that takes `codebooga` and applies a new System Prompt:
@@ -45,4 +67,4 @@ for line in response.iter_lines():
There is a lot more that can be done here. This is a simple way to detect errors, looking for the word error. Perhaps it would be interesting to find anomalous activity in the logs. It could be interesting to create embeddings for each line and compare them, looking for similar lines. Or look into applying Levenshtein Distance algorithms to find similar lines to help identify the anomalous lines.
Also try different models and different prompts to analyze the data. You could consider adding retrieval augmented generation (RAG) to this to help understand newer log formats.
Try different models and different prompts to analyze the data. You could consider adding retrieval augmented generation (RAG) to this to help understand newer log formats.

View File

@@ -14,9 +14,22 @@ This example goes through a series of steps:
This example lets you pick from a few different topic areas, then summarize the most recent x articles for that topic. It then creates chunks of sentences from each article and then generates embeddings for each of those chunks.
You can run the example like this:
## Running the Example
```bash
pip install -r requirements.txt
python summ.py
```
1. Ensure you have the `mistral-openorca` model installed:
```bash
ollama pull mistral-openorca
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python summ.py
```

View File

@@ -24,7 +24,6 @@ def chat(messages):
# the response streams one token at a time, print that as we receive it
print(content, end="", flush=True)
if body.get("done", False):
message["content"] = output
return message
@@ -32,9 +31,11 @@ def chat(messages):
def main():
messages = []
while True:
user_input = input("Enter a prompt: ")
if not user_input:
exit()
print()
messages.append({"role": "user", "content": user_input})
message = chat(messages)

View File

@@ -1,6 +1,26 @@
# Simple Chat Example
The **chat** endpoint is one of two ways to generate text from an LLM with Ollama. At a high level you provide the endpoint an array of objects with a role and content specified. Then with each output and prompt, you add more of those role/content objects, which builds up the history.
The **chat** endpoint is one of two ways to generate text from an LLM with Ollama, and is introduced in version 0.1.14. At a high level, you provide the endpoint an array of objects with a role and content specified. Then with each output and prompt, you add more of those role/content objects, which builds up the history.
## Running the Example
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama2
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python client.py
```
## Review the Code

View File

@@ -0,0 +1 @@
Requests==2.31.0

View File

@@ -0,0 +1,29 @@
# Simple Generate Example
This is a simple example using the **Generate** endpoint.
## Running the Example
1. Ensure you have the `stablelm-zephyr` model installed:
```bash
ollama pull stablelm-zephyr
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python client.py
```
## Review the Code
The **main** function simply asks for input, then passes that to the generate function. The output from generate is then passed back to generate on the next run.
The **generate** function uses `requests.post` to call `/api/generate`, passing the model, prompt, and context. The `generate` endpoint returns a stream of JSON blobs that are then iterated through, looking for the response values. That is then printed out. The final JSON object includes the full context of the conversation so far, and that is the return value from the function.

View File

@@ -2,7 +2,7 @@ import json
import requests
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
model = 'llama2' # TODO: update this for whatever model you wish to use
model = 'stablelm-zephyr' # TODO: update this for whatever model you wish to use
def generate(prompt, context):
r = requests.post('http://localhost:11434/api/generate',
@@ -30,6 +30,8 @@ def main():
context = [] # the context stores a conversation history, you can use this to make the model more context aware
while True:
user_input = input("Enter a prompt: ")
if not user_input:
exit()
print()
context = generate(user_input, context)
print()

View File

@@ -0,0 +1 @@
Requests==2.31.0

View File

@@ -4,18 +4,62 @@ This example demonstrates how one would create a set of 'mentors' you can have a
## Usage
```bash
ts-node ./character-generator.ts "Lorne Greene"
```
1. Add llama2 to have the mentors ask your questions:
This will create `lornegreene/Modelfile`. Now you can create a model with this command:
```bash
ollama pull llama2
```
```bash
ollama create lornegreene -f lornegreene/Modelfile
```
2. Install prerequisites:
If you want to add your own mentors, you will have to update the code to look at your namespace instead of **mattw**. Also set the list of mentors to include yours.
```bash
npm install
```
```bash
ts-node ./mentors.ts "What is a Jackalope?"
```
3. Ask a question:
```bash
npm start "what is a jackalope"
```
You can also add your own character to be chosen at random when you ask a question.
1. Make sure you have the right model installed:
```bash
ollama pull stablebeluga2:70b-q4_K_M
```
2. Create a new character:
```bash
npm run charactergen "Lorne Greene"
```
You can choose any well-known person you like. This example will create `lornegreene/Modelfile`.
3. Now you can create a model with this command:
```bash
ollama create <YourNamespace>/lornegreene -f lornegreene/Modelfile
```
`YourNamespace` is whatever name you set up when you signed up at [https://ollama.ai/signup](https://ollama.ai/signup).
4. To add this to your mentors, you will have to update the code as follows. On line 8 of `mentors.ts`, add an object to the array, replacing `<YourNamespace>` with the namespace you used above.
```bash
{ns: "<YourNamespace>", char: "Lorne Greene"}
```
## Review the Code
There are two scripts you can run in this example. The first is the main script to ask the mentors a question. The other one lets you generate a character to add to the mentors. Both scripts are mostly about adjusting the prompts at each inference stage.
### mentors.ts
In the **main** function, it starts by generating a list of mentors. This chooses 3 from a list of interesting characters. Then we ask for a question, and then things get interesting. We set the prompt for each of the 3 mentors a little differently. And the 2nd and 3rd mentors see what the previous folks said. The other functions in mentors sets the prompts for each mentor.
### character-generator.ts
**Character Generator** simply customizes the prompt to build a character profile for any famous person. And most of the script is just tweaking the prompt. This uses Stable Beluga 2 70b parameters. The 70b models tend to do better writing a bio about a character than smaller models, and Stable Beluga seemed to do better than Llama 2. Since this is used at development time for the characters, it doesn't affect the runtime of asking the mentors for their input.

View File

@@ -2,10 +2,11 @@ import { Ollama } from 'ollama-node';
const mentorCount = 3;
const ollama = new Ollama();
type Mentor = { ns: string, char: string };
function getMentors(): string[] {
const mentors = ['Gary Vaynerchuk', 'Kanye West', 'Martha Stewart', 'Neil deGrasse Tyson', 'Owen Wilson', 'Ronald Reagan', 'Donald Trump', 'Barack Obama', 'Jeff Bezos'];
const chosenMentors: string[] = [];
function getMentors(): Mentor[] {
const mentors = [{ ns: 'mattw', char: 'Gary Vaynerchuk' }, { ns: 'mattw', char: 'Kanye West'}, {ns: 'mattw', char: 'Martha Stewart'}, {ns: 'mattw', char: 'Neil deGrasse Tyson'}, {ns: 'mattw', char: 'Owen Wilson'}, {ns: 'mattw', char: 'Ronald Reagan'}, {ns: 'mattw', char: 'Donald Trump'}, {ns: 'mattw', char: 'Barack Obama'}, {ns: 'mattw', char: 'Jeff Bezos'}];
const chosenMentors: Mentor[] = [];
for (let i = 0; i < mentorCount; i++) {
const mentor = mentors[Math.floor(Math.random() * mentors.length)];
chosenMentors.push(mentor);
@@ -14,12 +15,12 @@ function getMentors(): string[] {
return chosenMentors;
}
function getMentorFileName(mentor: string): string {
const model = mentor.toLowerCase().replace(/\s/g, '');
return `mattw/${model}`;
function getMentorFileName(mentor: Mentor): string {
const model = mentor.char.toLowerCase().replace(/\s/g, '');
return `${mentor.ns}/${model}`;
}
async function getSystemPrompt(mentor: string, isLast: boolean, question: string): Promise<string> {
async function getSystemPrompt(mentor: Mentor, isLast: boolean, question: string): Promise<string> {
ollama.setModel(getMentorFileName(mentor));
const info = await ollama.showModelInfo()
let SystemPrompt = info.system || '';
@@ -43,8 +44,8 @@ async function main() {
ollama.setModel(getMentorFileName(mentor));
ollama.setSystemPrompt(SystemPrompt);
let output = '';
process.stdout.write(`\n${mentor}: `);
for await (const chunk of ollama.streamingGenerate(theConversation + `Continue the conversation as if you were ${mentor} on the question "${question}".`)) {
process.stdout.write(`\n${mentor.char}: `);
for await (const chunk of ollama.streamingGenerate(theConversation + `Continue the conversation as if you were ${mentor.char} on the question "${question}".`)) {
if (chunk.response) {
output += chunk.response;
process.stdout.write(chunk.response);
@@ -52,7 +53,7 @@ async function main() {
process.stdout.write('\n');
}
}
theConversation += `${mentor}: ${output}\n\n`
theConversation += `${mentor.char}: ${output}\n\n`
}
}

View File

@@ -1,7 +1,15 @@
{
"scripts": {
"charactergen": "tsx character-generator.ts",
"start": "tsx mentors.ts"
},
"dependencies": {
"fs": "^0.0.1-security",
"ollama-node": "^0.0.3",
"path": "^0.12.7"
},
"devDependencies": {
"tsx": "^4.6.2",
"typescript": "^5.3.3"
}
}

View File

@@ -1 +1,12 @@
{ "dependencies": { "@types/node": "^20.10.4", "prompt-sync": "^4.2.0", "readline": "^1.3.0" } }
{
"scripts": {
"start": "tsx client.ts"
},
"dependencies": {
"@types/node": "^20.10.4",
"prompt-sync": "^4.2.0",
"readline": "^1.3.0",
"tsx": "^4.6.2",
"typescript": "^5.3.3"
}
}

View File

@@ -1,14 +1,10 @@
# Simple Chat Example
The **chat** endpoint is one of two ways to generate text from an LLM with Ollama. At a high level you provide the endpoint an array of message objects with a role and content specified. Then with each output and prompt, you add more messages, which builds up the history.
The **chat** endpoint, available as of v0.1.14, is one of two ways to generate text from an LLM with Ollama. At a high level, you provide the endpoint an array of message objects with a role and content specified. Then with each output and prompt, you add more messages, which builds up the history.
## Run the Example
There are a few ways to run this, just like any Typescript code:
1. Compile with `tsc` and then run it with `node client.js`.
2. Install `tsx` and run it with `tsx client.ts`.
3. Install `bun` and run it with `bun client.ts`.
`npm start`
## Review the Code
@@ -30,7 +26,7 @@ With the **generate** endpoint, you need to provide a `prompt`. But with **chat*
The final JSON object doesn't provide the full content, so you will need to build the content yourself. In this example, **chat** takes the full array of messages and outputs the resulting message from this call of the chat endpoint.
In the **askQuestion** function, we collect `user_input` and add it as a message to our messages and that is passed to the chat function. When the LLM is done responding the output is added as another message to the messages array.
In the **askQuestion** function, we collect `user_input` and add it as a message to our messages, and that is passed to the chat function. When the LLM is done responding, the output is added as another message to the messages array.
At the end, you will see a printout of all the messages.

7
go.mod
View File

@@ -1,17 +1,20 @@
module github.com/jmorganca/ollama
go 1.20
go 1.21
require (
github.com/emirpasic/gods v1.18.1
github.com/gin-gonic/gin v1.9.1
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.8.4
golang.org/x/sync v0.3.0
)
require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
)
@@ -42,7 +45,7 @@ require (
golang.org/x/crypto v0.14.0
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
golang.org/x/net v0.17.0 // indirect
golang.org/x/sys v0.13.0 // indirect
golang.org/x/sys v0.13.0
golang.org/x/term v0.13.0
golang.org/x/text v0.13.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect

3
go.sum
View File

@@ -98,8 +98,9 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=

21
gpu/cpu_common.go Normal file
View File

@@ -0,0 +1,21 @@
package gpu
import (
"log/slog"
"golang.org/x/sys/cpu"
)
func GetCPUVariant() string {
if cpu.X86.HasAVX2 {
slog.Info("CPU has AVX2")
return "avx2"
}
if cpu.X86.HasAVX {
slog.Info("CPU has AVX")
return "avx"
}
slog.Info("CPU does not have vector extensions")
// else LCD
return ""
}

283
gpu/gpu.go Normal file
View File

@@ -0,0 +1,283 @@
//go:build linux || windows
package gpu
/*
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -lpthread
#include "gpu_info.h"
*/
import "C"
import (
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"unsafe"
)
type handles struct {
cuda *C.cuda_handle_t
rocm *C.rocm_handle_t
}
var gpuMutex sync.Mutex
var gpuHandles *handles = nil
// With our current CUDA compile flags, 5.2 and older will not work properly
const CudaComputeMajorMin = 6
// Possible locations for the nvidia-ml library
var CudaLinuxGlobs = []string{
"/usr/local/cuda/lib64/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
"/usr/lib/wsl/lib/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*",
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
}
var CudaWindowsGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var RocmLinuxGlobs = []string{
"/opt/rocm*/lib*/librocm_smi64.so*",
}
var RocmWindowsGlobs = []string{
"c:\\Windows\\System32\\rocm_smi64.dll",
}
// Note: gpuMutex must already be held
func initGPUHandles() {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
gpuHandles = &handles{nil, nil}
var cudaMgmtName string
var cudaMgmtPatterns []string
var rocmMgmtName string
var rocmMgmtPatterns []string
switch runtime.GOOS {
case "windows":
cudaMgmtName = "nvml.dll"
cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
copy(cudaMgmtPatterns, CudaWindowsGlobs)
rocmMgmtName = "rocm_smi64.dll"
rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
copy(rocmMgmtPatterns, RocmWindowsGlobs)
case "linux":
cudaMgmtName = "libnvidia-ml.so"
cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
copy(cudaMgmtPatterns, CudaLinuxGlobs)
rocmMgmtName = "librocm_smi64.so"
rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
copy(rocmMgmtPatterns, RocmLinuxGlobs)
default:
return
}
slog.Info("Detecting GPU type")
cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
if len(cudaLibPaths) > 0 {
cuda := LoadCUDAMgmt(cudaLibPaths)
if cuda != nil {
slog.Info("Nvidia GPU detected")
gpuHandles.cuda = cuda
return
}
}
rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
if len(rocmLibPaths) > 0 {
rocm := LoadROCMMgmt(rocmLibPaths)
if rocm != nil {
slog.Info("Radeon GPU detected")
gpuHandles.rocm = rocm
return
}
}
}
func GetGPUInfo() GpuInfo {
// TODO - consider exploring lspci (and equivalent on windows) to check for
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
if gpuHandles == nil {
initGPUHandles()
}
var memInfo C.mem_info_t
resp := GpuInfo{}
if gpuHandles.cuda != nil {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else {
// Verify minimum compute capability
var cc C.cuda_compute_capability_t
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major >= CudaComputeMajorMin {
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else if gpuHandles.rocm != nil {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else {
resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
verString := C.GoString(version.str)
if version.status == 0 {
resp.Variant = "v" + verString
} else {
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
}
C.free(unsafe.Pointer(version.str))
}
}
if resp.Library == "" {
C.cpu_check_ram(&memInfo)
resp.Library = "cpu"
resp.Variant = GetCPUVariant()
}
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
return resp
}
resp.DeviceCount = uint32(memInfo.count)
resp.FreeMemory = uint64(memInfo.free)
resp.TotalMemory = uint64(memInfo.total)
return resp
}
func getCPUMem() (memInfo, error) {
var ret memInfo
var info C.mem_info_t
C.cpu_check_ram(&info)
if info.err != nil {
defer C.free(unsafe.Pointer(info.err))
return ret, fmt.Errorf(C.GoString(info.err))
}
ret.FreeMemory = uint64(info.free)
ret.TotalMemory = uint64(info.total)
return ret, nil
}
func CheckVRAM() (int64, error) {
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
return int64(gpuInfo.FreeMemory), nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
}
func FindGPULibs(baseLibName string, patterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
gpuLibPaths := []string{}
slog.Info(fmt.Sprintf("Searching for GPU management library %s", baseLibName))
switch runtime.GOOS {
case "windows":
ldPaths = strings.Split(os.Getenv("PATH"), ";")
case "linux":
ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
default:
return gpuLibPaths
}
// Start with whatever we find in the PATH/LD_LIBRARY_PATH
for _, ldPath := range ldPaths {
d, err := filepath.Abs(ldPath)
if err != nil {
continue
}
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
}
slog.Debug(fmt.Sprintf("gpu management search paths: %v", patterns))
for _, pattern := range patterns {
// Ignore glob discovery errors
matches, _ := filepath.Glob(pattern)
for _, match := range matches {
// Resolve any links so we don't try the same lib multiple times
// and weed out any dups across globs
libPath := match
tmp := match
var err error
for ; err == nil; tmp, err = os.Readlink(libPath) {
if !filepath.IsAbs(tmp) {
tmp = filepath.Join(filepath.Dir(libPath), tmp)
}
libPath = tmp
}
new := true
for _, cmp := range gpuLibPaths {
if cmp == libPath {
new = false
break
}
}
if new {
gpuLibPaths = append(gpuLibPaths, libPath)
}
}
}
slog.Info(fmt.Sprintf("Discovered GPU libraries: %v", gpuLibPaths))
return gpuLibPaths
}
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
var resp C.cuda_init_resp_t
for _, libPath := range cudaLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.cuda_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.ch
}
}
return nil
}
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
var resp C.rocm_init_resp_t
for _, libPath := range rocmLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.rocm_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.rh
}
}
return nil
}

54
gpu/gpu_darwin.go Normal file
View File

@@ -0,0 +1,54 @@
//go:build darwin
package gpu
import "C"
import (
"runtime"
"github.com/pbnjay/memory"
)
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
if runtime.GOARCH == "amd64" {
// gpu not supported, this may not be metal
return 0, nil
}
// on macOS, there's already buffer for available vram (see below) so just return the total
systemMemory := int64(memory.TotalMemory())
// macOS limits how much memory is available to the GPU based on the amount of system memory
// TODO: handle case where iogpu.wired_limit_mb is set to a higher value
if systemMemory <= 36*1024*1024*1024 {
systemMemory = systemMemory * 2 / 3
} else {
systemMemory = systemMemory * 3 / 4
}
return systemMemory, nil
}
func GetGPUInfo() GpuInfo {
mem, _ := getCPUMem()
if runtime.GOARCH == "amd64" {
return GpuInfo{
Library: "cpu",
Variant: GetCPUVariant(),
memInfo: mem,
}
}
return GpuInfo{
Library: "metal",
memInfo: mem,
}
}
func getCPUMem() (memInfo, error) {
return memInfo{
TotalMemory: 0,
FreeMemory: 0,
DeviceCount: 0,
}, nil
}

51
gpu/gpu_info.h Normal file
View File

@@ -0,0 +1,51 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_H__
#define __GPU_INFO_H__
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
#define LOAD_ERR() ({\
LPSTR messageBuffer = NULL; \
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
char *resp = strdup(messageBuffer); \
LocalFree(messageBuffer); \
resp; \
})
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef struct mem_info {
uint64_t total;
uint64_t free;
unsigned int count;
char *err; // If non-nill, caller responsible for freeing
} mem_info_t;
void cpu_check_ram(mem_info_t *resp);
#ifdef __cplusplus
}
#endif
#include "gpu_info_cuda.h"
#include "gpu_info_rocm.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__

45
gpu/gpu_info_cpu.c Normal file
View File

@@ -0,0 +1,45 @@
#include "gpu_info.h"
// Fallbacks for CPU mode
#ifdef _WIN32
#include <sysinfoapi.h>
void cpu_check_ram(mem_info_t *resp) {
resp->err = NULL;
MEMORYSTATUSEX info;
info.dwLength = sizeof(info);
if (GlobalMemoryStatusEx(&info) != 0) {
resp->count = 1;
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
} else {
resp->err = LOAD_ERR();
}
return;
}
#elif __linux__
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
void cpu_check_ram(mem_info_t *resp) {
struct sysinfo info;
resp->err = NULL;
if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno));
} else {
resp->count = 1;
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
}
return;
}
#elif __APPLE__
// TODO consider an Apple implementation that does something useful
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else
#error "Unsupported platform"
#endif

155
gpu/gpu_info_cuda.c Normal file
View File

@@ -0,0 +1,155 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include "gpu_info_cuda.h"
#include <string.h>
#define CUDA_LOOKUP_SIZE 6
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[CUDA_LOOKUP_SIZE] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
};
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.initFn)();
if (ret != NVML_SUCCESS) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
}
return;
}
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->err = NULL;
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle sn't initialized");
return;
}
ret = (*h.getCount)(&resp->count);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = 0;
resp->free = 0;
for (i = 0; i < resp->count; i++) {
ret = (*h.getHandle)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
resp->total += memInfo.total;
resp->free += memInfo.free;
}
}
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
resp->err = NULL;
resp->major = 0;
resp->minor = 0;
nvmlDevice_t device;
int major = 0;
int minor = 0;
nvmlReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle not initialized");
return;
}
unsigned int devices;
ret = (*h.getCount)(&devices);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
for (i = 0; i < devices; i++) {
ret = (*h.getHandle)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getComputeCapability)(device, &major, &minor);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
// Report the lowest major.minor we detect as that limits our compatibility
if (resp->major == 0 || resp->major > major ) {
resp->major = major;
resp->minor = minor;
} else if ( resp->major == major && resp->minor > minor ) {
resp->minor = minor;
}
}
}
#endif // __APPLE__

44
gpu/gpu_info_cuda.h Normal file
View File

@@ -0,0 +1,44 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDA_H__
#define __GPU_INFO_CUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef struct cuda_handle {
void *handle;
nvmlReturn_t (*initFn)(void);
nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
nvmlReturn_t (*getCount)(unsigned int *);
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
} cuda_handle_t;
typedef struct cuda_init_resp {
char *err; // If err is non-null handle is invalid
cuda_handle_t ch;
} cuda_init_resp_t;
typedef struct cuda_compute_capability {
char *err;
int major;
int minor;
} cuda_compute_capability_t;
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
#endif // __GPU_INFO_CUDA_H__
#endif // __APPLE__

130
gpu/gpu_info_rocm.c Normal file
View File

@@ -0,0 +1,130 @@
#ifndef __APPLE__
#include "gpu_info_rocm.h"
#include <string.h>
#define ROCM_LOOKUP_SIZE 5
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
rsmi_status_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[ROCM_LOOKUP_SIZE] = {
{"rsmi_init", (void *)&resp->rh.initFn},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
{"rsmi_version_get", (void *)&resp->rh.versionGetFn},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
};
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
if (!resp->rh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL;
char *msg = LOAD_ERR();
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->rh.initFn)(0);
if (ret != RSMI_STATUS_SUCCESS) {
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL;
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf);
}
return;
}
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
// uint32_t num_devices;
// uint16_t device;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("rocm handle not initialized");
return;
}
// TODO - iterate through devices... ret =
// rsmi_num_monitor_devices(&num_devices);
// ret = (*h.getHandle)(0, &device);
// if (ret != RSMI_STATUS_SUCCESS) {
// printf("rocm vram device lookup failure: %d\n", ret);
// return -1;
// }
// Get total memory - used memory for available memory
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO: set this to the actual number of devices
resp->count = 1;
resp->total = totalMem;
resp->free = totalMem - usedMem;
return;
}
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("nvml handle not initialized");
resp->status = 1;
return;
}
rsmi_version_t ver;
rsmi_status_t ret;
ret = h.versionGetFn(&ver);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1;
} else {
snprintf(buf, buflen, "%d", ver.major);
resp->status = 0;
}
resp->str = strdup(buf);
}
#endif // __APPLE__

50
gpu/gpu_info_rocm.h Normal file
View File

@@ -0,0 +1,50 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_ROCM_H__
#define __GPU_INFO_ROCM_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum rsmi_status_return {
RSMI_STATUS_SUCCESS = 0,
// Other values omitted for now...
} rsmi_status_t;
typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_VRAM = 0,
RSMI_MEM_TYPE_VIS_VRAM,
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct {
uint32_t major;
uint32_t minor;
uint32_t patch;
const char *build;
} rsmi_version_t;
typedef struct rocm_handle {
void *handle;
rsmi_status_t (*initFn)(uint64_t);
rsmi_status_t (*shutdownFn)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
} rocm_handle_t;
typedef struct rocm_init_resp {
char *err; // If err is non-null handle is invalid
rocm_handle_t rh;
} rocm_init_resp_t;
typedef struct rocm_version_resp {
rsmi_status_t status;
char *str; // Contains version or error string if status != 0
} rocm_version_resp_t;
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__

41
gpu/gpu_test.go Normal file
View File

@@ -0,0 +1,41 @@
package gpu
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.Contains(t, "cuda rocm cpu metal", info.Library)
switch runtime.GOOS {
case "darwin":
// TODO - remove this once MacOS returns some size for CPU
return
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
assert.Greater(t, info.DeviceCount, uint32(0))
default:
return
}
}
func TestCPUMemInfo(t *testing.T) {
info, err := getCPUMem()
assert.NoError(t, err)
switch runtime.GOOS {
case "darwin":
t.Skip("CPU memory not populated on darwin")
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected

18
gpu/types.go Normal file
View File

@@ -0,0 +1,18 @@
package gpu
type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
DeviceCount uint32 `json:"device_count,omitempty"`
}
// Beginning of an `ollama info` command
type GpuInfo struct {
memInfo
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`
// TODO add other useful attributes about the card here for discovery information
}

145
llm/dyn_ext_server.c Normal file
View File

@@ -0,0 +1,145 @@
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#elif _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
inline char *LOAD_ERR() {
LPSTR messageBuffer = NULL;
size_t size = FormatMessageA(
FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
(LPSTR)&messageBuffer, 0, NULL);
char *resp = strdup(messageBuffer);
LocalFree(messageBuffer);
return resp;
}
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("loading library %s\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_GLOBAL|RTLD_NOW);
if (!s->handle) {
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len,
"Unable to load dynamic server library: %s", msg);
free(msg);
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, msg);
free(msg);
return;
}
}
}
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}

377
llm/dyn_ext_server.go Normal file
View File

@@ -0,0 +1,377 @@
package llm
/*
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
#include <stdlib.h>
#include "dyn_ext_server.h"
*/
import "C"
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
"unsafe"
"github.com/jmorganca/ollama/api"
)
type dynExtServer struct {
s C.struct_dynamic_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex
func newExtServerResp(len C.size_t) C.ext_server_resp_t {
var resp C.ext_server_resp_t
resp.msg_len = len
bytes := make([]byte, len)
resp.msg = (*C.char)(C.CBytes(bytes))
return resp
}
func freeExtServerResp(resp C.ext_server_resp_t) {
if resp.msg_len == 0 {
return
}
C.free(unsafe.Pointer(resp.msg))
}
func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
// Note: current implementation does not support concurrent instantiations
var llm *dynExtServer
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
updatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(512)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dyn_init(libPath, &srv, &resp)
if resp.id < 0 {
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm = &dynExtServer{
s: srv,
options: opts,
}
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
var sparams C.ext_server_params_t
sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model))
sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx)
sparams.n_batch = C.uint(opts.NumBatch)
sparams.n_gpu_layers = C.int(opts.NumGPU)
sparams.main_gpu = C.int(opts.MainGPU)
sparams.n_parallel = 1 // TODO - wire up concurrency
// Always use the value encoded in the model
sparams.rope_freq_base = 0.0
sparams.rope_freq_scale = 0.0
sparams.memory_f16 = C.bool(opts.F16KV)
sparams.use_mlock = C.bool(opts.UseMLock)
sparams.use_mmap = C.bool(opts.UseMMap)
sparams.numa = C.bool(opts.UseNUMA)
sparams.lora_adapters = nil
for i := 0; i < len(adapters); i++ {
la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
defer C.free(unsafe.Pointer(la))
la.adapter = C.CString(adapters[i])
defer C.free(unsafe.Pointer(la.adapter))
la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
la.next = nil
if i == 0 {
sparams.lora_adapters = la
} else {
tmp := sparams.lora_adapters
for ; tmp.next != nil; tmp = tmp.next {
}
tmp.next = la
}
}
if len(projectors) > 0 {
// TODO: applying multiple projectors is not supported by the llama.cpp server yet
sparams.mmproj = C.CString(projectors[0])
defer C.free(unsafe.Pointer(sparams.mmproj))
} else {
sparams.mmproj = nil
}
sparams.n_threads = C.uint(opts.NumThread)
slog.Info("Initializing llama server")
initResp := newExtServerResp(128)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
return nil, extServerResponseToErr(initResp)
}
slog.Info("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
return llm, nil
}
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var imageData []ImageData
if len(predict.Images) > 0 {
for cnt, i := range predict.Images {
imageData = append(imageData, ImageData{Data: i, ID: cnt})
}
}
slog.Info(fmt.Sprintf("loaded %d images", len(imageData)))
request := map[string]any{
"prompt": predict.Prompt,
"stream": true,
"n_predict": predict.Options.NumPredict,
"n_keep": predict.Options.NumKeep,
"temperature": predict.Options.Temperature,
"top_k": predict.Options.TopK,
"top_p": predict.Options.TopP,
"tfs_z": predict.Options.TFSZ,
"typical_p": predict.Options.TypicalP,
"repeat_last_n": predict.Options.RepeatLastN,
"repeat_penalty": predict.Options.RepeatPenalty,
"presence_penalty": predict.Options.PresencePenalty,
"frequency_penalty": predict.Options.FrequencyPenalty,
"mirostat": predict.Options.Mirostat,
"mirostat_tau": predict.Options.MirostatTau,
"mirostat_eta": predict.Options.MirostatEta,
"penalize_nl": predict.Options.PenalizeNewline,
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": imageData,
}
if predict.Format == "json" {
request["grammar"] = jsonGrammar
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %w", err)
}
req := C.CString(buffer.String())
defer C.free(unsafe.Pointer(req))
C.dyn_llama_server_completion(llm.s, req, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
}
retryNeeded := false
out:
for {
select {
case <-ctx.Done():
// This handles the request cancellation
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
default:
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
json_resp := C.GoString(result.json_resp)
C.dyn_llama_server_release_task_result(llm.s, &result)
var p prediction
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
} else {
return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
}
}
if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
retryNeeded = true
// task will already be canceled
break out
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
})
}
if p.Stop {
fn(PredictResult{
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err2)
}
return encoded.Tokens, err
}
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 {
return "", nil
}
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return "", extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err2)
}
return decoded.Content, err
}
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return embedding.Embedding, nil
}
func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}
func updatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

74
llm/dyn_ext_server.h Normal file
View File

@@ -0,0 +1,74 @@
#include <stdlib.h>
#include "ext_server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct dynamic_llama_server {
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,25 @@
# Ollama specific CMakefile to include in llama.cpp/examples/server
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
if (WIN32)
add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
else()
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
endif()
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(${TARGET} PRIVATE nvml)
endif()
endif()

18
llm/ext_server/README.md Normal file
View File

@@ -0,0 +1,18 @@
# Extern C Server
This directory contains a thin facade we layer on top of the Llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected to support multiple GPU types as well as CPU
only support. The Ollama go build then embeds these different servers to support
different GPUs and settings at runtime.
If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```

View File

@@ -0,0 +1,290 @@
#include "ext_server.h"
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::atomic<bool> ext_server_running(false);
std::thread ext_server_thread;
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
#if SERVER_VERBOSE != 1
log_disable();
#endif
LOG_TEE("system info: %s", llama_print_system_info());
assert(err != NULL && sparams != NULL);
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
log_set_target(stdout);
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
if (sparams->n_threads > 0) {
params.n_threads = sparams->n_threads;
}
params.n_parallel = sparams->n_parallel;
params.rope_freq_base = sparams->rope_freq_base;
params.rope_freq_scale = sparams->rope_freq_scale;
if (sparams->memory_f16) {
params.cache_type_k = "f16";
params.cache_type_v = "f16";
} else {
params.cache_type_k = "f32";
params.cache_type_v = "f32";
}
params.n_gpu_layers = sparams->n_gpu_layers;
params.main_gpu = sparams->main_gpu;
params.use_mlock = sparams->use_mlock;
params.use_mmap = sparams->use_mmap;
params.numa = sparams->numa;
params.embedding = sparams->embedding;
if (sparams->model != NULL) {
params.model = sparams->model;
}
if (sparams->lora_adapters != NULL) {
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
}
if (sparams->mmproj != NULL) {
params.mmproj = std::string(sparams->mmproj);
}
llama_backend_init(params.numa);
// load the model
if (!llama->load_model(params)) {
// TODO - consider modifying the logging logic or patching load_model so
// we can capture more detailed error messages and pass them back to the
// caller for better UX
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s",
params.model.c_str());
return;
}
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception initializing llama server");
}
}
void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
ext_server_running = true;
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
while (ext_server_running.load()) {
if (!llama->update_slots()) {
LOG_TEE(
"unexpected error in llama server update_slots - exiting main "
"loop\n");
break;
}
}
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
LOG_TEE("caught unknown exception in llama server main loop\n");
}
LOG_TEE("\nllama server shutting down\n");
llama_backend_free();
});
}
void llama_server_stop() {
assert(llama != NULL);
// TODO - too verbose, remove once things are solid
LOG_TEE("requesting llama server shutdown\n");
ext_server_running = false;
// unblocks the update_slots() loop so it can clean up and exit
llama->request_cancel(0);
ext_server_thread.join();
delete llama;
llama = NULL;
LOG_TEE("llama server shutdown complete\n");
}
void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
assert(llama != NULL && json_req != NULL && resp != NULL);
resp->id = -1;
resp->msg[0] = '\0';
try {
json data = json::parse(json_req);
resp->id = llama->request_completion(data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
}
}
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *resp) {
assert(llama != NULL && resp != NULL);
std::string msg;
resp->id = -1;
resp->stop = false;
resp->error = false;
resp->json_resp = NULL;
std::string result_json;
try {
task_result result = llama->next_result(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
llama->request_cancel(task_id);
} else if (result.stop) {
llama->request_cancel(task_id);
}
} catch (std::exception &e) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
LOG_TEE("llama server completion exception %s\n", e.what());
} catch (...) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"Unknown exception during completion\"}";
LOG_TEE("llama server completion unknown exception\n");
}
const std::string::size_type size = result_json.size() + 1;
resp->json_resp = new char[size];
snprintf(resp->json_resp, size, "%s", result_json.c_str());
}
void llama_server_release_task_result(ext_server_task_result_t *result) {
if (result == NULL || result->json_resp == NULL) {
return;
}
delete[] result->json_resp;
}
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
assert(llama != NULL && err != NULL);
err->id = 0;
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception completion cancel in llama server");
}
}
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
const json body = json::parse(json_req);
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
tokens = llama->tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
}
}
void llama_server_release_json_resp(char **json_resp) {
if (json_resp == NULL || *json_resp == NULL) {
return;
}
delete[] *json_resp;
}
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
const json body = json::parse(json_req);
std::string content;
if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"];
content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
}
const json data = format_detokenized_response(content);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
}
}
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
const json body = json::parse(json_req);
json prompt;
if (body.count("content") != 0) {
prompt = body["content"];
} else {
prompt = "";
}
const int task_id = llama->request_completion(
{{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
task_result result = llama->next_result(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
}
}

View File

@@ -0,0 +1,94 @@
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
int __main(int argc, char **argv);
// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ext_server_resp {
int id; // < 0 on error
size_t msg_len; // caller must allocate msg and set msg_len
char *msg;
} ext_server_resp_t;
// Allocated and freed by caller
typedef struct ext_server_lora_adapter {
char *adapter;
float scale;
struct ext_server_lora_adapter *next;
} ext_server_lora_adapter_t;
// Allocated and freed by caller
typedef struct ext_server_params {
char *model;
uint32_t n_ctx; // token context window, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
int32_t n_parallel; // number of parallel sequences to decodewra
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
bool memory_f16; // use f16 instead of f32 for memory kv
int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu; // the GPU that is used for scratch and small tensors
bool use_mlock; // force system to keep model in RAM
bool use_mmap; // use mmap if possible
bool numa; // attempt optimizations that help on some NUMA systems
bool embedding; // get only sentence embedding
ext_server_lora_adapter_t *lora_adapters;
char *mmproj;
} ext_server_params_t;
typedef struct ext_server_task_result {
int id;
bool stop;
bool error;
char *json_resp; // null terminated, memory managed by ext_server
} ext_server_task_result_t;
// Initialize the server once per process
// err->id = 0 for success and err->msg[0] = NULL
// err->id != 0 for failure, and err->msg contains error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
// Run the main loop, called once per init
void llama_server_start();
// Stop the main loop and free up resources allocated in init and start. Init
// must be called again to reuse
void llama_server_stop();
// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
// Caller must call llama_server_release_task_result to free resp->json_resp
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *result);
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
void llama_server_release_task_result(ext_server_task_result_t *result);
// Caller must call llama_server_releaes_json_resp to free json_resp if err.id <
// 0
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_release_json_resp(char **json_resp);
#ifdef __cplusplus
}
#endif
#endif
#endif // LLAMA_SERVER_LIBRARY

100
llm/generate/gen_common.sh Normal file
View File

@@ -0,0 +1,100 @@
# common logic accross linux and darwin
init_vars() {
case "${GOARCH}" in
"amd64")
ARCH="x86_64"
;;
"arm64")
ARCH="arm64"
;;
*)
ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
esac
LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS=""
CMAKE_TARGETS="--target ext_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
fi
case $(uname -s) in
"Darwin")
LIB_EXT="dylib"
WHOLE_ARCHIVE="-Wl,-force_load"
NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}"
;;
"Linux")
LIB_EXT="so"
WHOLE_ARCHIVE="-Wl,--whole-archive"
NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
# Cross compiling not supported on linux - Use docker
GCC_ARCH=""
;;
*)
;;
esac
}
git_module_setup() {
if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
echo "Skipping submodule initialization"
return
fi
# Make sure the tree is clean after the directory moves
if [ -d "${LLAMACPP_DIR}/gguf" ]; then
echo "Cleaning up old submodule"
rm -rf ${LLAMACPP_DIR}
fi
git submodule init
git submodule update --force ${LLAMACPP_DIR}
}
apply_patches() {
# Wire up our CMakefile
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
fi
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
}
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
-lpthread -ldl -lm \
${EXTRA_LIBS}
}
compress_libs() {
echo "Compressing payloads to reduce overall binary size..."
pids=""
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
gzip --best ${lib} &
pids+=" $!"
done
echo
for pid in ${pids}; do
wait $pid
done
echo "Finished compression"
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
}

66
llm/generate/gen_darwin.sh Executable file
View File

@@ -0,0 +1,66 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/
# TODO - add hardening to detect missing tools (cmake, etc.)
set -ex
set -o pipefail
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=off"
case "${GOARCH}" in
"amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
compress_libs
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
;;
"arm64")
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
compress_libs
;;
*)
echo "GOARCH must be set"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
cleanup

180
llm/generate/gen_linux.sh Executable file
View File

@@ -0,0 +1,180 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be llm/generate/
# First we build one or more CPU based LLM libraries
#
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCM
# libraries are quite large, and also dynamically load data files at runtime
# which in turn are large, so we don't attempt to cary them as payload
set -ex
set -o pipefail
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
GPU_LIST=(
"gfx803"
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
"gfx1102"
)
(
IFS=$';'
echo "'${GPU_LIST[*]}'"
)
}
echo "Starting linux generate script"
if [ -z "${CUDACXX}" ]; then
if [ -x /usr/local/cuda/bin/nvcc ]; then
export CUDACXX=/usr/local/cuda/bin/nvcc
else
# Try the default location in case it exists
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
compress_libs
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# Note: the following seem to yield slower results than AVX2 - ymmv
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
compress_libs
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
fi
else
echo "Skipping CPU generation step as requested"
fi
# If needed, look for the default CUDA toolkit location
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
CUDA_LIB_DIR=/usr/local/cuda/lib64
fi
# If needed, look for CUDA on Arch Linux
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi
if [ -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
# Cary the CUDA libs as payloads to help reduce dependency burden on users
#
# TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
fi
done
compress_libs
fi
if [ -z "${ROCM_PATH}" ]; then
# Try the default location in case it exists
ROCM_PATH=/opt/rocm
fi
if [ -z "${CLBlast_DIR}" ]; then
# Try the default location in case it exists
if [ -d /usr/lib/cmake/CLBlast ]; then
export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
fi
if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Note: the ROCM libs and runtime library files are too large to embed, so we depend on
# them being present at runtime on the host
compress_libs
fi
cleanup

View File

@@ -0,0 +1,147 @@
#!powershell
$ErrorActionPreference = "Stop"
function init_vars {
$script:llamacppDir = "../llama.cpp"
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A","x64")
$script:cmakeTargets = @("ext_server")
$script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
$script:config = "RelWithDebInfo"
} else {
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
$script:config = "Release"
}
# Try to find the CUDA dir
if ($env:CUDA_LIB_DIR -eq $null) {
$d=(get-command -ea 'silentlycontinue' nvcc).path
if ($d -ne $null) {
$script:CUDA_LIB_DIR=($d| split-path -parent)
}
} else {
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
}
function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& git submodule update --force "${script:llamacppDir}"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
}
# Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main('
Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
}
function build {
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function install {
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
if ($script:DUMPBIN -ne $null) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
}
function compress_libs {
if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files"
return
}
write-host "Compressing dlls..."
$libs = dir "${script:buildDir}/lib/*.dll"
foreach ($file in $libs) {
& "$script:GZIP" --best $file
}
}
function cleanup {
Set-Location "${script:llamacppDir}/examples/server"
git checkout CMakeLists.txt server.cpp
}
init_vars
git_module_setup
apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
compress_libs
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
compress_libs
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
compress_libs
if ($null -ne $script:CUDA_LIB_DIR) {
# Then build cuda as a dynamically loaded library
$nvcc = (get-command -ea 'silentlycontinue' nvcc)
if ($null -ne $nvcc) {
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
}
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
build
install
cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
cp "${script:CUDA_LIB_DIR}/cublas64_*.dll" "${script:buildDir}/lib"
cp "${script:CUDA_LIB_DIR}/cublasLt64_*.dll" "${script:buildDir}/lib"
compress_libs
}
# TODO - actually implement ROCm support on windows
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
echo $null >> "${script:buildDir}/lib/.generated"
cleanup
write-host "`ngo generate completed"

View File

@@ -0,0 +1,3 @@
package generate
//go:generate sh ./gen_darwin.sh

View File

@@ -0,0 +1,3 @@
package generate
//go:generate bash ./gen_linux.sh

View File

@@ -0,0 +1,3 @@
package generate
//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1

View File

@@ -78,7 +78,12 @@ type model interface {
ModelFamily() string
ModelType() string
FileType() string
NumLayers() int64
NumLayers() uint32
NumGQA() uint32
NumEmbed() uint32
NumHead() uint32
NumHeadKv() uint32
NumCtx() uint32
}
type container interface {
@@ -86,74 +91,6 @@ type container interface {
Decode(*readSeekOffset) (model, error)
}
type containerGGML struct{}
func (c *containerGGML) Name() string {
return "ggml"
}
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
// file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
type containerGGMF struct {
version uint32
}
func (c *containerGGMF) Name() string {
return "ggmf"
}
func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
var version uint32
binary.Read(ro, binary.LittleEndian, &version)
switch version {
case 1:
default:
return nil, errors.New("invalid version")
}
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return nil, nil
}
type containerGGJT struct {
version uint32
}
func (c *containerGGJT) Name() string {
return "ggjt"
}
func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
var version uint32
binary.Read(ro, binary.LittleEndian, &version)
switch version {
case 1, 2, 3:
default:
return nil, errors.New("invalid version")
}
c.version = version
// different model types may have different layouts for hyperparameters
var llama llamaModel
binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
return &llama, nil
}
type containerLORA struct {
version uint32
}
@@ -162,9 +99,9 @@ func (c *containerLORA) Name() string {
return "ggla"
}
func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
var version uint32
binary.Read(ro, binary.LittleEndian, &version)
binary.Read(rso, binary.LittleEndian, &version)
switch version {
case 1:
@@ -175,7 +112,7 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
c.version = version
// remaining file contents aren't decoded
ro.Seek(0, io.SeekEnd)
rso.Seek(0, io.SeekEnd)
return nil, nil
}
@@ -194,6 +131,8 @@ const (
FILE_MAGIC_GGUF_BE = 0x47475546
)
var ErrUnsupportedFormat = errors.New("unsupported model format")
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
ro := readSeekOffset{ReadSeeker: r}
@@ -204,12 +143,8 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
var c container
switch magic {
case FILE_MAGIC_GGML:
c = &containerGGML{}
case FILE_MAGIC_GGMF:
c = &containerGGMF{}
case FILE_MAGIC_GGJT:
c = &containerGGJT{}
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
return nil, ErrUnsupportedFormat
case FILE_MAGIC_GGLA:
c = &containerLORA{}
case FILE_MAGIC_GGUF_LE:

View File

@@ -272,14 +272,58 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
return nil
}
func (llm *ggufModel) NumLayers() int64 {
func (llm *ggufModel) NumLayers() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
if !exists {
return 0
}
v := value.(uint32)
return int64(v)
return value.(uint32)
}
func (llm *ggufModel) NumHead() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumEmbed() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumHeadKv() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumCtx() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumGQA() uint32 {
numHeadKv := llm.NumHeadKv()
if numHeadKv == 0 {
return 0
}
return llm.NumHead() / numHeadKv
}
func (llm ggufModel) readU8(r io.Reader) uint8 {

1
llm/llama.cpp Submodule

Submodule llm/llama.cpp added at 584d674be6

View File

@@ -1,18 +0,0 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner

View File

@@ -1,18 +0,0 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/metal --target server --config Release
//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/metal --target server --config Release
//go:generate mv gguf/build/metal/bin/server gguf/build/metal/bin/ollama-runner

View File

@@ -1,26 +0,0 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
//go:generate cmake --build gguf/build/cuda --target server --config Release
//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner

View File

@@ -1,24 +0,0 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate cmd /c move gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate cmd /c move ggml\build\cuda\bin\Release\server.exe ggml\build\cuda\bin\Release\ollama-runner.exe
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cuda --target server --config Release
//go:generate cmd /c move gguf\build\cuda\bin\Release\server.exe gguf\build\cuda\bin\Release\ollama-runner.exe

View File

@@ -1,51 +0,0 @@
From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:12 -0400
Subject: [PATCH] add detokenize endpoint
---
examples/server/server.cpp | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9966045..5014691 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{"tokens", tokens}};
}
+static json format_detokenized_response(std::string content)
+{
+ return json{
+ {"content", content}};
+}
+
static void parse_options_completion(const json &body, llama_server_context &llama)
{
gpt_params default_params;
@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json"); });
+ svr.Post("/detokenize", [&llama](const Request &req, Response &res)
+ {
+ auto lock = llama.lock();
+
+ const json body = json::parse(req.body);
+ std::string content;
+ if (body.count("tokens") != 0)
+ {
+ const std::vector<llama_token> tokens = body["tokens"];
+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
+ }
+
+ const json data = format_detokenized_response(content);
+ return res.set_content(data.dump(), "application/json"); });
+
svr.Post("/embedding", [&llama](const Request &req, Response &res)
{
auto lock = llama.lock();
--
2.39.2 (Apple Git-143)

View File

@@ -1,27 +0,0 @@
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries
---
CMakeLists.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
--
2.42.0

View File

@@ -1,25 +0,0 @@
From 6465fec6290f0a7f5d4d0fbe6bcf634e4810dde6 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 23 Oct 2023 10:39:34 -0700
Subject: [PATCH] default log stderr
---
common/log.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/log.h b/common/log.h
index b8953fd..25522cd 100644
--- a/common/log.h
+++ b/common/log.h
@@ -90,7 +90,7 @@
// }
//
#ifndef LOG_TARGET
- #define LOG_TARGET log_handler()
+ #define LOG_TARGET nullptr
#endif
#ifndef LOG_TEE_TARGET
--
2.42.0

View File

@@ -1,89 +0,0 @@
From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:53 -0400
Subject: [PATCH] 34B model support
---
llama.cpp | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/llama.cpp b/llama.cpp
index f2cbe76..62c5cdf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -79,6 +79,7 @@ enum e_model {
MODEL_7B,
MODEL_13B,
MODEL_30B,
+ MODEL_34B,
MODEL_65B,
MODEL_70B,
};
@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
+ { MODEL_34B, ((size_t) n_ctx / 9ull + 160ull) * MB },
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
};
@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{ MODEL_7B, 160ull * MB },
{ MODEL_13B, 192ull * MB },
{ MODEL_30B, 256ull * MB },
+ { MODEL_34B, 256ull * MB },
{ MODEL_65B, 384ull * MB }, // guess
{ MODEL_70B, 304ull * MB },
};
@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{ MODEL_7B, 10ull * MB },
{ MODEL_13B, 12ull * MB },
{ MODEL_30B, 16ull * MB },
+ { MODEL_34B, 16ull * MB },
{ MODEL_65B, 24ull * MB }, // guess
{ MODEL_70B, 24ull * MB },
};
@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
+ { MODEL_34B, 768ull * kB },
{ MODEL_65B, 1280ull * kB },
{ MODEL_70B, 1280ull * kB },
};
@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
+ { MODEL_34B, 208ull },
{ MODEL_65B, 256ull },
{ MODEL_70B, 256ull },
};
@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_7B: return "7B";
case MODEL_13B: return "13B";
case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
case MODEL_65B: return "65B";
case MODEL_70B: return "70B";
default: LLAMA_ASSERT(false);
@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
case 26: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_34B; break;
case 60: model.type = e_model::MODEL_30B; break;
case 80: model.type = e_model::MODEL_65B; break;
default:
@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
model.type = e_model::MODEL_70B;
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+ } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
+ hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
}
hparams.rope_freq_base = rope_freq_base;
--
2.39.2 (Apple Git-143)

View File

@@ -1,30 +0,0 @@
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Mon, 21 Aug 2023 06:59:29 -0400
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
kernel (#2686)
---
ggml-metal.metal | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..88d48f6 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
+ threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_device);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
--
2.41.0

Some files were not shown because too many files have changed in this diff Show More