Compare commits

..

118 Commits

Author SHA1 Message Date
Patrick Devine
cb576a6b23 fix ref 2024-08-26 19:59:33 -07:00
Patrick Devine
15b7ff3a89 more comments 2024-08-26 19:56:45 -07:00
Patrick Devine
3ad243466b comments 2024-08-26 19:54:06 -07:00
Patrick Devine
a13e583c49 cleanup whitespace 2024-08-26 18:09:21 -07:00
Patrick Devine
3c1994d0ee small change 2024-08-26 18:07:59 -07:00
Patrick Devine
1b2da3829d update the import docs 2024-08-26 18:04:46 -07:00
Daniel Hiltgen
0f92b19bec Only enable numa on CPUs (#6484)
The numa flag may be having a performance impact on multi-socket systems with GPU loads
2024-08-24 17:24:50 -07:00
Daniel Hiltgen
69be940bf6 gpu: Group GPU Library sets by variant (#6483)
The recent cuda variant changes uncovered a bug in ByLibrary
which failed to group by common variant for GPU types.
2024-08-23 15:11:56 -07:00
Michael Yang
9638c24c58 Merge pull request #5446 from ollama/mxyng/faq
update faq
2024-08-23 14:05:59 -07:00
Michael Yang
bb362caf88 update faq 2024-08-23 13:37:21 -07:00
Patrick Devine
0c819e167b convert safetensor adapters into GGUF (#6327) 2024-08-23 11:29:56 -07:00
Daniel Hiltgen
7a1e1c1caf gpu: Ensure driver version set before variant (#6480)
During rebasing, the ordering was inverted causing the cuda version
selection logic to break, with driver version being evaluated as zero
incorrectly causing a downgrade to v11.
2024-08-23 11:21:12 -07:00
Daniel Hiltgen
0b03b9c32f llm: Align cmake define for cuda no peer copy (#6455)
Define changed recently and this slipped through the cracks with the old
name.
2024-08-23 11:20:39 -07:00
Daniel Hiltgen
90ca84172c Fix embeddings memory corruption (#6467)
* Fix embeddings memory corruption

The patch was leading to a buffer overrun corruption.  Once removed though, parallelism
in server.cpp led to hitting an assert due to slot/seq IDs being >= token count.  To
work around this, only use slot 0 for embeddings.

* Fix embed integration test assumption

The token eval count has changed with recent llama.cpp bumps (0.3.5+)
2024-08-22 14:51:42 -07:00
Michael Yang
6bd8a4b0a1 Merge pull request #6064 from ollama/mxyng/convert-llama3
convert: update llama conversion for llama3.1
2024-08-21 12:57:09 -07:00
Michael Yang
77903ab8b4 llama3.1 2024-08-21 11:49:31 -07:00
Michael Yang
e22286c9e1 Merge pull request #5365 from ollama/mxyng/convert-gemma2
convert gemma2
2024-08-21 11:48:43 -07:00
Michael Yang
107f695929 Merge pull request #4917 from ollama/mxyng/convert-bert
convert bert model from safetensors
2024-08-21 11:48:29 -07:00
Michael Yang
4ecc70d3b4 Merge pull request #6386 from zwwhdls/fix-new-layer
fix: chmod new layer to 0o644 when creating it
2024-08-21 10:58:45 -07:00
Michael Yang
3546bbd08c convert gemma2 2024-08-20 17:27:51 -07:00
Michael Yang
beb49eef65 create bert models from cli 2024-08-20 17:27:34 -07:00
Michael Yang
5a28b9cf5f bert 2024-08-20 17:27:34 -07:00
Daniel Hiltgen
a017cf2fea Split rocm back out of bundle (#6432)
We're over budget for github's maximum release artifact size with rocm + 2 cuda
versions.  This splits rocm back out as a discrete artifact, but keeps the layout so it can
be extracted into the same location as the main bundle.
2024-08-20 07:26:38 -07:00
Daniel Hiltgen
19e5a890f7 CI: remove directories from dist dir before upload step (#6429) 2024-08-19 15:19:21 -07:00
Daniel Hiltgen
f91c9e3709 CI: handle directories during checksum (#6427) 2024-08-19 13:48:45 -07:00
Daniel Hiltgen
2df6905ede Merge pull request #6424 from dhiltgen/cuda_v12
Fix overlapping artifact name on CI
2024-08-19 12:11:58 -07:00
Daniel Hiltgen
d8be22e47d Fix overlapping artifact name on CI 2024-08-19 12:07:18 -07:00
Daniel Hiltgen
652c273f0e Merge pull request #5049 from dhiltgen/cuda_v12
Cuda v12
2024-08-19 11:14:24 -07:00
Daniel Hiltgen
88e7705079 Merge pull request #6402 from rick-github/numParallel
Override numParallel in pickBestPartialFitByLibrary() only if unset.
2024-08-19 11:07:22 -07:00
Daniel Hiltgen
f9e31da946 Review comments 2024-08-19 10:36:15 -07:00
Daniel Hiltgen
88bb9e3328 Adjust layout to bin+lib/ollama 2024-08-19 09:38:53 -07:00
Daniel Hiltgen
3b19cdba2a Remove Jetpack 2024-08-19 09:38:53 -07:00
Daniel Hiltgen
927d98a6cd Add windows cuda v12 + v11 support 2024-08-19 09:38:53 -07:00
Daniel Hiltgen
f6c811b320 Enable cuda v12 flags 2024-08-19 09:38:53 -07:00
Daniel Hiltgen
4fe3a556fa Add cuda v12 variant and selection logic
Based on compute capability and driver version, pick
v12 or v11 cuda variants.
2024-08-19 09:38:53 -07:00
Daniel Hiltgen
fc3b4cda89 Report GPU variant in log 2024-08-19 09:38:53 -07:00
Daniel Hiltgen
d470ebe78b Add Jetson cuda variants for arm
This adds new variants for arm64 specific to Jetson platforms
2024-08-19 09:38:53 -07:00
Daniel Hiltgen
c7bcb00319 Wire up ccache and pigz in the docker based build
This should help speed things up a little
2024-08-19 09:38:53 -07:00
Daniel Hiltgen
74d45f0102 Refactor linux packaging
This adjusts linux to follow a similar model to windows with a discrete archive
(zip/tgz) to carry the primary executable, and dependent libraries. Runners are
still carried as payloads inside the main binary

Darwin retains the payload model where the go binary is fully self-contained.
2024-08-19 09:38:53 -07:00
Jeffrey Morgan
9fddef3731 server: limit upload parts to 16 (#6411) 2024-08-19 09:20:52 -07:00
Richard Lyons
885cf45087 Fix white space. 2024-08-18 03:07:16 +02:00
Richard Lyons
9352eeb752 Reset NumCtx. 2024-08-18 02:55:01 +02:00
Richard Lyons
0ad0e738cd Override numParallel only if unset. 2024-08-18 01:43:26 +02:00
zwwhdls
bdc4308afb fix: chmod new layer to 0o644 when creating it
Signed-off-by: zwwhdls <zww@hdls.me>
2024-08-16 11:43:19 +08:00
Daniel Hiltgen
d29cd4c2ed Merge pull request #6381 from eust-w/main
fix: Add tooltip to system tray icon
2024-08-15 15:31:15 -07:00
eust-w
a84c05cf91 fix: Add tooltip to system tray icon
- Updated setIcon method to include tooltip text for the system tray icon.
- Added NIF_TIP flag and set the tooltip text using UTF16 encoding.

Resolves: #6372
2024-08-16 06:00:12 +08:00
Michael Yang
e3d7f32af7 Merge pull request #6363 from ollama/mxyng/fix-noprune
fix: noprune on pull
2024-08-15 12:20:38 -07:00
Michael Yang
3a75e74e34 only skip invalid json manifests 2024-08-15 10:29:14 -07:00
Michael Yang
237dccba1e skip invalid manifest files 2024-08-14 16:55:45 -07:00
Michael Yang
b3f75fc812 fix noprune 2024-08-14 15:48:51 -07:00
Jeffrey Morgan
8200c371ae add CONTRIBUTING.md (#6349) 2024-08-14 15:19:50 -07:00
longtao
0a8d6ea86d Fix typo and improve readability (#5964)
* Fix typo and improve readability

Summary:
* Rename updatAvailableMenuID to updateAvailableMenuID
* Replace unused cmd parameter with _ in RunServer function
* Fix typos in comments

(cherry picked from commit 5b8715f0b04773369e8eb1f9e6737995a0ab3ba7)

* Update api/client.go

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

---------

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
2024-08-13 17:54:19 -07:00
Blake Mizerany
8e1050f366 server: reduce max connections used in download (#6347)
The previous value of 64 was WAY too high and unnecessary. It reached
diminishing returns and blew past it. This is a more reasonable number
for _most_ normal cases. For users on cloud servers with excellent
network quality, this will keep screaming for them, without hitting our
CDN limits. For users with relatively poor network quality, this will
keep them from saturating their network and causing other issues.
2024-08-13 16:47:35 -07:00
Bruce MacDonald
eda8a32a09 update chatml template format to latest in docs (#6344) 2024-08-13 16:39:18 -07:00
Michael Yang
a0a40aa20c Merge pull request #6346 from ollama/mxyng/lint 2024-08-13 14:58:35 -07:00
Michael Yang
2697d7f5aa lint
- fixes printf: non-constant format string in call to fmt.Printf
- fixes SA1032: arguments have the wrong order
- disables testifylint
2024-08-13 14:36:33 -07:00
Pamela Fox
1f32276178 Update openai.md to remove extra checkbox (#6345) 2024-08-13 13:36:05 -07:00
Daniel Hiltgen
4c4fe3f87f Merge pull request #6343 from dhiltgen/revert_win_go_version
Go back to a pinned Go version
2024-08-13 11:53:49 -07:00
Daniel Hiltgen
feedf49c71 Go back to a pinned Go version
Go version 1.22.6 is triggering AV false positives, so go back to 1.22.5
2024-08-13 11:45:44 -07:00
royjhan
8b00a415ab Load Embedding Model on Empty Input (#6325)
* load on empty input

* no load on invalid input
2024-08-13 10:19:56 -07:00
Michael Yang
01b80e9ffc Merge pull request #5443 from ollama/mxyng/convert-phi3
add conversion for microsoft phi 3 mini/medium 4k, 128k
2024-08-12 15:47:58 -07:00
Michael Yang
bd5e432630 update import.md 2024-08-12 15:13:29 -07:00
Bruce MacDonald
aec77d6a05 support new "longrope" attention factor 2024-08-12 15:13:29 -07:00
Michael Yang
6ffb5cb017 add conversion for microsoft phi 3 mini/medium 4k, 128 2024-08-12 15:13:29 -07:00
Josh
f7e3b9190f cmd: spinner progress for transfer model data (#6100) 2024-08-12 11:46:32 -07:00
Josh
980dd15f81 cmd: speed up gguf creates (#6324) 2024-08-12 11:46:09 -07:00
royjhan
01d544d373 OpenAI: Simplify input output in testing (#5858)
* simplify input output

* direct comp

* in line image

* rm error pointer type

* update response testing

* lint
2024-08-12 10:33:34 -07:00
Josh
1dc3ef3aa9 Revert "server: speed up single gguf creates (#5898)" (#6323)
This reverts commit 8aac22438e.
2024-08-12 09:57:51 -07:00
Josh
8aac22438e server: speed up single gguf creates (#5898) 2024-08-12 09:28:55 -07:00
Jeffrey Morgan
15c2d8fe14 server: parallelize embeddings in API web handler instead of in subprocess runner (#6220)
For simplicity, perform parallelization of embedding requests in the API handler instead of offloading this to the subprocess runner. This keeps the scheduling story simpler as it builds on existing parallel requests, similar to existing text completion functionality.
2024-08-11 11:57:10 -07:00
Daniel Hiltgen
25906d72d1 llm: prevent loading too large models on windows (#5926)
Don't allow loading models that would lead to memory exhaustion (across vram, system memory and disk paging). This check was already applied on Linux but should be applied on Windows as well.
2024-08-11 11:30:20 -07:00
CognitiveTech
023451ce47 add integration obook-summary (#6305) 2024-08-10 18:43:08 -07:00
Jesse Gross
9b53e39d8e Merge pull request #6258 from coolljt0725/fix_typo
server/download.go: Fix a typo in log
2024-08-09 17:19:48 -07:00
Michael Yang
97fae2df95 Merge pull request #6235 from Nicholas42/fix_line_endings
Set *.png and *.ico to be treated as binary files.
2024-08-09 17:06:30 -07:00
Michael Yang
160d9d4900 Merge pull request #6171 from ollama/mxyng/remove-temp
removeall to remove non-empty temp dirs
2024-08-09 15:47:13 -07:00
Nicholas Schwab
d4e6407464 Restrict text files with explicit line feeds to *.go.
This partially reverts b732beba6a. It
seems like explicitly setting all files to use line feeds was done due
to issues with the go linter, hence it can be restricted to those files
(https://github.com/ollama/ollama/pull/6235#issuecomment-2278745953).
2024-08-09 23:14:13 +02:00
Daniel Hiltgen
b7f7d8cd15 Merge pull request #6291 from dhiltgen/no_sparse_fail
Don't hard fail on sparse setup error
2024-08-09 12:30:25 -07:00
Daniel Hiltgen
2fa1db4345 Don't hard fail on sparse setup error
It seems this can fail in some cases, but proceed
with the download anyway.
2024-08-09 12:16:19 -07:00
Daniel Hiltgen
71b0945fc6 Merge pull request #6290 from dhiltgen/intel_npe
Harden intel bootstrap for nil pointers
2024-08-09 12:14:42 -07:00
Daniel Hiltgen
5bca2e60a7 Harden intel bootstrap for nil pointers 2024-08-09 11:31:38 -07:00
Nicholas42
67472e0e89 Also flag *.icns as binary 2024-08-09 13:41:20 +02:00
Daniel Hiltgen
e9aa5117c4 Merge pull request #6133 from dhiltgen/cuda_repo
Adjust arm cuda repo paths
2024-08-08 12:33:35 -07:00
Daniel Hiltgen
2473bdba5e Merge pull request #6182 from dhiltgen/more_patterns
Catch one more error log
2024-08-08 12:33:17 -07:00
Jesse Gross
7d1c0047fa Merge pull request #6247 from ollama/jessegross/layers
Store layers inside manifests consistently as values.
2024-08-08 10:46:43 -07:00
Jitang Lei
7b61eba471 server/download.go: Fix a typo in log
Signed-off-by: Jitang Lei <leijitang@outlook.com>
2024-08-08 20:28:01 +08:00
Jesse Gross
7edaf6e7e8 manifest: Store layers inside manifests consistently as values.
Commit 1829fb61 ("manifest: Fix crash on startup when trying to clean up
unused files (#5840)") changed the config layer stored in manifests
from a pointer to a value. This was done in order to avoid potential
nil pointer dereferences after it is deserialized from JSON in the
event that the field is missing.

This changes the Layers slice to also be stored by value. This enables
consistency in handling across the two objects.
2024-08-07 17:03:06 -07:00
Jesse Gross
97ec8cfd4e image: Clarify argument to WriteManifest is config
When creating a model the config layer is appended to the list of
layers and then the last layer is used as the config when writing the
manifest. This change directly uses the config layer to write the
manifest. There is no behavior change but it is less error prone.
2024-08-07 16:58:42 -07:00
royjhan
5b3a21b578 add metrics to docs (#6079) 2024-08-07 14:43:44 -07:00
Kyle Kelley
ad0c19dde4 Use llama3.1 in tools example (#5985)
* Use llama3.1 in tools example

* Update api.md
2024-08-07 17:20:50 -04:00
Jesse Gross
69eb06c40e Merge pull request #6145 from ollama/jessegross/bug5840
Fix crash on startup when trying to clean up unused files (#5840)
2024-08-07 11:24:15 -07:00
Jesse Gross
1829fb61bd manifest: Fix crash on startup when trying to clean up unused files (#5840)
Currently if the config field is missing in the manifest file (or
corrupted), Ollama will crash when it tries to read it. This can
happen at startup or when pulling new models.

This data is mostly just used for showing model information so we
can be tolerant of it not being present - it is not required to
run the models. Besides avoiding crashing, this also gives us the
ability to restructure the config in the future by pulling it
into the main manifest file.
2024-08-07 10:30:44 -07:00
Nicholas Schwab
ce67706037 Set *.png and *.ico to be treated as binary files.
The change b732beba6 makes all files text files and sets lf as eol. This
will automatically change all files to have lf if they are touched by
git (e.g. via git status). This change cannot be stashed and makes it
hard to work with the repo (rebase and checkout don't really work). See
also #6183.

Here, we set the offending files (*.png and *.ico, but that might be
more in the future) to be treated as binary files and not be changed by
git.
2024-08-07 18:20:11 +02:00
Jesse Gross
685a53534b manifest: Don't prune layers if we can't open a manifest file
If there is an error when opening a manifest file (corrupted, permission denied, etc.)
then the referenced layers will not be included in the list of active
layers. This causes them to be deleted when pruning happens at startup
or a model is pulled.

In such a situation, we should prefer to preserve data in the hopes that
it can be recovered rather than being aggressive about deletion.
2024-08-06 23:11:19 -07:00
Jeffrey Morgan
de4fc29773 llm: reserve required number of slots for embeddings (#6219) 2024-08-06 23:20:49 -04:00
Jeffrey Morgan
e04c7012c2 update llama.cpp submodule to 1e6f6554 (#6208) 2024-08-06 15:11:45 -04:00
Chua Chee Seng
d4a7216c82 Fixed invalid option provided not displaying the invalid option name problem. (#6202) 2024-08-06 14:37:16 -04:00
Daniel Hiltgen
a4fdd03c3b Merge pull request #6207 from dhiltgen/sparse_win
Ensure sparse files on windows during download
2024-08-06 11:06:06 -07:00
Daniel Hiltgen
fc85f50a2b Ensure sparse files on windows during download
The file.Truncate call on windows will write the whole file
unless you set the sparse flag, leading to heavy I/O at the
beginning of download.  This should improve our
I/O behavior on windows and put less stress on the users disk.
2024-08-06 10:58:08 -07:00
royjhan
86b907f82a sort batch results (#6189) 2024-08-05 16:55:34 -07:00
Michael Yang
10d49bce70 Merge pull request #6190 from ollama/mxyng/fix-integration
fix concurrency test
2024-08-05 16:45:49 -07:00
Michael Yang
7ed367419e fix concurrency test 2024-08-05 16:36:16 -07:00
Daniel Hiltgen
50ee8b5f56 Merge pull request #6186 from dhiltgen/numa
Implement linux NUMA detection
2024-08-05 15:20:06 -07:00
Michael Yang
03bdac0595 Merge pull request #6146 from ollama/mxyng/testing
use testing tempdirs
2024-08-05 13:00:05 -07:00
Daniel Hiltgen
f457d63400 Implement linux NUMA detection
If the system has multiple numa nodes, enable numa support in llama.cpp
If we detect numactl in the path, use that, else use the basic "distribute" mode.
2024-08-05 12:56:20 -07:00
Daniel Hiltgen
04210aa6dd Catch one more error log 2024-08-05 09:28:07 -07:00
Michael Yang
43f9d92008 close pid file 2024-08-05 00:41:16 -07:00
Michael Yang
ed6c8bfe57 removeall to remove non-empty temp dirs 2024-08-05 00:41:16 -07:00
Michael Yang
39f2bc6bfc Merge pull request #6167 from ollama/mxyng/line-feed
line feed
2024-08-05 00:06:28 -07:00
frob
b73b0940ef Disable paging for journalctl (#6154)
Users using `journalctl` to get logs for issue logging sometimes don't realize that paging is causing information to be missed.
2024-08-05 00:10:53 -04:00
Michael Yang
6a07344786 line feed 2024-08-04 17:25:41 -07:00
sryu1
8b920f35a4 Add Gemma 2 2b (#6151) 2024-08-04 10:58:39 -04:00
Ivan Charapanau
4221e39867 Reference ollama integration with Harbor (#6147) 2024-08-02 17:03:46 -07:00
Michael Yang
a091fadfda use testing tempdirs 2024-08-02 16:04:06 -07:00
Michael Yang
77ccbf04dc Merge pull request #6128 from ollama/mxyng/lint
enable gofmt/gofumpt/goimports/tenv
2024-08-02 14:58:40 -07:00
royjhan
4addf6b587 Update OpenAI Compatibility Docs with /v1/completions (#5311)
* Update docs

* token bug corrected

* Update docs/openai.md

* Update docs/openai.md

* add suffix

* merge conflicts

* merge conflicts
2024-08-02 13:16:23 -07:00
royjhan
85c7f11170 Update docs (#5310) 2024-08-02 13:05:57 -07:00
Daniel Hiltgen
df3802a65f Adjust arm cuda repo paths
Ubuntu distros fail to install cuda drivers since aarch64 isn't valid
2024-08-01 17:22:25 -07:00
Michael Yang
b732beba6a lint 2024-08-01 17:06:06 -07:00
137 changed files with 3582 additions and 1530 deletions

2
.gitattributes vendored
View File

@@ -1 +1,3 @@
llm/ext_server/* linguist-vendored
* text=auto
*.go text eol=lf

View File

@@ -31,7 +31,7 @@ jobs:
security set-keychain-settings -lut 3600 build.keychain
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- name: Build Darwin
env:
@@ -87,7 +87,7 @@ jobs:
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- run: go get ./...
- run: |
@@ -141,7 +141,7 @@ jobs:
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- name: 'Install ROCm'
run: |
@@ -187,6 +187,13 @@ jobs:
generate-windows-cuda:
environment: release
runs-on: windows
strategy:
matrix:
cuda:
- version: "11"
url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
- version: "12"
url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
@@ -218,13 +225,13 @@ jobs:
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- name: 'Install CUDA'
- name: 'Install CUDA ${{ matrix.cuda.version }}'
run: |
$ErrorActionPreference = "Stop"
write-host "downloading CUDA Installer"
Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
write-host "Installing CUDA"
Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
write-host "Completed CUDA"
@@ -256,15 +263,16 @@ jobs:
cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
- uses: actions/upload-artifact@v4
with:
name: generate-windows-cuda
name: generate-windows-cuda-${{ matrix.cuda.version }}
path: |
llm/build/**/bin/*
dist/windows-amd64/**
- uses: actions/upload-artifact@v4
with:
name: windows-cuda-deps
name: windows-cuda-deps-${{ matrix.cuda.version }}
path: dist/deps/*
# Import the prior generation steps and build the final windows assets
build-windows:
environment: release
@@ -306,7 +314,7 @@ jobs:
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- run: go get
- uses: actions/download-artifact@v4
@@ -314,10 +322,16 @@ jobs:
name: generate-windows-cpu
- uses: actions/download-artifact@v4
with:
name: generate-windows-cuda
name: generate-windows-cuda-11
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps
name: generate-windows-cuda-12
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps-11
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps-12
- uses: actions/download-artifact@v4
with:
name: windows-rocm-deps
@@ -363,7 +377,6 @@ jobs:
- run: |
./scripts/build_linux.sh
./scripts/build_docker.sh
mv dist/deps/* dist/
- uses: actions/upload-artifact@v4
with:
name: dist-linux-amd64
@@ -459,7 +472,10 @@ jobs:
merge-multiple: true
- run: |
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
(cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
mv sha256sum.txt dist/
mv dist/linux-???64 .
mv dist/linux-amd64-rocm .
cat dist/sha256sum.txt
- name: Create or update Release
run: |

View File

@@ -63,7 +63,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- run: go get ./...
- run: |
@@ -163,7 +163,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- name: 'Install ROCm'
run: |
@@ -200,7 +200,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- name: 'Install CUDA'
run: |
@@ -255,7 +255,7 @@ jobs:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: false
- run: |
case ${{ matrix.arch }} in
@@ -273,7 +273,7 @@ jobs:
if: ${{ startsWith(matrix.os, 'macos-') }}
- uses: golangci/golangci-lint-action@v6
with:
args: --timeout 8m0s -v ${{ startsWith(matrix.os, 'windows-') && '' || '--disable gofmt --disable goimports' }}
args: --timeout 8m0s -v
test:
strategy:
matrix:
@@ -297,7 +297,7 @@ jobs:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: "stable"
go-version-file: go.mod
cache: true
- run: |
case ${{ matrix.arch }} in

View File

@@ -7,22 +7,31 @@ linters:
- bodyclose
- containedctx
- contextcheck
- errcheck
- exportloopref
- gci
- gocheckcompilerdirectives
# conditionally enable this on linux/macos
# - gofmt
# - goimports
- gofmt
- gofumpt
- gosimple
- govet
- ineffassign
- intrange
- makezero
- misspell
- nilerr
- nolintlint
- nosprintfhostport
- testifylint
- staticcheck
- tenv
- unconvert
- unused
- usestdlibvars
- wastedassign
- whitespace
- usestdlibvars
linters-settings:
gci:
sections: [standard, default, localmodule]
severity:
default-severity: error
rules:

37
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,37 @@
# Contributing to Ollama
Thank you for your interest in contributing to Ollama! Here are a few guidelines to help get you started.
## Set up
See the [development documentation](./docs/development.md) for instructions on how to build and run Ollama locally.
## Pull requests
### Ideal issues
* [Bugs](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Abug): issues where Ollama stops working or where it results in an unexpected error.
* [Performance](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Aperformance): issues to make Ollama faster at model inference, downloading or uploading.
* [Security](https://github.com/ollama/ollama/blob/main/SECURITY.md): issues that could lead to a security vulnerability. As mentioned in [SECURITY.md](https://github.com/ollama/ollama/blob/main/SECURITY.md), please do not disclose security vulnerabilities publicly.
### Issues that are harder to review
* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
* Documentation: small updates to fill in or correct missing documentation are helpful, however large documentation additions can be hard to maintain over time.
### Issues that may not be accepted
* Changes that break backwards compatibility in Ollama's API (including the OpenAI-compatible API)
* Changes that add significant friction to the user experience
* Changes that create a large future maintenance burden for maintainers and contributors
### Best practices
* Commit messages: please leave both a title and a description in your commit messages. The title should be a short summary of the changes, with a leading word that explains the section of the code being changed (e.g. `api: fix parsing of prompt field`). In the description, leave 2-3 short sentences that explain more about the change and its impact.
* Tests: please add test coverage to changes where possible.
* Minimize dependencies: avoid adding new dependencies unless absolutely necessary.
## Need help?
If you need help with anything, feel free to reach out to us on our [Discord server](https://discord.gg/ollama).

View File

@@ -1,7 +1,9 @@
ARG GOLANG_VERSION=1.22.5
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
ARG CUDA_VERSION_11=11.3.1
ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
ARG CUDA_VERSION_12=12.4.0
ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
ARG ROCM_VERSION=6.1.2
# Copy the minimal context we need to run the generate scripts
@@ -10,7 +12,7 @@ COPY .git .git
COPY .gitmodules .gitmodules
COPY llm llm
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -18,9 +20,34 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
ARG CUDA_V11_ARCHITECTURES
ENV GOARCH amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
CUDA_VARIANT="_v11" \
bash gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG CUDA_V12_ARCHITECTURES
ENV GOARCH amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
CUDA_VARIANT="_v12" \
OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
bash gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -28,7 +55,32 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
ARG CUDA_V11_ARCHITECTURES
ENV GOARCH arm64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
CUDA_VARIANT="_v11" \
bash gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG CUDA_V12_ARCHITECTURES
ENV GOARCH arm64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
CUDA_VARIANT="_v12" \
OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
bash gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
ARG CMAKE_VERSION
@@ -40,15 +92,11 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN mkdir /tmp/scratch && \
for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
cp ${dep} /tmp/scratch/ || exit 1 ; \
done && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
ENV GOARCH amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - )
FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
@@ -59,16 +107,21 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
ENV GOARCH amd64
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_CPU_TARGET="static" bash gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh
FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
ARG CMAKE_VERSION
@@ -79,12 +132,15 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
ENV GOARCH arm64
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_CPU_TARGET="static" bash gen_linux.sh
FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh
@@ -95,12 +151,16 @@ COPY . .
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-amd64/bin/ollama .
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -109,23 +169,36 @@ ARG GOLANG_VERSION
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-arm64/bin/ollama .
# Strip out ROCm dependencies to keep the primary image lean
FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa*
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
COPY --from=amd64-libs-without-rocm /scratch/ /lib/
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
RUN ln -s /opt/rocm/lib /lib/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0

View File

@@ -54,6 +54,7 @@ Here are some example models that can be downloaded:
| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
| Gemma 2 | 2B | 1.6GB | `ollama run gemma2:2b` |
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
@@ -300,6 +301,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
### Terminal
@@ -323,6 +325,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [tlm](https://github.com/yusufcanb/tlm)
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
- [gollama](https://github.com/sammcj/gollama)
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
### Database

View File

@@ -18,6 +18,7 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
@@ -172,7 +173,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
}
if errorResponse.Error != "" {
return fmt.Errorf(errorResponse.Error)
return errors.New(errorResponse.Error)
}
if response.StatusCode >= http.StatusBadRequest {
@@ -297,7 +298,7 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
return &lr, nil
}
// List running models.
// ListRunning lists running models.
func (c *Client) ListRunning(ctx context.Context) (*ProcessResponse, error) {
var lr ProcessResponse
if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil {
@@ -332,7 +333,7 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err
return &resp, nil
}
// Hearbeat checks if the server has started and is responsive; if yes, it
// Heartbeat checks if the server has started and is responsive; if yes, it
// returns nil, otherwise an error.
func (c *Client) Heartbeat(ctx context.Context) error {
if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {

View File

@@ -231,7 +231,6 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
@@ -505,7 +504,7 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
for key, val := range m {
opt, ok := jsonOpts[key]
if !ok {
slog.Warn("invalid option provided", "option", opt.Name)
slog.Warn("invalid option provided", "option", key)
continue
}
@@ -615,7 +614,6 @@ func DefaultOptions() Options {
F16KV: true,
UseMLock: false,
UseMMap: nil,
UseNUMA: false,
},
}
}

View File

@@ -2,7 +2,7 @@ package api
import (
"encoding/json"
"fmt"
"errors"
"math"
"testing"
"time"
@@ -192,7 +192,7 @@ func TestUseMmapFormatParams(t *testing.T) {
"use_mmap": {"foo"},
},
exp: nil,
err: fmt.Errorf("invalid bool value [foo]"),
err: errors.New("invalid bool value [foo]"),
},
}

View File

@@ -2,8 +2,8 @@
package lifecycle
import "fmt"
import "errors"
func GetStarted() error {
return fmt.Errorf("GetStarted not implemented")
return errors.New("not implemented")
}

View File

@@ -34,7 +34,6 @@ func GetStarted() error {
Sys: &syscall.SysProcAttr{CreationFlags: CREATE_NEW_CONSOLE, HideWindow: false},
}
proc, err := os.StartProcess(args[0], args, attrs)
if err != nil {
return fmt.Errorf("unable to start getting started shell %w", err)
}

View File

@@ -27,7 +27,7 @@ func InitLogging() {
// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
} else {
rotateLogs(AppLogFile)
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o755)
if err != nil {
slog.Error(fmt.Sprintf("failed to create server log %v", err))
return

View File

@@ -5,5 +5,5 @@ package lifecycle
import "log/slog"
func ShowLogs() {
slog.Warn("ShowLogs not yet implemented")
slog.Warn("not implemented")
}

View File

@@ -17,7 +17,7 @@ func TestRotateLogs(t *testing.T) {
// No log exists
rotateLogs(logFile)
require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
require.NoError(t, os.WriteFile(logFile, []byte("1"), 0o644))
assert.FileExists(t, logFile)
// First rotation
rotateLogs(logFile)
@@ -32,7 +32,7 @@ func TestRotateLogs(t *testing.T) {
assert.NoFileExists(t, logFile)
for i := 2; i <= LogRotationCount+1; i++ {
require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0o644))
assert.FileExists(t, logFile)
rotateLogs(logFile)
assert.NoFileExists(t, logFile)

View File

@@ -55,7 +55,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
}
rotateLogs(ServerLogFile)
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o755)
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)
}

View File

@@ -15,6 +15,7 @@ import (
"path"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
@@ -46,7 +47,7 @@ func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
query.Add("os", runtime.GOOS)
query.Add("arch", runtime.GOARCH)
query.Add("version", version.Version)
query.Add("ts", fmt.Sprintf("%d", time.Now().Unix()))
query.Add("ts", strconv.FormatInt(time.Now().Unix(), 10))
nonce, err := auth.NewNonce(rand.Reader, 16)
if err != nil {

View File

@@ -4,9 +4,9 @@ package lifecycle
import (
"context"
"fmt"
"errors"
)
func DoUpgrade(cancel context.CancelFunc, done chan int) error {
return fmt.Errorf("DoUpgrade not yet implemented")
return errors.New("not implemented")
}

View File

@@ -2,6 +2,7 @@ package lifecycle
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
@@ -15,7 +16,7 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
return fmt.Errorf("failed to lookup downloads: %s", err)
}
if len(files) == 0 {
return fmt.Errorf("no update downloads found")
return errors.New("no update downloads found")
} else if len(files) > 1 {
// Shouldn't happen
slog.Warn(fmt.Sprintf("multiple downloads found, using first one %v", files))
@@ -64,7 +65,7 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
}
} else {
// TODO - some details about why it didn't start, or is this a pedantic error case?
return fmt.Errorf("installer process did not start")
return errors.New("installer process did not start")
}
// TODO should we linger for a moment and check to make sure it's actually running by checking the pid?

View File

@@ -87,20 +87,11 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\cuda")
Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\oneapi")
Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
#endif
Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs
[Icons]
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
@@ -108,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
[Run]
Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
[UninstallRun]
; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
@@ -143,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
[Registry]
Root: HKCU; Subkey: "Environment"; \
ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
Check: NeedsAddPath('{app}')
ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
Check: NeedsAddPath('{app}\bin')
[Code]

View File

@@ -3,11 +3,11 @@
package tray
import (
"fmt"
"errors"
"github.com/ollama/ollama/app/tray/commontray"
)
func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {
return nil, fmt.Errorf("NOT IMPLEMENTED YET")
return nil, errors.New("not implemented")
}

View File

@@ -11,9 +11,7 @@ import (
"golang.org/x/sys/windows"
)
var (
quitOnce sync.Once
)
var quitOnce sync.Once
func (t *winTray) Run() {
nativeLoop()

View File

@@ -11,12 +11,12 @@ import (
)
const (
updatAvailableMenuID = 1
updateMenuID = updatAvailableMenuID + 1
separatorMenuID = updateMenuID + 1
diagLogsMenuID = separatorMenuID + 1
diagSeparatorMenuID = diagLogsMenuID + 1
quitMenuID = diagSeparatorMenuID + 1
updateAvailableMenuID = 1
updateMenuID = updateAvailableMenuID + 1
separatorMenuID = updateMenuID + 1
diagLogsMenuID = separatorMenuID + 1
diagSeparatorMenuID = diagLogsMenuID + 1
quitMenuID = diagSeparatorMenuID + 1
)
func (t *winTray) initMenus() error {
@@ -35,7 +35,7 @@ func (t *winTray) initMenus() error {
func (t *winTray) UpdateAvailable(ver string) error {
if !t.updateNotified {
slog.Debug("updating menu and sending notification for new update")
if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {

View File

@@ -11,10 +11,12 @@ import (
"path/filepath"
"sort"
"sync"
"syscall"
"unsafe"
"github.com/ollama/ollama/app/tray/commontray"
"golang.org/x/sys/windows"
"github.com/ollama/ollama/app/tray/commontray"
)
// Helpful sources: https://github.com/golang/exp/blob/master/shiny/driver/internal/win32
@@ -414,7 +416,7 @@ func iconBytesToFilePath(iconBytes []byte) (string, error) {
iconFilePath := filepath.Join(os.TempDir(), "ollama_temp_icon_"+dataHash)
if _, err := os.Stat(iconFilePath); os.IsNotExist(err) {
if err := os.WriteFile(iconFilePath, iconBytes, 0644); err != nil {
if err := os.WriteFile(iconFilePath, iconBytes, 0o644); err != nil {
return "", err
}
}
@@ -432,7 +434,12 @@ func (t *winTray) setIcon(src string) error {
t.muNID.Lock()
defer t.muNID.Unlock()
t.nid.Icon = h
t.nid.Flags |= NIF_ICON
t.nid.Flags |= NIF_ICON | NIF_TIP
if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
copy(t.nid.Tip[:], toolTipUTF16)
} else {
return err
}
t.nid.Size = uint32(unsafe.Sizeof(*t.nid))
return t.nid.modify()

View File

@@ -61,6 +61,7 @@ const (
MIIM_SUBMENU = 0x00000004
MIM_APPLYTOSUBMENUS = 0x80000000
NIF_ICON = 0x00000002
NIF_TIP = 0x00000004
NIF_INFO = 0x00000010
NIF_MESSAGE = 0x00000001
SW_HIDE = 0

View File

@@ -5,6 +5,7 @@ import (
"context"
"crypto/rand"
"encoding/base64"
"errors"
"fmt"
"io"
"log/slog"
@@ -78,7 +79,7 @@ func Sign(ctx context.Context, bts []byte) (string, error) {
publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())
parts := bytes.Split(publicKey, []byte(" "))
if len(parts) < 2 {
return "", fmt.Errorf("malformed public key")
return "", errors.New("malformed public key")
}
signedData, err := privateKey.Sign(rand.Reader, bts)

View File

@@ -22,6 +22,7 @@ import (
"runtime"
"slices"
"strings"
"sync/atomic"
"syscall"
"time"
@@ -78,6 +79,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
status := "transferring model data"
spinner := progress.NewSpinner(status)
p.Add(status, spinner)
defer p.Stop()
for i := range modelfile.Commands {
switch modelfile.Commands[i].Name {
@@ -112,7 +114,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
path = tempfile
}
digest, err := createBlob(cmd, client, path)
digest, err := createBlob(cmd, client, path, spinner)
if err != nil {
return err
}
@@ -202,6 +204,12 @@ func tempZipFiles(path string) (string, error) {
// safetensors files might be unresolved git lfs references; skip if they are
// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
files = append(files, st...)
} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
// covers adapters.safetensors
files = append(files, st...)
} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
// covers adapter_model.safetensors
files = append(files, st...)
} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
// pytorch files might also be unresolved git lfs references; skip if they are
// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
@@ -221,6 +229,14 @@ func tempZipFiles(path string) (string, error) {
}
files = append(files, js...)
// bert models require a nested config.json
// TODO(mxyng): merge this with the glob above
js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
if err != nil {
return "", err
}
files = append(files, js...)
if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
// tokenizer.model might be an unresolved git lfs reference; error if it is
@@ -250,6 +266,11 @@ func tempZipFiles(path string) (string, error) {
return "", err
}
zfi.Name, err = filepath.Rel(path, file)
if err != nil {
return "", err
}
zf, err := zipfile.CreateHeader(zfi)
if err != nil {
return "", err
@@ -263,13 +284,20 @@ func tempZipFiles(path string) (string, error) {
return tempfile.Name(), nil
}
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
func createBlob(cmd *cobra.Command, client *api.Client, path string, spinner *progress.Spinner) (string, error) {
bin, err := os.Open(path)
if err != nil {
return "", err
}
defer bin.Close()
// Get file info to retrieve the size
fileInfo, err := bin.Stat()
if err != nil {
return "", err
}
fileSize := fileInfo.Size()
hash := sha256.New()
if _, err := io.Copy(hash, bin); err != nil {
return "", err
@@ -279,13 +307,43 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
return "", err
}
var pw progressWriter
status := "transferring model data 0%"
spinner.SetMessage(status)
done := make(chan struct{})
defer close(done)
// Report upload progress on the spinner until done is closed.
go func() {
	ticker := time.NewTicker(60 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// An empty file has no meaningful percentage and dividing by
			// its zero size would panic, so leave the message untouched.
			if fileSize > 0 {
				spinner.SetMessage(fmt.Sprintf("transferring model data %d%%", int(100*pw.n.Load()/fileSize)))
			}
		case <-done:
			spinner.SetMessage("transferring model data 100%")
			return
		}
	}
}()
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
return "", err
}
return digest, nil
}
// progressWriter is a writer that discards its input and only keeps a running
// total of how many bytes have been handed to it.
type progressWriter struct {
	n atomic.Int64 // total number of bytes seen so far
}

// Write adds len(p) to the running total. It never fails and always reports
// the whole slice as written.
func (w *progressWriter) Write(p []byte) (n int, err error) {
	n = len(p)
	w.n.Add(int64(n))
	return n, nil
}
func RunHandler(cmd *cobra.Command, args []string) error {
interactive := true
@@ -1086,7 +1144,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
return nil
}
func RunServer(cmd *cobra.Command, _ []string) error {
func RunServer(_ *cobra.Command, _ []string) error {
if err := initializeKeypair(); err != nil {
return err
}
@@ -1160,7 +1218,7 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
return err
}
if err := startApp(cmd.Context(), client); err != nil {
return fmt.Errorf("could not connect to ollama app, is it running?")
return errors.New("could not connect to ollama app, is it running?")
}
}
return nil

View File

@@ -604,7 +604,7 @@ func getImageData(filePath string) ([]byte, error) {
// Check if the file size exceeds 100MB
var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
if info.Size() > maxSize {
return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
return nil, errors.New("file size exceeds maximum limit (100MB)")
}
buf = make([]byte, info.Size())

View File

@@ -2,7 +2,7 @@ package cmd
import (
"context"
"fmt"
"errors"
"os"
"os/exec"
"strings"
@@ -20,7 +20,7 @@ func startApp(ctx context.Context, client *api.Client) error {
return err
}
if !strings.Contains(link, "Ollama.app") {
return fmt.Errorf("could not find ollama app")
return errors.New("could not find ollama app")
}
path := strings.Split(link, "Ollama.app")
if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {

View File

@@ -4,11 +4,11 @@ package cmd
import (
"context"
"fmt"
"errors"
"github.com/ollama/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {
return fmt.Errorf("could not connect to ollama server, run 'ollama serve' to start it")
return errors.New("could not connect to ollama server, run 'ollama serve' to start it")
}

View File

@@ -31,7 +31,7 @@ func startApp(ctx context.Context, client *api.Client) error {
// Finally look in the path
appExe, err = exec.LookPath(AppName)
if err != nil {
return fmt.Errorf("could not locate ollama app")
return errors.New("could not locate ollama app")
}
}
}

View File

@@ -7,16 +7,27 @@ import (
"io"
"io/fs"
"log/slog"
"strings"
"github.com/ollama/ollama/llm"
)
type Parameters struct {
// ModelParameters holds the fields that ConvertModel decodes from config.json
// before it picks a model-specific converter.
type ModelParameters struct {
// Architectures lists the model architectures; ConvertModel switches on the first entry.
Architectures []string `json:"architectures"`
VocabSize uint32 `json:"vocab_size"`
}
func (Parameters) KV(t *Tokenizer) llm.KV {
// AdapterParameters holds the LoRA settings that ConvertAdapter decodes from
// adapter_config.json.
type AdapterParameters struct {
// Alpha is the top-level "lora_alpha"; KV falls back to it when LoraParameters.Alpha is zero.
Alpha uint32 `json:"lora_alpha"`
LoraLayers uint32 `json:"lora_layers"`
// LoraParameters is the nested "lora_parameters" object; its Alpha takes precedence in KV when non-zero.
LoraParameters struct {
Rank uint32 `json:"rank"`
Alpha float32 `json:"alpha"`
Scale float32 `json:"scale"`
} `json:"lora_parameters"`
}
func (ModelParameters) KV(t *Tokenizer) llm.KV {
kv := llm.KV{
"general.file_type": uint32(1),
"general.quantization_version": uint32(2),
@@ -27,6 +38,10 @@ func (Parameters) KV(t *Tokenizer) llm.KV {
"tokenizer.ggml.token_type": t.Vocabulary.Types,
}
if len(t.Merges) > 0 {
kv["tokenizer.ggml.merges"] = t.Merges
}
if t.Template != "" {
kv["tokenizer.chat_template"] = t.Template
}
@@ -39,40 +54,119 @@ func (Parameters) KV(t *Tokenizer) llm.KV {
return kv
}
func (Parameters) specialTokenTypes() []string {
// KV returns the key-values shared by every LoRA adapter. The alpha value is
// taken from the nested lora_parameters object when it is non-zero, and from
// the top-level lora_alpha field otherwise.
func (p AdapterParameters) KV() llm.KV {
	alpha := p.LoraParameters.Alpha
	if alpha == 0 {
		alpha = float32(p.Alpha)
	}

	return llm.KV{
		"adapter.lora.alpha": alpha,
		"adapter.type":       "lora",
		"general.file_type":  uint32(1),
		"general.type":       "adapter",
		"general.version":    "v0.2",
	}
}
// specialTokenTypes returns the default set of special token type names.
// ConvertModel passes the converter's list to parseTokenizer.
func (ModelParameters) specialTokenTypes() []string {
return []string{
"bos", "eos", "unk", "sep", "pad", "cls", "mask",
}
}
func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
// writeFile writes kv and ts to ws by delegating to llm.WriteGGUF.
func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts)
}
type Converter interface {
// writeFile writes kv and ts to ws by delegating to llm.WriteGGUF.
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts)
}
// ModelConverter is the set of operations ConvertModel needs from a
// model-specific converter.
type ModelConverter interface {
// KV maps parameters to LLM key-values
KV(*Tokenizer) llm.KV
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string
// tensorName returns the LLM tensor name for a specific input name
tensorName(string) string
// specialTokenTypes returns any special token types the model uses
specialTokenTypes() []string
// writeFile writes the model to the provided io.WriteSeeker
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
}
// moreParser is an optional interface for converters that need to read extra
// files from the model directory; ConvertModel calls parseMore when the
// selected converter implements it.
type moreParser interface {
parseMore(fs.FS) error
}
// AdapterConverter is the set of operations ConvertAdapter needs from an
// architecture-specific adapter converter.
type AdapterConverter interface {
// KV maps parameters to LLM key-values
KV(llm.KV) llm.KV
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string
// writeFile writes the adapter to the provided io.WriteSeeker
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
}
// ConvertAdapter reads adapter_config.json and the adapter tensors from fsys
// and writes the converted adapter to ws. The converter is selected by the
// "general.architecture" entry of baseKV; "llama" and "gemma2" are the only
// architectures handled here.
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
	raw, err := fs.ReadFile(fsys, "adapter_config.json")
	if err != nil {
		return err
	}

	// Decode into the generic parameters first so that a malformed config is
	// rejected before any tensor data is touched.
	var params AdapterParameters
	if err := json.Unmarshal(raw, &params); err != nil {
		return err
	}

	architecture, found := baseKV["general.architecture"]
	if !found {
		return errors.New("architecture not set for the base model")
	}

	var conv AdapterConverter
	switch architecture {
	case "llama":
		conv = &llamaAdapter{}
	case "gemma2":
		conv = &gemma2Adapter{}
	default:
		return errors.New("unsupported architecture")
	}

	replacer := strings.NewReplacer(conv.Replacements()...)
	tensors, err := parseTensors(fsys, replacer)
	if err != nil {
		return err
	}

	// Decode the same config again, this time into the architecture-specific converter.
	if err := json.Unmarshal(raw, conv); err != nil {
		return err
	}

	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(tensors))
}
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func Convert(fsys fs.FS, ws io.WriteSeeker) error {
func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
bts, err := fs.ReadFile(fsys, "config.json")
if err != nil {
return err
}
var p Parameters
var p ModelParameters
if err := json.Unmarshal(bts, &p); err != nil {
return err
}
@@ -81,14 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
return errors.New("unknown architecture")
}
var conv Converter
var conv ModelConverter
switch p.Architectures[0] {
case "LlamaForCausalLM", "MistralForCausalLM":
conv = &llama{}
conv = &llamaModel{}
case "MixtralForCausalLM":
conv = &mixtral{}
conv = &mixtralModel{}
case "GemmaForCausalLM":
conv = &gemma{}
conv = &gemmaModel{}
case "Gemma2ForCausalLM":
conv = &gemma2Model{}
case "Phi3ForCausalLM":
conv = &phi3Model{}
case "BertModel":
conv = &bertModel{}
default:
return errors.New("unsupported architecture")
}
@@ -97,6 +197,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
return err
}
if t, ok := conv.(moreParser); ok {
if err := t.parseMore(fsys); err != nil {
return err
}
}
t, err := parseTokenizer(fsys, conv.specialTokenTypes())
if err != nil {
return err
@@ -113,7 +219,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
}
ts, err := parseTensors(fsys)
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
if err != nil {
return err
}

174
convert/convert_bert.go Normal file
View File

@@ -0,0 +1,174 @@
package convert
import (
"cmp"
"encoding/json"
"io/fs"
"path/filepath"
"slices"
"strings"
"github.com/ollama/ollama/llm"
)
// bertModel carries the BERT hyperparameters used to build the model's
// key-values. Several settings are declared under more than one JSON
// spelling; KV uses the first non-zero value within each group.
type bertModel struct {
ModelParameters
// block count: first non-zero of n_layers, num_hidden_layers, n_layer
NLayers uint32 `json:"n_layers"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayer uint32 `json:"n_layer"`
// context length: first non-zero of max_position_embeddings, n_ctx
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
NCtx uint32 `json:"n_ctx"`
// embedding length: first non-zero of hidden_size, n_embd
HiddenSize uint32 `json:"hidden_size"`
NEmbd uint32 `json:"n_embd"`
// feed forward length: first non-zero of intermediate_size, n_inner
IntermediateSize uint32 `json:"intermediate_size"`
NInner uint32 `json:"n_inner"`
// attention head count: first non-zero of num_attention_heads, n_head
NumAttentionHeads uint32 `json:"num_attention_heads"`
NHead uint32 `json:"n_head"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
// layer norm epsilon: first non-zero of layer_norm_eps, layer_norm_epsilon, norm_epsilon
LayerNormEPS float32 `json:"layer_norm_eps"`
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
NormEpsilon float32 `json:"norm_epsilon"`
// PoolingType is set by parseMore: 1 when mean-token pooling is enabled, 2 when
// only CLS-token pooling is enabled; otherwise it keeps its zero value.
PoolingType uint32
}
// compile-time checks that *bertModel implements both interfaces
var (
_ ModelConverter = (*bertModel)(nil)
_ moreParser = (*bertModel)(nil)
)
// parseMore reads modules.json from fsys and, when it lists a
// sentence_transformers Pooling module, loads that module's config.json to
// decide the pooling type: 1 when mean-token pooling is enabled, otherwise 2
// when CLS-token pooling is enabled. PoolingType is left unchanged when no
// pooling module is listed or neither mode is enabled.
//
// It returns any error from reading or decoding either file.
func (p *bertModel) parseMore(fsys fs.FS) error {
	bts, err := fs.ReadFile(fsys, "modules.json")
	if err != nil {
		return err
	}

	var modules []struct {
		Type string `json:"type"`
		Path string `json:"path"`
	}

	if err := json.Unmarshal(bts, &modules); err != nil {
		return err
	}

	// only the first pooling module is considered
	var pooling string
	for _, m := range modules {
		if m.Type == "sentence_transformers.models.Pooling" {
			pooling = m.Path
			break
		}
	}

	if pooling == "" {
		return nil
	}

	// fs.FS names are slash-separated on every platform, whereas filepath.Join
	// produces backslashes on Windows; normalise before the lookup.
	name := filepath.ToSlash(filepath.Join(pooling, "config.json"))
	bts, err = fs.ReadFile(fsys, name)
	if err != nil {
		return err
	}

	var pc struct {
		PoolingModeCLSToken   bool `json:"pooling_mode_cls_token"`
		PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
	}

	if err := json.Unmarshal(bts, &pc); err != nil {
		return err
	}

	// mean pooling takes precedence when both modes are enabled
	switch {
	case pc.PoolingModeMeanTokens:
		p.PoolingType = 1
	case pc.PoolingModeCLSToken:
		p.PoolingType = 2
	}

	return nil
}
// KV builds the key-values for a BERT model on top of the common model
// key-values. Settings that may appear under several JSON spellings are
// resolved to the first non-zero value; apart from the block count, a setting
// is only emitted when the resolved value is greater than zero. The
// tokenizer's tokens are rewritten in place before being stored.
func (p *bertModel) KV(t *Tokenizer) llm.KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "bert"
	kv["bert.attention.causal"] = false
	kv["bert.pooling_type"] = p.PoolingType

	kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)

	if n := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); n > 0 {
		kv["bert.context_length"] = n
	}

	if n := cmp.Or(p.HiddenSize, p.NEmbd); n > 0 {
		kv["bert.embedding_length"] = n
	}

	if n := cmp.Or(p.IntermediateSize, p.NInner); n > 0 {
		kv["bert.feed_forward_length"] = n
	}

	if n := cmp.Or(p.NumAttentionHeads, p.NHead); n > 0 {
		kv["bert.attention.head_count"] = n
	}

	if eps := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); eps > 0 {
		kv["bert.attention.layer_norm_epsilon"] = eps
	}

	kv["tokenizer.ggml.model"] = "bert"
	kv["tokenizer.ggml.token_type_count"] = uint32(2)

	// convert to phantom space tokens
	for i, tok := range t.Tokens {
		switch {
		case strings.HasPrefix(tok, "[") && strings.HasSuffix(tok, "]"):
			// tokens wrapped in square brackets are kept verbatim
		case strings.HasPrefix(tok, "##"):
			// drop the leading "##" marker
			t.Tokens[i] = tok[2:]
		default:
			t.Tokens[i] = "\u2581" + tok
		}
	}

	kv["tokenizer.ggml.tokens"] = t.Tokens

	return kv
}
// Tensors maps the input tensors to LLM tensors one-to-one, preserving order
// and dropping the position-id and pooler tensors.
func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
	// names of the tensors that are left out of the output
	skipped := []string{
		"embeddings.position_ids",
		"pooler.dense.weight",
		"pooler.dense.bias",
	}

	var out []llm.Tensor
	for _, t := range ts {
		name := t.Name()
		if !slices.Contains(skipped, name) {
			out = append(out, llm.Tensor{
				Name:     name,
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		}
	}

	return out
}
// Replacements returns old/new string pairs, in the flat form accepted by
// strings.NewReplacer, that rename source tensor name fragments to their
// output names.
//
// Order matters: when two old strings match at the same position the pair
// listed first wins. "encoder.layers" must therefore precede its prefix
// "encoder.layer", otherwise "encoder.layers.0" would be rewritten to
// "blks.0" instead of "blk.0".
func (bertModel) Replacements() []string {
	return []string{
		"encoder.layers", "blk",
		"encoder.layer", "blk",
		"embeddings.word_embeddings", "token_embd",
		"embeddings.token_type_embeddings", "token_types",
		"embeddings.LayerNorm", "token_embd_norm",
		"embeddings.position_embeddings", "position_embd",
		"attention.self.query", "attn_q",
		"attention.self.key", "attn_k",
		"attention.self.value", "attn_v",
		"attention.output.dense", "attn_output",
		"attention.output.LayerNorm", "attn_output_norm",
		"intermediate.dense", "ffn_up",
		"output.dense", "ffn_down",
		"output.LayerNorm", "layer_output_norm",
	}
}

View File

@@ -9,8 +9,8 @@ import (
"github.com/ollama/ollama/llm"
)
type gemma struct {
Parameters
type gemmaModel struct {
ModelParameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
HiddenLayers uint32 `json:"num_hidden_layers"`
@@ -21,12 +21,11 @@ type gemma struct {
HeadDim uint32 `json:"head_dim"`
}
var _ Converter = (*gemma)(nil)
var _ ModelConverter = (*gemmaModel)(nil)
func (p *gemma) KV(t *Tokenizer) llm.KV {
kv := p.Parameters.KV(t)
func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma"
kv["general.name"] = "gemma"
kv["gemma.context_length"] = p.MaxPositionEmbeddings
kv["gemma.embedding_length"] = p.HiddenSize
kv["gemma.block_count"] = p.HiddenLayers
@@ -43,16 +42,15 @@ func (p *gemma) KV(t *Tokenizer) llm.KV {
return kv
}
func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
for _, t := range ts {
name := p.tensorName(t.Name())
if strings.HasSuffix(name, "_norm.weight") {
if strings.HasSuffix(t.Name(), "_norm.weight") {
t.SetRepacker(p.addOne)
}
out = append(out, llm.Tensor{
Name: name,
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
@@ -62,8 +60,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
return out
}
func (p *gemma) tensorName(n string) string {
return strings.NewReplacer(
func (p *gemmaModel) Replacements() []string {
return []string{
"model.embed_tokens", "token_embd",
"model.norm", "output_norm",
"model.layers", "blk",
@@ -76,11 +74,10 @@ func (p *gemma) tensorName(n string) string {
"mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm",
"block_sparse_moe.gate", "ffn_inp",
).Replace(n)
}
}
func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
ones := tensor.Ones(tensor.Float32, int(shape[0]))

43
convert/convert_gemma2.go Normal file
View File

@@ -0,0 +1,43 @@
package convert
import (
"github.com/ollama/ollama/llm"
)
// gemma2Model extends gemmaModel with the additional configuration values
// that its KV method writes under the "gemma2." prefix.
type gemma2Model struct {
gemmaModel
// Read from the JSON key "sliding_window".
SlidingWindow uint32 `json:"sliding_window"`
// Read from the JSON key "attn_logit_softcapping".
AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
// Read from the JSON key "final_logit_softcapping".
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
}
// KV builds the GGUF key/value metadata for a gemma2 model: the shared model
// parameters, the architecture hyperparameters under the "gemma2." prefix,
// and four fixed tokenizer token ids.
func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
	const (
		arch      = "gemma2"
		attention = arch + ".attention"
		tokenizer = "tokenizer.ggml"
	)

	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = arch

	// model dimensions
	kv[arch+".context_length"] = p.MaxPositionEmbeddings
	kv[arch+".embedding_length"] = p.HiddenSize
	kv[arch+".block_count"] = p.HiddenLayers
	kv[arch+".feed_forward_length"] = p.IntermediateSize

	// attention settings; key and value lengths both come from HeadDim
	kv[attention+".head_count"] = p.NumAttentionHeads
	kv[attention+".head_count_kv"] = p.NumKeyValueHeads
	kv[attention+".layer_norm_rms_epsilon"] = p.RMSNormEPS
	kv[attention+".key_length"] = p.HeadDim
	kv[attention+".value_length"] = p.HeadDim
	kv[attention+".sliding_window"] = p.SlidingWindow

	// logit soft-capping values
	kv[arch+".attn_logit_softcapping"] = p.AttentionLogitSoftcap
	kv[arch+".final_logit_softcapping"] = p.FinalLogitSoftcap

	// hard-coded special token ids
	kv[tokenizer+".eot_token_id"] = uint32(107)
	kv[tokenizer+".middle_token_id"] = uint32(68)
	kv[tokenizer+".prefix_token_id"] = uint32(67)
	kv[tokenizer+".suffix_token_id"] = uint32(69)

	return kv
}
// Replacements returns the old/new tensor name pairs for gemma2, in the flat
// form accepted by strings.NewReplacer.
//
// The gemma2-specific pairs are placed BEFORE the inherited gemma pairs.
// A replacer prefers the pair listed first when several old strings match at
// the same position, and the gemma list already maps
// "post_attention_layernorm" to "ffn_norm". Appending the gemma2 mapping
// after it would leave it dead and make "post_attention_layernorm" and
// "pre_feedforward_layernorm" both come out as "ffn_norm".
func (p *gemma2Model) Replacements() []string {
	overrides := []string{
		"post_attention_layernorm", "post_attention_norm",
		"pre_feedforward_layernorm", "ffn_norm",
		"post_feedforward_layernorm", "post_ffw_norm",
	}

	return append(overrides, p.gemmaModel.Replacements()...)
}

View File

@@ -0,0 +1,91 @@
package convert
import (
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
)
// gemma2Adapter is the adapter converter for the gemma2 architecture. It has
// no fields of its own beyond the embedded AdapterParameters.
type gemma2Adapter struct {
AdapterParameters
}
// Compile-time check that *gemma2Adapter satisfies AdapterConverter.
var _ AdapterConverter = (*gemma2Adapter)(nil)
// KV returns the adapter's key/value metadata: the shared adapter parameters
// plus the architecture name. The baseKV argument is accepted but not read.
func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
	metadata := p.AdapterParameters.KV()
	metadata["general.architecture"] = "gemma2"

	return metadata
}
// Tensors maps the parsed adapter tensors to output tensors.
//
// A "weight.lora_a" tensor whose first dimension is larger than its second,
// or a "weight.lora_b" tensor whose first dimension is smaller than its
// second, has its two shape entries swapped and gets p.repack installed as
// its repacker. All other tensors are passed through untouched.
func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
	var out []llm.Tensor
	for _, t := range ts {
		shape := t.Shape()
		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
			(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
			// The swap is done in place on the slice returned by Shape().
			shape[0], shape[1] = shape[1], shape[0]
			t.SetRepacker(p.repack)
		}

		out = append(out, llm.Tensor{
			Name: t.Name(),
			Kind: t.Kind(),
			// Emit the slice that was just (possibly) swapped, as
			// llamaAdapter.Tensors does, so the written shape does not
			// depend on a second Shape() call returning the same slice.
			Shape:    shape,
			WriterTo: t,
		})
	}

	return out
}
// Replacements returns the old/new name pairs for adapter tensors, flattened
// into the alternating form accepted by strings.NewReplacer. The order of the
// pairs is significant and is preserved.
func (p *gemma2Adapter) Replacements() []string {
	pairs := [][2]string{
		{"base_model.model.", ""},
		{"model.layers", "blk"},
		{"self_attn.q_proj", "attn_q"},
		{"self_attn.k_proj", "attn_k"},
		{"self_attn.v_proj", "attn_v"},
		{"self_attn.o_proj", "attn_output"},
		{"mlp.gate_proj", "ffn_gate"},
		{"mlp.down_proj", "ffn_down"},
		{"mlp.up_proj", "ffn_up"},
		{"lora_A.weight", "weight.lora_a"},
		{"lora_B.weight", "weight.lora_b"},
		{"lora_a", "weight.lora_a"},
		{"lora_b", "weight.lora_b"},
	}

	oldnew := make([]string, 0, 2*len(pairs))
	for _, pair := range pairs {
		oldnew = append(oldnew, pair[0], pair[1])
	}

	return oldnew
}
// repack is the repacker that Tensors installs on the LoRA tensors whose two
// shape entries it swapped. It pushes data through the T(1, 0) / Reshape /
// Transpose sequence below and returns the result flattened into a single
// []float32. The name argument is not used.
func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
// dims is shape with its two entries exchanged.
// NOTE(review): presumably shape is the already-swapped shape set by Tensors,
// which would make dims the layout of data as stored — confirm against the
// code that invokes the repacker.
dims := []int{int(shape[1]), int(shape[0])}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
// Request the (1, 0) axis permutation, reshape back to dims, then apply the
// pending transpose. The order of these three calls matters.
if err := n.T(1, 0); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
// SelectF32 yields a slice of []float32; concatenate them in order.
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}

View File

@@ -3,15 +3,17 @@ package convert
import (
"cmp"
"fmt"
"math"
"strings"
"github.com/ollama/ollama/llm"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
)
type llama struct {
Parameters
type llamaModel struct {
ModelParameters
NLayers uint32 `json:"n_layers"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayer uint32 `json:"n_layer"`
@@ -26,8 +28,14 @@ type llama struct {
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RopeTheta float32 `json:"rope_theta"`
RopeScaling struct {
Type string `json:"type"`
Factor float32 `json:"factor"`
Type string `json:"type"`
RopeType string `json:"rope_type"`
Factor float32 `json:"factor"`
LowFrequencyFactor float32 `json:"low_freq_factor"`
HighFrequencyFactor float32 `json:"high_freq_factor"`
OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"`
factors ropeFactor
} `json:"rope_scaling"`
RMSNormEPS float32 `json:"rms_norm_eps"`
LayerNormEPS float32 `json:"layer_norm_eps"`
@@ -36,12 +44,11 @@ type llama struct {
HeadDim uint32 `json:"head_dim"`
}
var _ Converter = (*llama)(nil)
var _ ModelConverter = (*llamaModel)(nil)
func (p *llama) KV(t *Tokenizer) llm.KV {
kv := p.Parameters.KV(t)
func (p *llamaModel) KV(t *Tokenizer) llm.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama"
kv["general.name"] = "llama"
kv["llama.vocab_size"] = p.VocabSize
kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@@ -70,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
if p.RopeScaling.Type == "linear" {
kv["llama.rope.scaling.type"] = p.RopeScaling.Type
kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
} else if p.RopeScaling.RopeType == "llama3" {
dim := p.HiddenSize / p.NumAttentionHeads
for i := uint32(0); i < dim; i += 2 {
factor := cmp.Or(p.RopeScaling.Factor, 8.0)
factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
lambdaLow := float32(original) / factorLow
lambdaHigh := float32(original) / factorHigh
lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
if lambda < float64(lambdaHigh) {
p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
} else if lambda > float64(lambdaLow) {
p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
} else {
smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
}
}
}
if p.NumKeyValueHeads > 0 {
@@ -89,24 +117,29 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
kv["llama.attention.value_length"] = p.HeadDim
}
if len(t.Merges) > 0 {
kv["tokenizer.ggml.merges"] = t.Merges
}
return kv
}
func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
if p.RopeScaling.factors != nil {
out = append(out, llm.Tensor{
Name: "rope_freqs.weight",
Kind: 0,
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
WriterTo: p.RopeScaling.factors,
})
}
for _, t := range ts {
name := p.tensorName(t.Name())
if strings.HasSuffix(name, "attn_q.weight") ||
strings.HasSuffix(name, "attn_k.weight") {
if strings.HasSuffix(t.Name(), "attn_q.weight") ||
strings.HasSuffix(t.Name(), "attn_k.weight") {
t.SetRepacker(p.repack)
}
out = append(out, llm.Tensor{
Name: name,
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
@@ -116,8 +149,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
return out
}
func (p *llama) tensorName(n string) string {
return strings.NewReplacer(
func (p *llamaModel) Replacements() []string {
return []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"model.norm", "output_norm",
@@ -131,21 +164,19 @@ func (p *llama) tensorName(n string) string {
"mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm",
// mixtral
"block_sparse_moe.gate", "ffn_gate_inp",
).Replace(n)
}
}
func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
var dims []int
for _, dim := range shape {
dims = append(dims, int(dim))
}
var heads uint32
if strings.HasSuffix(name, "q_proj.weight") {
if strings.HasSuffix(name, "attn_q.weight") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "k_proj.weight") {
} else if strings.HasSuffix(name, "attn_k.weight") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
} else {
return nil, fmt.Errorf("unknown tensor for repack: %s", name)

View File

@@ -0,0 +1,169 @@
package convert
import (
"cmp"
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
)
// llamaAdapter is the adapter converter for the llama architecture. On top of
// the embedded AdapterParameters it keeps the head counts that its repackers
// read when rearranging attn_q / attn_k tensors.
type llamaAdapter struct {
AdapterParameters
// Decoded from "num_attention_heads"; KV overwrites it with the base
// model's "llama.attention.head_count".
NumAttentionHeads uint32 `json:"num_attention_heads"`
// Decoded from "num_key_value_heads"; when zero the repackers fall back to
// NumAttentionHeads.
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
}
// Compile-time check that *llamaAdapter satisfies AdapterConverter.
var _ AdapterConverter = (*llamaAdapter)(nil)
// KV returns the adapter's key/value metadata: the shared adapter parameters,
// the architecture name, and the two attention head counts copied from the
// base model's metadata.
//
// As a side effect it stores the base model's head count in
// p.NumAttentionHeads. That value must be a uint32 in baseKV; the type
// assertion panics otherwise (including when the key is absent).
func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
	const (
		headCountKey   = "llama.attention.head_count"
		headCountKVKey = "llama.attention.head_count_kv"
	)

	kv := p.AdapterParameters.KV()
	kv["general.architecture"] = "llama"
	for _, key := range []string{headCountKey, headCountKVKey} {
		kv[key] = baseKV[key]
	}

	p.NumAttentionHeads = baseKV[headCountKey].(uint32)

	return kv
}
// Tensors maps the parsed adapter tensors to output tensors and installs a
// repacker on every one of them.
//
// A "weight.lora_a" tensor whose first dimension is larger than its second,
// or a "weight.lora_b" tensor whose first dimension is smaller than its
// second, has its two shape entries swapped in place and is given
// p.repackAndTranspose. Every other tensor is given p.repack.
func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
	var out []llm.Tensor
	for _, t := range ts {
		shape := t.Shape()
		name := t.Name()

		// Decide whether the stored layout has to be flipped. The shape is
		// only inspected for the two LoRA suffixes.
		var flip bool
		switch {
		case strings.HasSuffix(name, "weight.lora_a"):
			flip = shape[0] > shape[1]
		case strings.HasSuffix(name, "weight.lora_b"):
			flip = shape[0] < shape[1]
		}

		repacker := p.repack
		if flip {
			shape[0], shape[1] = shape[1], shape[0]
			repacker = p.repackAndTranspose
		}
		t.SetRepacker(repacker)

		out = append(out, llm.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    shape,
			WriterTo: t,
		})
	}

	return out
}
// Replacements returns the old/new name pairs for adapter tensors, flattened
// into the alternating form accepted by strings.NewReplacer. The order of the
// pairs is significant and is preserved.
func (p *llamaAdapter) Replacements() []string {
	pairs := [][2]string{
		{"base_model.model.", ""},
		{"model.layers", "blk"},
		{"self_attn.q_proj", "attn_q"},
		{"self_attn.k_proj", "attn_k"},
		{"self_attn.v_proj", "attn_v"},
		{"self_attn.o_proj", "attn_output"},
		{"mlp.gate_proj", "ffn_gate"},
		{"mlp.down_proj", "ffn_down"},
		{"mlp.up_proj", "ffn_up"},
		{"lora_A.weight", "weight.lora_a"},
		{"lora_B.weight", "weight.lora_b"},
		{"lora_a", "weight.lora_a"},
		{"lora_b", "weight.lora_b"},
	}

	oldnew := make([]string, 0, 2*len(pairs))
	for _, pair := range pairs {
		oldnew = append(oldnew, pair[0], pair[1])
	}

	return oldnew
}
// repack is the repacker Tensors installs on tensors that did not need their
// shape swapped.
//
// Only tensors whose name ends in "attn_q.weight.lora_a" or
// "attn_k.weight.lora_a" are rearranged; for any other name data is returned
// unchanged. The rearranged data is returned flattened into one []float32.
func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
// dims is shape with its two entries exchanged.
dims := []int{int(shape[1]), int(shape[0])}
var heads uint32
if strings.HasSuffix(name, "attn_q.weight.lora_a") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
// Key tensors use the key/value head count, falling back to the attention
// head count when it is zero.
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
} else {
return data, nil
}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
// Split the first dimension into (heads, 2, dims[0]/heads/2), swap the two
// middle axes, and restore the 2-D shape. heads must be non-zero here: the
// integer division below panics otherwise.
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
// SelectF32 yields a slice of []float32; concatenate them in order.
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}
// repackAndTranspose is the repacker Tensors installs on tensors whose two
// shape entries it swapped.
//
// For names ending in "attn_q.weight.lora_a" or "attn_k.weight.lora_a" (and a
// non-zero head count) it first applies the same per-head rearrangement as
// repack. It then always applies the T(1, 0) / Reshape / Transpose sequence
// and returns the result flattened into one []float32.
func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
// dims is shape with its two entries exchanged.
// NOTE(review): presumably shape is the already-swapped shape set by Tensors,
// which would make dims the layout of data as stored — confirm against the
// code that invokes the repacker.
dims := []int{int(shape[1]), int(shape[0])}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
var heads uint32
if strings.HasSuffix(name, "attn_q.weight.lora_a") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
// Key tensors use the key/value head count, falling back to the attention
// head count when it is zero.
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
}
// heads stays zero for every other tensor name, which skips the per-head step.
if heads > 0 {
// Split the first dimension into (heads, 2, dims[0]/heads/2), swap the two
// middle axes, and restore the 2-D shape.
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
}
// Request the (1, 0) axis permutation, reshape back to dims, then apply the
// pending transpose. The order of these three calls matters.
if err := n.T(1, 0); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
// SelectF32 yields a slice of []float32; concatenate them in order.
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}

View File

@@ -9,16 +9,14 @@ import (
"github.com/ollama/ollama/llm"
)
type mixtral struct {
llama
type mixtralModel struct {
llamaModel
NumLocalExperts uint32 `json:"num_local_experts"`
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
}
var _ Converter = (*mixtral)(nil)
func (p *mixtral) KV(t *Tokenizer) llm.KV {
kv := p.llama.KV(t)
func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
kv := p.llamaModel.KV(t)
if p.NumLocalExperts > 0 {
kv["llama.expert_count"] = p.NumLocalExperts
@@ -31,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV {
return kv
}
func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
oldnew := []string{
"model.layers", "blk",
"w1", "ffn_gate_exps",
@@ -69,7 +67,14 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
})
}
return append(out, p.llama.Tensors(ts)...)
return append(out, p.llamaModel.Tensors(ts)...)
}
// Replacements returns the llama old/new name pairs followed by the one
// additional pair that maps "block_sparse_moe.gate" to "ffn_gate_inp".
func (p *mixtralModel) Replacements() []string {
	oldnew := p.llamaModel.Replacements()
	oldnew = append(oldnew, "block_sparse_moe.gate", "ffn_gate_inp")

	return oldnew
}
type experts []Tensor

123
convert/convert_phi3.go Normal file
View File

@@ -0,0 +1,123 @@
package convert
import (
"cmp"
"encoding/binary"
"io"
"math"
"strings"
"sync"
"github.com/ollama/ollama/llm"
)
// phi3Model holds the configuration values decoded from JSON that the phi3
// converter needs. Several settings have two field spellings; where that is
// the case the KV method takes the first non-zero one.
type phi3Model struct {
ModelParameters
// Block count: "num_hidden_layers", else "n_layers".
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayers uint32 `json:"n_layers"`
// Embedding length: "hidden_size", else "n_embd".
HiddenSize uint32 `json:"hidden_size"`
NEmbd uint32 `json:"n_embd"`
IntermediateSize uint32 `json:"intermediate_size"`
// Attention head count: "num_attention_heads", else "n_head".
NumAttentionHeads uint32 `json:"num_attention_heads"`
NHead uint32 `json:"n_head"`
// Key/value head count: "num_key_value_heads", else "n_head_kv".
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
NHeadKV uint32 `json:"n_head_kv"`
RopeTheta float32 `json:"rope_theta"`
// RopeScaling.Type selects the attention factor formula in KV; the two
// factor lists are written out as tensors by Tensors.
RopeScaling struct {
Type string `json:"type"`
LongFactor ropeFactor `json:"long_factor"`
ShortFactor ropeFactor `json:"short_factor"`
} `json:"rope_scaling"`
RMSNormEPS float32 `json:"rms_norm_eps"`
NPositions uint32 `json:"n_positions"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
SlidingWindow uint32 `json:"sliding_window"`
}
// Compile-time check that *phi3Model satisfies ModelConverter.
var _ ModelConverter = (*phi3Model)(nil)
// KV builds the GGUF key/value metadata for a phi3 model.
//
// It starts from the shared model parameters and adds the "phi3.*"
// hyperparameters. Where the config has two spellings for a value the first
// non-zero one is used (cmp.Or).
//
// "phi3.rope.dimension_count" is the embedding length divided by the head
// count. It uses the same resolved embedding length as
// "phi3.embedding_length", and is omitted when no head count is configured
// rather than dividing by zero.
//
// The rope scaling attention factor is only written for the "su",
// "longrope" and "yarn" scaling types. An unrecognised type panics.
func (p *phi3Model) KV(t *Tokenizer) llm.KV {
	embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd)
	headCount := cmp.Or(p.NumAttentionHeads, p.NHead)

	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "phi3"
	kv["phi3.context_length"] = p.MaxPositionEmbeddings
	kv["phi3.embedding_length"] = embeddingLength
	kv["phi3.feed_forward_length"] = p.IntermediateSize
	kv["phi3.block_count"] = cmp.Or(p.NumHiddenLayers, p.NLayers)
	kv["phi3.attention.head_count"] = headCount
	kv["phi3.attention.head_count_kv"] = cmp.Or(p.NumKeyValueHeads, p.NHeadKV)
	kv["phi3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
	if headCount > 0 {
		kv["phi3.rope.dimension_count"] = embeddingLength / headCount
	}
	kv["phi3.rope.freq_base"] = p.RopeTheta
	kv["phi3.rope.scaling.original_context_length"] = p.OriginalMaxPositionEmbeddings
	kv["phi3.attention.sliding_window"] = p.SlidingWindow

	// Ratio of the extended context length to the original one.
	scale := float64(p.MaxPositionEmbeddings) / float64(p.OriginalMaxPositionEmbeddings)

	switch p.RopeScaling.Type {
	case "":
		// no scaling
	case "su", "longrope":
		kv["phi3.rope.scaling.attn_factor"] = float32(max(math.Sqrt(1+math.Log(scale)/math.Log(float64(p.OriginalMaxPositionEmbeddings))), 1.0))
	case "yarn":
		kv["phi3.rope.scaling.attn_factor"] = float32(max(0.1*math.Log(scale)+1.0, 1.0))
	default:
		panic("unknown rope scaling type")
	}

	return kv
}
// Tensors maps the parsed source tensors to output tensors, passing each one
// through unchanged.
//
// Immediately before the first tensor whose name starts with "blk.0." it
// also inserts two extra tensors, "rope_factors_long.weight" and
// "rope_factors_short.weight", backed by the configured rope scaling factor
// lists. They are inserted at most once, and not at all if no such tensor is
// present.
func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
	// Room for every input tensor plus the two rope factor tensors.
	out := make([]llm.Tensor, 0, len(ts)+2)

	var once sync.Once
	insertRopeFactors := func() {
		long := llm.Tensor{
			Name:     "rope_factors_long.weight",
			Kind:     0,
			Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
			WriterTo: p.RopeScaling.LongFactor,
		}
		short := llm.Tensor{
			Name:     "rope_factors_short.weight",
			Kind:     0,
			Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
			WriterTo: p.RopeScaling.ShortFactor,
		}
		out = append(out, long, short)
	}

	for _, t := range ts {
		if strings.HasPrefix(t.Name(), "blk.0.") {
			once.Do(insertRopeFactors)
		}

		out = append(out, llm.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
			WriterTo: t,
		})
	}

	return out
}
// Replacements returns the old/new tensor name pairs for phi3, flattened
// into the alternating form accepted by strings.NewReplacer. The order of the
// pairs is preserved.
func (p *phi3Model) Replacements() []string {
	pairs := [][2]string{
		{"lm_head", "output"},
		{"model.embed_tokens", "token_embd"},
		{"model.norm", "output_norm"},
		{"model.layers", "blk"},
		{"input_layernorm", "attn_norm"},
		{"self_attn.qkv_proj", "attn_qkv"},
		{"self_attn.o_proj", "attn_output"},
		{"mlp.down_proj", "ffn_down"},
		{"mlp.gate_up_proj", "ffn_up"},
		{"post_attention_layernorm", "ffn_norm"},
	}

	oldnew := make([]string, 0, 2*len(pairs))
	for _, pair := range pairs {
		oldnew = append(oldnew, pair[0], pair[1])
	}

	return oldnew
}
// ropeFactor is a list of rope scaling factors that can be written out as
// raw tensor data.
type ropeFactor []float32

// WriteTo writes the factors to w as consecutive little-endian float32
// values, implementing io.WriterTo.
//
// It returns the number of bytes written: 4 per factor on success, and 0
// together with the error when the write fails.
func (r ropeFactor) WriteTo(w io.Writer) (int64, error) {
	if err := binary.Write(w, binary.LittleEndian, r); err != nil {
		return 0, err
	}

	// binary.Write encodes the whole slice or reports an error, so on success
	// the byte count is exactly four bytes per float32.
	return int64(len(r)) * 4, nil
}

View File

@@ -1,7 +1,10 @@
package convert
import (
"bytes"
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"encoding/json"
"flag"
"fmt"
@@ -14,8 +17,9 @@ import (
"slices"
"testing"
"github.com/ollama/ollama/llm"
"golang.org/x/exp/maps"
"github.com/ollama/ollama/llm"
)
func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
@@ -27,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
}
defer f.Close()
if err := Convert(fsys, f); err != nil {
if err := ConvertModel(fsys, f); err != nil {
t.Fatal(err)
}
@@ -49,6 +53,34 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
return r, m.KV(), m.Tensors()
}
// generateResultsJSON flattens a converted model into a name → string map
// that can be compared against expected results.
//
// Each KV entry is rendered with %v, unless its value implements
// json.Marshaler, in which case the hex SHA-256 of its JSON encoding is used.
// Each tensor is recorded under its name as the hex SHA-256 of its bytes,
// read from f at the tensor's offset. Any failure aborts the test via
// t.Fatal.
func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
	results := make(map[string]string)

	for key, value := range kv {
		marshaler, ok := value.(json.Marshaler)
		if !ok {
			results[key] = fmt.Sprintf("%v", value)
			continue
		}

		encoded, err := json.Marshal(marshaler)
		if err != nil {
			t.Fatal(err)
		}

		results[key] = fmt.Sprintf("%x", sha256.Sum256(encoded))
	}

	for _, tensor := range tensors.Items {
		hasher := sha256.New()

		// Tensor offsets are relative to the start of the tensor data section.
		section := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
		if _, err := io.Copy(hasher, section); err != nil {
			t.Fatal(err)
		}

		results[tensor.Name] = hex.EncodeToString(hasher.Sum(nil))
	}

	return results
}
func TestMain(m *testing.M) {
var level slog.Level
flag.TextVar(&level, "level", slog.LevelInfo, "log level")
@@ -60,9 +92,14 @@ func TestMain(m *testing.M) {
func TestConvertFull(t *testing.T) {
cases := []string{
"Meta-Llama-3-8B-Instruct",
"Meta-Llama-3.1-8B-Instruct",
"Mistral-7B-Instruct-v0.2",
"Mixtral-8x7B-Instruct-v0.1",
"gemma-2b-it",
// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
"Phi-3-mini-128k-instruct",
"all-MiniLM-L6-v2",
"gemma-2-9b-it",
}
for i := range cases {
@@ -78,29 +115,7 @@ func TestConvertFull(t *testing.T) {
}
f, kv, tensors := convertFull(t, os.DirFS(p))
actual := make(map[string]string)
for k, v := range kv {
if s, ok := v.(json.Marshaler); !ok {
actual[k] = fmt.Sprintf("%v", v)
} else {
bts, err := json.Marshal(s)
if err != nil {
t.Fatal(err)
}
actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
}
}
for _, tensor := range tensors.Items {
sha256sum := sha256.New()
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
if _, err := io.Copy(sha256sum, sr); err != nil {
t.Fatal(err)
}
actual[tensor.Name] = fmt.Sprintf("%x", sha256sum.Sum(nil))
}
actual := generateResultsJSON(t, f, kv, tensors)
expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
if err != nil {
@@ -124,3 +139,209 @@ func TestConvertFull(t *testing.T) {
})
}
}
// TestConvertAdapter converts a generated LoRA adapter fixture with
// ConvertAdapter, decodes the resulting file, and checks that every expected
// KV entry and tensor checksum is present with the expected value. Entries in
// the output that are not listed in Expected are ignored.
func TestConvertAdapter(t *testing.T) {
// AdapterCase describes one conversion: the base model metadata handed to
// ConvertAdapter and the results the output must contain.
type AdapterCase struct {
Name string
BaseKV map[string]any
Expected map[string]string
}
cases := []AdapterCase{
{
Name: "discollama",
BaseKV: map[string]any{
"general.architecture": "llama",
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
},
// KV values are compared in their %v form; tensor values are the hex
// SHA-256 of the tensor bytes (see generateResultsJSON).
Expected: map[string]string{
"general.architecture": "llama",
"general.file_type": "1",
"general.parameter_count": "106496",
"general.type": "adapter",
"general.version": "v0.2",
"adapter.lora.alpha": "16",
"adapter.type": "lora",
"llama.attention.head_count": "32",
"llama.attention.head_count_kv": "8",
"blk.31.attn_q.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_q.weight.lora_b": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_v.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_v.weight.lora_b": "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
},
},
}
for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
t.Parallel()
// Destination file for the converted adapter.
f, err := os.CreateTemp(t.TempDir(), "f16")
if err != nil {
t.Fatal(err)
}
defer f.Close()
// Write the adapter fixture (safetensors + config) into its own directory.
tempDir := t.TempDir()
generateLoraTestData(t, tempDir)
if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
t.Fatal(err)
}
// Re-open the output for reading and decode it.
r, err := os.Open(f.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
m, _, err := llm.DecodeGGML(r, math.MaxInt)
if err != nil {
t.Fatal(err)
}
// Rewind before the tensor data is read back for checksumming.
if _, err := r.Seek(0, io.SeekStart); err != nil {
t.Fatal(err)
}
actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
// Check in sorted key order so failures are reported deterministically.
keys := maps.Keys(c.Expected)
slices.Sort(keys)
for _, k := range keys {
if v, ok := actual[k]; !ok {
t.Errorf("missing %s", k)
} else if v != c.Expected[k] {
t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
}
}
})
}
}
func generateLoraTestData(t *testing.T, tempDir string) {
type tensorData struct {
Offsets []int `json:"data_offsets"`
Type string `json:"dtype"`
Shape []int `json:"shape"`
}
offset := 4096 * 8 * 4
td := map[string]*tensorData{"__metadata__": nil}
td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
Offsets: []int{0, offset},
Type: "F32",
Shape: []int{4096, 8},
}
td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
Offsets: []int{offset, offset * 2},
Type: "F32",
Shape: []int{8, 4096},
}
td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
Offsets: []int{offset * 2, offset * 3},
Type: "F32",
Shape: []int{4096, 8},
}
td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
Offsets: []int{offset * 3, offset*3 + 8*1024*4},
Type: "F32",
Shape: []int{8, 1024},
}
data, err := json.Marshal(td)
if err != nil {
t.Fatal(err)
}
var buf bytes.Buffer
l := int64(len(data))
err = binary.Write(&buf, binary.LittleEndian, l)
if err != nil {
t.Fatal(err)
}
_, err = buf.Write(data)
if err != nil {
t.Fatal(err)
}
// write some data for the tensors
ones := make([]float32, 4096*8)
for i := range ones {
ones[i] = float32(1)
}
for range 3 {
err = binary.Write(&buf, binary.LittleEndian, ones)
if err != nil {
t.Fatal(err)
}
}
ones = make([]float32, 1024*8)
for i := range ones {
ones[i] = float32(1)
}
err = binary.Write(&buf, binary.LittleEndian, ones)
if err != nil {
t.Fatal(err)
}
fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
if err != nil {
t.Fatal(err)
}
defer fdata.Close()
_, err = fdata.Write(buf.Bytes())
if err != nil {
t.Fatal(err)
}
configData := `
{
"adapter_path": "adapters-test",
"batch_size": 8,
"config": "config-tiny.json",
"data": "../discollama-completion",
"grad_checkpoint": null,
"iters": 1000,
"learning_rate": 1e-05,
"lora_layers": 1,
"lora_parameters": {
"rank": 8,
"alpha": 16,
"dropout": 0.0,
"scale": 2.0
},
"lr_schedule": null,
"max_seq_length": 2048,
"model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
"resume_adapter_file": null,
"save_every": 100,
"seed": 0,
"steps_per_eval": 200,
"steps_per_report": 10,
"test": false,
"test_batches": 500,
"train": true,
"use_dora": false,
"val_batches": 25
}
`
f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
_, err = f.WriteString(configData)
if err != nil {
t.Fatal(err)
}
}

View File

@@ -10,8 +10,8 @@ import (
)
type ZipReader struct {
r *zip.Reader
p string
r *zip.Reader
p string
// limit is the maximum size of a file that can be read directly
// from the zip archive. Files larger than this size will be extracted

View File

@@ -35,7 +35,9 @@ const (
)
func (t tensorBase) Kind() uint32 {
if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") {
if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
t.name == "token_types.weight" {
// these tensors are always F32
return 0
}
@@ -55,13 +57,15 @@ func (t *tensorBase) SetRepacker(fn repacker) {
type repacker func(string, []float32, []uint64) ([]float32, error)
func parseTensors(fsys fs.FS) ([]Tensor, error) {
func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
patterns := []struct {
Pattern string
Func func(fs.FS, ...string) ([]Tensor, error)
Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
}{
{"model-*-of-*.safetensors", parseSafetensors},
{"model.safetensors", parseSafetensors},
{"adapters.safetensors", parseSafetensors},
{"adapter_model.safetensors", parseSafetensors},
{"pytorch_model-*-of-*.bin", parseTorch},
{"pytorch_model.bin", parseTorch},
{"consolidated.*.pth", parseTorch},
@@ -74,7 +78,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) {
}
if len(matches) > 0 {
return pattern.Func(fsys, matches...)
return pattern.Func(fsys, replacer, matches...)
}
}

View File

@@ -8,6 +8,7 @@ import (
"io"
"io/fs"
"slices"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/x448/float16"
@@ -20,7 +21,7 @@ type safetensorMetadata struct {
Offsets []int64 `json:"data_offsets"`
}
func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
var ts []Tensor
for _, p := range ps {
f, err := fsys.Open(p)
@@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
offset: safetensorsPad(n, value.Offsets[0]),
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
tensorBase: &tensorBase{
name: key,
name: replacer.Replace(key),
shape: value.Shape,
},
})
@@ -111,8 +112,9 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
return 0, err
}
for _, b := range u16s {
f32s = append(f32s, float16.Frombits(b).Float32())
f32s = make([]float32, len(u16s))
for i := range u16s {
f32s[i] = float16.Frombits(u16s[i]).Float32()
}
case "BF16":

View File

@@ -3,12 +3,13 @@ package convert
import (
"io"
"io/fs"
"strings"
"github.com/nlpodyssey/gopickle/pytorch"
"github.com/nlpodyssey/gopickle/types"
)
func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
var ts []Tensor
for _, p := range ps {
pt, err := pytorch.Load(p)
@@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
ts = append(ts, torch{
storage: t.(*pytorch.Tensor).Source,
tensorBase: &tensorBase{
name: k.(string),
name: replacer.Replace(k.(string)),
shape: shape,
},
})

View File

@@ -0,0 +1,3 @@
{
"rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
}

View File

@@ -0,0 +1,225 @@
{
"general.architecture": "phi3",
"general.file_type": "1",
"general.quantization_version": "2",
"phi3.block_count": "32",
"phi3.context_length": "131072",
"phi3.embedding_length": "3072",
"phi3.feed_forward_length": "8192",
"phi3.rope.scaling.original_context_length": "4096",
"phi3.rope.dimension_count": "96",
"phi3.rope.freq_base": "10000",
"phi3.rope.scaling.attn_factor": "1.1902381",
"phi3.attention.head_count": "32",
"phi3.attention.head_count_kv": "32",
"phi3.attention.layer_norm_rms_epsilon": "1e-05",
"phi3.attention.sliding_window": "262144",
"tokenizer.ggml.model": "llama",
"tokenizer.ggml.pre": "default",
"tokenizer.ggml.add_bos_token": "false",
"tokenizer.ggml.add_eos_token": "false",
"tokenizer.ggml.bos_token_id": "1",
"tokenizer.ggml.eos_token_id": "32000",
"tokenizer.ggml.unknown_token_id": "0",
"tokenizer.ggml.padding_token_id": "32000",
"tokenizer.ggml.scores": "6e37bcde2adc7e350e87c496eddd7a2124329c1dc66c5bf3ad3997253e4f7a62",
"tokenizer.ggml.token_type": "b6ecf55ec64ee67d87750bdb8d757a2c58bf78377e9f4219f5689a6c4dea57ce",
"tokenizer.ggml.tokens": "d168da3ddd3eee820916945fcb9baf24dd3cde42f606cffa2d19e7c8a8743918",
"blk.0.attn_norm.weight": "216aeb2c9e0c271f899e1ef2a63cceeb8f41e97642e84fada54b1d3c1c11cf25",
"blk.0.attn_output.weight": "b597d56f7188ffc1fafc273fadc59d41738cffd677ae98c61a62c3285b3a3099",
"blk.0.attn_qkv.weight": "d28a6b44e13f59be5483e4be2bedb544e346168d720aca27f47d1a5a722be91e",
"blk.0.ffn_down.weight": "4a691370e5a61fcbbf540fbcbf4c0f1d15dec0364528c0e916d0744f6262b63b",
"blk.0.ffn_norm.weight": "0c00af2b4a3128bec64a0cbb1084b042fdbe13d9ad0d03bd577f9449dfead338",
"blk.0.ffn_up.weight": "b32b52f790c1c083bfb8a3126dc1111cfeeb28dc8c584a930a1e5334cb176bf4",
"blk.1.attn_norm.weight": "68748011503c6c029e8e69a84a8e5a89338f378769627b6dbf7f93d715c292e1",
"blk.1.attn_output.weight": "2267344add13b048ca59e4377c86dc512be8046a57156901fa32a20fa74e4ee0",
"blk.1.attn_qkv.weight": "9109d2e3d7a2eacfda5226587b8be124a3bf44b972da7ebb17aa15795897eacc",
"blk.1.ffn_down.weight": "d675df4df4dd039c0c339ad6445d39eddd2004db6bf35bed6314c7497245a633",
"blk.1.ffn_norm.weight": "3b5767ae977bc8baaa06b06efdbea193b6b3ba605ce76d77a76ce317e935500c",
"blk.1.ffn_up.weight": "80dfd6d9d234b00334c89b8e0a02f81899c2efd377321c34ba5ba51a5f61b5ff",
"blk.2.attn_norm.weight": "6a6743b057e5088f145bc179e92c9bfb41163e7295d7b81c62e23dd89d2b59c4",
"blk.2.attn_output.weight": "bc5491ea54e0db81462d7d9b7d25cbdda380c2db8de041bd1c4ab7b76a1d19c3",
"blk.2.attn_qkv.weight": "a61287a9852e2f5aca9c100b471d98398b2913a3497c743de3c70ec9ddd7087f",
"blk.2.ffn_down.weight": "4fddcc382c8dceeab027fe43d8d44e67edb5e8ce4b9a1b7f773c87770380ade1",
"blk.2.ffn_norm.weight": "07e05f82b3f63f711db3b684ca79aed25c0657917e66f88af47348a82065c227",
"blk.2.ffn_up.weight": "4835a682ef1826c12df01ae7663fc45f9c82bc8e64b665f13fb7da8e201ec0fb",
"blk.3.attn_norm.weight": "f22aba7c03999ba7136f39cda747a39715e498699dc1716cd97fc5dfc58d1b1c",
"blk.3.attn_output.weight": "53b579855366fd786c5126b2b30aac4d583ca7bda56833c4865f5cadb5c18c6d",
"blk.3.attn_qkv.weight": "bb56aba78158123140fcea59c69ac562ca208f6d3086819417cdad8c50f333ad",
"blk.3.ffn_down.weight": "97280897a7cd86db2830c004bccc5bc094f50e293baded0189159a2019145a6e",
"blk.3.ffn_norm.weight": "10a8c99f8b57a960e8e0a1133c4a26f9148403d1b9bff2eff114917de996f3b5",
"blk.3.ffn_up.weight": "7324046c915e75d621b2043597a245a428d8eea31869135e6257a861491d8dcc",
"blk.4.attn_norm.weight": "507d8e164de94646edbfe33def8e8fbf7c9a6ee3fbaedb5000f72d9f51ec5e36",
"blk.4.attn_output.weight": "bbb3429e6efa98c150e0fdbf48c16180cbf0d0cbc1b3c253c6c319d78f4593a2",
"blk.4.attn_qkv.weight": "b95ee5be0786d3901273d806c339fe6c20e6bfffd2a20672a9f56af80921e8ab",
"blk.4.ffn_down.weight": "806bbf91df92a5a22bd5aa1ffb7fc2869f7293ffc7704771c290ecc583b27975",
"blk.4.ffn_norm.weight": "cfc2930a81df7aee3a5e7f726a15c1182233e868bf0d9d37f6b6ae6d8c15c234",
"blk.4.ffn_up.weight": "c3390c69533de2c8424e8069323ccc5d0c4543111535da04cf2c7d26745576aa",
"blk.5.attn_norm.weight": "0d71c4fbcefabbd021569442853d2fe90668b19409ae2805a718a829ca60beab",
"blk.5.attn_output.weight": "10ebd93629112bf2df5c30dd0953a4a5e9020306768283181ed426934d47e14f",
"blk.5.attn_qkv.weight": "5cb05633369f12d4b00e0ff787736bd846856682115720ebc6cce05270c334f6",
"blk.5.ffn_down.weight": "e28bcc5094212eafc7476dbc5b7a520d25b79578cbf4229d698e2655956a80ad",
"blk.5.ffn_norm.weight": "b6f2c4cf9f34bb4d59989f96165c14a67dc1e266ad0a6d0fcc49f1add929e6ff",
"blk.5.ffn_up.weight": "0f9ef99423cc07ebedc0e9cfa95809f2d7108d910bb4ef97ebc0b0309c440750",
"blk.6.attn_norm.weight": "b3edcc47a42218234f7564d7470611b49401a41ae8cd42123f86557c69f5d7f2",
"blk.6.attn_output.weight": "eb9b7d257b388bb5b8fe0515e5c6873317239cb94cda236e4b6ada2a6c57c65c",
"blk.6.attn_qkv.weight": "eb968081f478c52f07bd9c2761741e982dba33cc4eeadeea3557d391b9ac2106",
"blk.6.ffn_down.weight": "1b8588bb7463206290322695577dcfced300895d6e6f4b26966c53a9ae2f0f84",
"blk.6.ffn_norm.weight": "1219c04b7770983c77814200eefe743f46d15328ea2b12711e44f8103eab08d3",
"blk.6.ffn_up.weight": "197ef287239fec47c55677f0fbb66eaf0644f775bc382de843971730721394f6",
"blk.7.attn_norm.weight": "b630ad08c80d564ed1c024384818e9fd3f22a36cd7a14aa96e7e2759a8285099",
"blk.7.attn_output.weight": "970255aa750828a47d6b9d399f9612b5bf25aefe7dadbcba41fc416d0d4067c1",
"blk.7.attn_qkv.weight": "ebb157c880293e6de8d629f263ba8853ed1dbdc02c311d43432bb8cfbb310739",
"blk.7.ffn_down.weight": "24bcd4db4cba844c89f878b81843c373dbbc0675e889d32c5b12e63384a7b670",
"blk.7.ffn_norm.weight": "b9c6f71001808ee873ce7db8056e4b53fb4cccec8b7f0f312899b575fae39d39",
"blk.7.ffn_up.weight": "979f1828d227455c26015a2a11afe9dd05f2bb97a8ba6b38c8dab3f50e627401",
"blk.8.attn_norm.weight": "4e8e347e3775010b7112ee630f2f4f2383be7ff64e6ca6154b9b22566552eaa6",
"blk.8.attn_output.weight": "65a44babf44a435a1829945211b3168f9ec78ac3cb7a049a733e93d11f0d6659",
"blk.8.attn_qkv.weight": "343ed07671da400b040812a4058482fa38284b5d9af9becfed07417fe26ce747",
"blk.8.ffn_down.weight": "7fb7e073e3c2c503c4e9d60efa0988fed7398d900cc003695fe3fffd3e188b82",
"blk.8.ffn_norm.weight": "b07c1f655d8593e3892a2cf73f8a0c19ce8e5cb613fafbe7cbd430da8ce4c57d",
"blk.8.ffn_up.weight": "8b26e14de54b3fdc2e2d3ea41720f9d9c236a93688c3b7fd7bf43f5fbb327c9b",
"blk.9.attn_norm.weight": "46394d408a8e316916177e6aa261de32e137a82d729c0b1800b072f0c38c39b6",
"blk.9.attn_output.weight": "d57f3d46107947a7073373a0b35d6ecf7759b5df15406f4a3590a60666af6b16",
"blk.9.attn_qkv.weight": "14bb8ace8c5453148f4b536e9f4279c813f31136716947256f5cca333448639c",
"blk.9.ffn_down.weight": "2b8d98e2b5ed68338f6e4de43bf7de0c4858cc69103cd5177725f7444eec7694",
"blk.9.ffn_norm.weight": "41a499dfd418cc4c6b8c12313f673f7e2cd4a3f9c4065eb6c4feb5eed02fb542",
"blk.9.ffn_up.weight": "143aab7533a64b17fbe201490a6f674bc7f0bd370c094500b2e100419073d1c2",
"blk.10.attn_norm.weight": "ebb670aafd36816a794347287269d8f1a5b19c1e3c0a1e38023bc19fdba9b073",
"blk.10.attn_output.weight": "b5d65bbc0ed5e49fdd9d754bc18163cd042a285024d0cf6f954c503bc8c877cb",
"blk.10.attn_qkv.weight": "f06b15bac88da798fa34a62b03eaac0dbe8b846020516603c387541f2d8dd672",
"blk.10.ffn_down.weight": "fb091fcd1b4de25d1bea94d1755e255cb02914a030d23e3a234e57b8d46bde6e",
"blk.10.ffn_norm.weight": "eb347bdf9c40414af87e13a8e72e40b31f004b50f7cb366f1a219ced60a61355",
"blk.10.ffn_up.weight": "ed2d52fc881a173f404fe8a1067862c9856d6c3e0d2e90a330a7aa394e3f84d1",
"blk.11.attn_norm.weight": "64e252603cf010a0e502ca39fdf8d0a196a79aec67c0d2bb9213fc0cb80c47d4",
"blk.11.attn_output.weight": "228e33e21c69f52efc74fdfc831bc9af271e44b2a29a3dced1d64e667ce36eb5",
"blk.11.attn_qkv.weight": "ab9ce6d4ef9e42ee0da3f20a7708a3bbc5e79e967b05fa86ba946a05e2eb63eb",
"blk.11.ffn_down.weight": "0ca133b7835c98dc77c25d64e4eb7873778bdb5e4d22d8b80f920f46865b43bd",
"blk.11.ffn_norm.weight": "02455741a0dfd161c79aa1ecc381901721f229fdcda5615622a629631fb61cfd",
"blk.11.ffn_up.weight": "9fecdcc099fbb8e23c6b1ea9294702a027f4a58d265543ec5e7be79b8f63b354",
"blk.12.attn_norm.weight": "783bb459911b1b3609a9b2bdfe272f1670add73b5471da738e07ac47e2e07dfd",
"blk.12.attn_output.weight": "1e1a914c9e48b857206ac5a1f7cead994bc1ea91d5d4fff8c834d73f2e38ef5d",
"blk.12.attn_qkv.weight": "5953e7185ccb87fb4dae8f9426ec86315d4c7794326e8ab59b3a95d4af2189f0",
"blk.12.ffn_down.weight": "a3eecf0f394f86e2cfb48a5940a5c50ca86d71883b2f79fcc642a935fabce0d4",
"blk.12.ffn_norm.weight": "0a4272e41373c23bd72f10d2d82930aa3a1480aac75832bfbf01cebf0b86b6a4",
"blk.12.ffn_up.weight": "06f42776de3a7ceac3025f26a7a8bd20e062233cce2bdaa2183470dc4b30b87d",
"blk.13.attn_norm.weight": "5915da60fb03e201fa649faba780e5fdf1c761c262b206e5415cf83181f65780",
"blk.13.attn_output.weight": "4dbf6eab074fa3835fd32bd631a8208e511037d5056d2fd3015735cca7674ef7",
"blk.13.attn_qkv.weight": "d3d8339a1c4782d9e73d77fdebe154d3c5b83ac40c9175b3e91a4977d08f876b",
"blk.13.ffn_down.weight": "de6772b46a55e1fd42b007637dfbf68b6598e5d5b61622da0935002e1e192d3a",
"blk.13.ffn_norm.weight": "5a640ea3b8c7be49c95a58a2327e10d8e8d9d142504bde5c8091613e5b961d7a",
"blk.13.ffn_up.weight": "f35e3545e4bd3531b2e843b5efd31dee0c13c807ee6386e65473ba67bbec30d0",
"blk.14.attn_norm.weight": "9b34986450b7c98b4927e81e61a816f9e84b1addc7c14926402100037aad6678",
"blk.14.attn_output.weight": "155d52efb23d366016d861a251d4d1f4a0c13699188c50d50dba016a0d8bfcd9",
"blk.14.attn_qkv.weight": "8e1415084e1f33c73a777f19e752489f4dd312cca047733e5ea643cd4a955e04",
"blk.14.ffn_down.weight": "a2a142226b94baa01ccb65bdea2b7418e49085c1d9c3c63e544e3112c58a25da",
"blk.14.ffn_norm.weight": "8aecfd9b0ae6affaea31a80c5c9a4a14b31deaa0db7bd8f6da2a64d23447921c",
"blk.14.ffn_up.weight": "0c1407237b8c1bd02f193346b5681926fe698a5055eac6a7450451b0f991707c",
"blk.15.attn_norm.weight": "e037bd19880bfa83d983200fb0c7866f8ad16c3ff5cc4b4f3a37ca7373870ff6",
"blk.15.attn_output.weight": "045fe4fc95cc129a1b92771b179c11b12845c4c088786c607f17bd98857e68e1",
"blk.15.attn_qkv.weight": "7621b7559705cab1d4dea1c69f76dbf9dc1c8837a203b656f484703b9c1b70ce",
"blk.15.ffn_down.weight": "7e5ac20e290bc60761e1cd972354fde225b7fa861048d44d9a0dd9b046d55f58",
"blk.15.ffn_norm.weight": "b6d830d88f1db1825687973c8c2b1a24c6fa84f07af8d0e3ef9c86009baca0b2",
"blk.15.ffn_up.weight": "dcda0957cd04fc45476774dba2bbf9aa89d6b05d5ca7b10ae6f73ad2c49b1cd3",
"blk.16.attn_norm.weight": "4ee9b70ba15cb2a08240f93990e90f5068c48fceb481f8e2186bec8b7214eb3f",
"blk.16.attn_output.weight": "315cfe5536658d2498192b2980eade15b2c9a4ff220e4011911457b1727fa103",
"blk.16.attn_qkv.weight": "3c8122e3ad637583b9dcde8ff3a323267d3014bb1f0f9771e5322260ca9ecc8d",
"blk.16.ffn_down.weight": "3b5fbebd5ee2b86cad96fb8a9b45a8770d08f82c1c8b74d7061e866f7020a18d",
"blk.16.ffn_norm.weight": "ffab69f20bda372de6e5878f0539163e2fc6ba113621ded95705fc3b1465c9f0",
"blk.16.ffn_up.weight": "0935ea3d258da42d6258406365f39f58ddaabfe97ea5977580db3635188f24a1",
"blk.17.attn_norm.weight": "f030441733f3d147b4a06a1eb4aeb8465c7c24d9c53bf4c48fe7e134d3629803",
"blk.17.attn_output.weight": "07a955ef09e8dc766ac0df647d0b2c69f23c4c69a7137654b4aad80303ed0eda",
"blk.17.attn_qkv.weight": "1c10688061e21e2fe12ad0cb54bf03895c1f83c3b0df743a42f548b52cbca1b2",
"blk.17.ffn_down.weight": "ebb9cc9836f41d88fdae2aa9a4355514e4edaec8d1577ffeb947a35204e77f52",
"blk.17.ffn_norm.weight": "50aff44f6528b13db5389f2ddcdb7676244947610bd7ffbff3f881c968c2a0d4",
"blk.17.ffn_up.weight": "d716537949582be33bde6b02e38f5a70081c9642a9fb05a61312126718b8d148",
"blk.18.attn_norm.weight": "0ea695c4e53d637902f46663a6ee42adc493c36794476acc7dbddaa05b13840d",
"blk.18.attn_output.weight": "5fd35b500221a612eb4f4bddf0e9b6b7db4d7733032a75f8802fb2d884647c2e",
"blk.18.attn_qkv.weight": "b0da37fd030fe69581f990bf23bfd35467a1bbe558af6de7c0924f6b72e92317",
"blk.18.ffn_down.weight": "b355c33f44b328f4bb977567de8f7544db4b005d7a8fbded658518ecf3c5a153",
"blk.18.ffn_norm.weight": "58b3fe9094079989a86e0387143259e1cc35952d24dc3df290c4ba6df44f5c51",
"blk.18.ffn_up.weight": "2ce530954c342c30ed2ead5353f931960bfae1d278868504c0efb973560fabbe",
"blk.19.attn_norm.weight": "533e9aed66feea8f0392aa81f9e293240e1f009a5334253915fb60c2749b615d",
"blk.19.attn_output.weight": "84f2d00f98a4113a779d3b5d1c3e7c914eb47784d3ab13b290367c124c2994aa",
"blk.19.attn_qkv.weight": "fbe6b9f53b07fa7537d3b3d452d20a9bc666f9fd41ec2091dd28bc2f70fc668f",
"blk.19.ffn_down.weight": "b30199e098c8bb3f890183d8b18471e80b62b604729b277ad62488dd71e1206b",
"blk.19.ffn_norm.weight": "c81373e41cd340b7badb19f9517c77c4250b4eb9a02dc758b8b49b652487d7ff",
"blk.19.ffn_up.weight": "5a5cb083ca7725720e3a890f7fa46354760e8007a8188849a092e305694a75e3",
"blk.20.attn_norm.weight": "4953091b4477e354357a8e743ba0a1900633e52f1599ee082a0c9b0b2b5cd978",
"blk.20.attn_output.weight": "62d54f7749cd6856097b2632066a322b0296df915fe66f382c5b5981be0d4f23",
"blk.20.attn_qkv.weight": "406de9e35b0729ebe902d7a47905cc7fb29a921431ed35dbef0c03e5690a1329",
"blk.20.ffn_down.weight": "62fb678b0d1261e19a4903a2b347d67afcc8acff01feb33a687a35a2d1e6f9a5",
"blk.20.ffn_norm.weight": "cd9d36b7e71e55c8925b97bb09c28219f182626bcff094878ae39c3db887a14b",
"blk.20.ffn_up.weight": "b9276771d79d3e932e73ccc520c3f8476342b9ef312ed2ee1e0da822e6e3ad18",
"blk.21.attn_norm.weight": "66d8c8a35e13ce9c2a0e75b670150e2c31484a55c2316df46075312196178ed3",
"blk.21.attn_output.weight": "12ab46c9382648f9b3350fdd92a6be6352743d62d6b520d7e2024e0c838588f5",
"blk.21.attn_qkv.weight": "a7909676ee1675ca23cd29a5fdd226df8dd9d68f94c6c9bbb51dd9fd38504008",
"blk.21.ffn_down.weight": "6fb317279c6542e82f97d5a12a60fac1bd0fa0405154f9fbe265e2fe39bd49cc",
"blk.21.ffn_norm.weight": "c0f703eb3ff161b5ba4490d87d8684b8a6c47a8f433e12f418333b9db439010a",
"blk.21.ffn_up.weight": "6dbdb80ef0c35e364bbce12d40d5e74c7963c7b55d58d9579567a07ffce7b863",
"blk.22.attn_norm.weight": "f94237433bf03d675cb2f655b81ca91a1ce2447bc6b00b13d6b0ccfe2d411eff",
"blk.22.attn_output.weight": "e821f95995ce497c01e63ca64f737713b1b65f11df1903e51d444aa516f33f71",
"blk.22.attn_qkv.weight": "1b0f717c73afb5eb4c82a1708c4e85c969e8a2a8770d9ddb78b1870a2d8a781e",
"blk.22.ffn_down.weight": "0f33f7a3cdc685484be99aa0c03642b0b20850a27d1fddbe054b13a9382f3ccb",
"blk.22.ffn_norm.weight": "9df285cf211ddd7df2b36a50489af574755c7d4d98b29a05cd04566ae613c8dc",
"blk.22.ffn_up.weight": "63ac300e1efb34041dd0136cf43ea622fac6f0caccce1cd9262f5e08d2cf179c",
"blk.23.attn_norm.weight": "5f72d9e88689b4027b28f5f8f26cd3abb03635ceea7ec98a4c91a9fc691f6707",
"blk.23.attn_output.weight": "6ecf04ff61125c5fc768f8656497152149373daf321ee9c957e8f7245a1184d1",
"blk.23.attn_qkv.weight": "a9d9978806724c2959f2cf386c233831f08e1e933dbf2b32665e788d9d512ea4",
"blk.23.ffn_down.weight": "72c7d17886a3da17fa0daa456aa5e877b2ef5b8b403182b870d9ca5ca9c70347",
"blk.23.ffn_norm.weight": "971e4b712e3025a13419b5b57d674b5e4ab7f18f74b57b9afc4671623da90c4b",
"blk.23.ffn_up.weight": "df2b5c7dbd5834545b815073af0c7355b065124e6d6f0fee78d8fa5b2076dc3e",
"blk.24.attn_norm.weight": "c41957c4a79ad3b16f6e11daec1c7f530b9f3f4b618e1e4367c3b67787ac4ab6",
"blk.24.attn_output.weight": "ef7d61f5fc88ac6f31bf60cb5f4d2d6b8df42d38825807112361a7224b0dee3b",
"blk.24.attn_qkv.weight": "3e6a58fe7d49c90bb6971efbad3371c32256881173ea5aee4b0c296cb206490f",
"blk.24.ffn_down.weight": "f43619144047de42fed81dfa495f1815d3cb771330e574043e2b67620819292c",
"blk.24.ffn_norm.weight": "5501d4a2a98c8ca6b42e77b53b221dbc08f530f6a067256d787534ec6fe028bd",
"blk.24.ffn_up.weight": "d64c8b0e509e2b1118f6000176f8956cacecdbb200c7e95ed93fb78b6e26c84a",
"blk.25.attn_norm.weight": "502fa3c302d371f61c5791f4615b73018ffb1daa09b6499b227116581244c5d4",
"blk.25.attn_output.weight": "ad8391d4e9c980856f2547aa945b2b6a407a6382158dc1ddd4f08d94ecc24be6",
"blk.25.attn_qkv.weight": "42e8983780d4a01a02c54ad23d4df21eea437f119a10af5a9c12a76a42d308c1",
"blk.25.ffn_down.weight": "302dd010d4e0ab4eeaee89090409ea0dddeeeed3236415eb8f97c942497eea91",
"blk.25.ffn_norm.weight": "fb34c1ee5bca96986c08834df0a0c047ba041c1123ac1f563e9d64312bf82d6a",
"blk.25.ffn_up.weight": "10739a8de156816d93c92b935386540bfa976bdbef204f0312960f6fc657582f",
"blk.26.attn_norm.weight": "7036c711609128c4e55968ff3681d3043338879a5737efd6c2ac9e1a2a61f1a0",
"blk.26.attn_output.weight": "db5db45dead5cb911fa01da59832f121b7c18b2d167bf53741c40819f24d346c",
"blk.26.attn_qkv.weight": "cae34c6b7f82ed14348d5ed30a79919c383737c1694a9cb9c0de609d3b0c1d0a",
"blk.26.ffn_down.weight": "491ec3a4da9b4f49f8ebc6be658ce397a9b801ae9fb35e82177e47808c65e5d0",
"blk.26.ffn_norm.weight": "fd7059d75d7f0e5288511ddeeb0f772eb3cae3ccfe4226b877015834edc3c386",
"blk.26.ffn_up.weight": "ea1ee1274c56458ce056d2205e5bb6e5422ce4cb0ad58006b8141749b97a0c39",
"blk.27.attn_norm.weight": "cc362c9a937609265052cd38544af17a1a7448cea086d4c801139e1fc865832d",
"blk.27.attn_output.weight": "ba757a81dabde9cb1b069d1bb616fe79649a1724f756567ec61caed1304fe6cf",
"blk.27.attn_qkv.weight": "1ab8d7d02d87756c12c2275636823aa5ede3d683178225c4cac4bd892c319bd4",
"blk.27.ffn_down.weight": "deb1c711c8a66acf4dcd2d088e1548f8e08f296f755e4067d6557fa55afde88c",
"blk.27.ffn_norm.weight": "fc6242d8cb8a4a37a8ddb7e41e7e60a63d4a89edf36acb35df052f10b9c91ece",
"blk.27.ffn_up.weight": "8df39b09c4801f343aca78f2918a1f6db78c8c55e591eda4c69eadb74c26e180",
"blk.28.attn_norm.weight": "75b539308f77e3cefdc6d98484d8b5cbf0538f0c2869a77b7373a145a18bc850",
"blk.28.attn_output.weight": "ae128940eb60a6d2e121762ef4b3e9dcf9eb3e105b249507fa7f12de0e19822c",
"blk.28.attn_qkv.weight": "bdda781c288e9326c240e33905f8e621b6a2ad902e620739d34f93fcd6f933de",
"blk.28.ffn_down.weight": "f1d6e6d1c286b1138bfd7e53fe477f399ae93bc2c04e35416f84218ed7247965",
"blk.28.ffn_norm.weight": "3f837ce82c8b9bde0d61d08b6f5fe5574886ea5328dbdc53f2929f18da8b4087",
"blk.28.ffn_up.weight": "2af027002e31d1b6cfedbdb30a2b9d7213f3aa691167c353913adfd48fda31e4",
"blk.29.attn_norm.weight": "61e8003b5329462ffe0fe172f2b160260de006aed858332d49d75504b6b6aa7a",
"blk.29.attn_output.weight": "ca44542a72a37476dc73dbdcc01f5b7497cb3ebc4ea230a55c9634ccd8e56ad4",
"blk.29.attn_qkv.weight": "abb3d9d6abe57872ae3daa51935d43264093ded5ce63b49d1e280ee5758be0e4",
"blk.29.ffn_down.weight": "6764b895fce881df097489c263446f0106de36217997660c15984b3ee22a5a06",
"blk.29.ffn_norm.weight": "89e03e9a33fc0e6e31ba9f0c2bd7c5734a118c5602bb90148793e08a80e8d0ae",
"blk.29.ffn_up.weight": "fa7ad57a84954f4121653152efed1a871d8adb20a1ea9086e3e849ce359d7d2e",
"blk.30.attn_norm.weight": "91a697aca1e42af54f806a20211031c3369e8d0bd58df1b0147fe24954e1f5a4",
"blk.30.attn_output.weight": "36063fcf766c89ac75be56f688cc63cefe5f2c733fbf4378ea9956ad386fa148",
"blk.30.attn_qkv.weight": "2cacd1161f1121a2c0b979930134f4666f73fb8d7237b3b0659ae091b15955a6",
"blk.30.ffn_down.weight": "9f3fcb6217100595850c05dc98f9ab2a263afdb6ab28df2fcb08aeff512057d7",
"blk.30.ffn_norm.weight": "6c600bc1fc7de39d4f8917b81fc7d1d5ed2a9b56492234c13a4bd6028c30d880",
"blk.30.ffn_up.weight": "73cabd1bb011956b2689ea3338bb76642ef3a57c197377d666d2ab5f56317668",
"blk.31.attn_norm.weight": "72d3e1cc771380645fa75a899858c95f39857a4f3f1ed60fe1578df383b8bc53",
"blk.31.attn_output.weight": "40089cdd29994dc19a1d89fa15902a89cfeca3540f12dc9bf4d00ef82506e456",
"blk.31.attn_qkv.weight": "1d0bb40e9258071ae14290a53c619a8e331dda07354d2a02ef45766c029ae5e4",
"blk.31.ffn_down.weight": "8defa0e06335b793fa8be03883f0a322d6c5b33f52c69c943c35c60d16e42c0a",
"blk.31.ffn_norm.weight": "33c55d9d0c496ccfb130361fe131649346e098abaaac39c0519507e5d846721d",
"blk.31.ffn_up.weight": "599f6503f61c692c1f82001973d35119f9688db5e6be9d9c298411491c93f09b",
"output.weight": "14b8dc662bfa3308ebb2e102c562d8e52c15670e538f20f3216a9c310ca9dd41",
"output_norm.weight": "7f2294ba94ce65681df6c7ddd8698799199b9d77dc83c10bdad5c3999f0fdb82",
"rope_factors_long.weight": "e34d378664e354652c38f47d10dafb0498ccc2fb042d39ff7fef768146fff22b",
"rope_factors_short.weight": "9379146a4988f373d362fe47b06c75e7fe7c54aa4dc9558758df79b7a87471fd",
"token_embd.weight": "19a03c1fb5ac0baee93b0a7d8b0f26e9a9b011e229b694afc50ebfc13d84f8bf"
}

124
convert/testdata/all-MiniLM-L6-v2.json vendored Normal file
View File

@@ -0,0 +1,124 @@
{
"general.architecture": "bert",
"general.file_type": "1",
"general.quantization_version": "2",
"bert.attention.causal": "false",
"bert.attention.head_count": "12",
"bert.attention.layer_norm_epsilon": "1e-12",
"bert.block_count": "6",
"bert.context_length": "512",
"bert.embedding_length": "384",
"bert.feed_forward_length": "1536",
"bert.pooling_type": "1",
"tokenizer.ggml.model": "bert",
"tokenizer.ggml.padding_token_id": "0",
"tokenizer.ggml.unknown_token_id": "100",
"tokenizer.ggml.cls_token_id": "101",
"tokenizer.ggml.seperator_token_id": "102",
"tokenizer.ggml.mask_token_id": "103",
"tokenizer.ggml.token_type_count": "2",
"tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
"tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
"tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
"token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
"position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
"token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
"token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
"token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
"blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
"blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
"blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
"blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
"blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
"blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
"blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
"blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
"blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
"blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
"blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
"blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
"blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
"blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
"blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
"blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
"blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
"blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
"blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
"blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
"blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
"blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
"blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
"blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
"blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
"blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
"blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
"blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
"blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
"blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
"blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
"blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
"blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
"blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
"blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
"blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
"blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
"blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
"blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
"blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
"blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
"blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
"blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
"blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
"blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
"blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
"blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
"blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
"blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
"blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
"blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
"blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
"blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
"blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
"blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
"blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
"blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
"blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
"blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
"blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
"blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
"blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
"blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
"blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
"blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
"blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
"blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
"blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
"blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
"blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
"blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
"blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
"blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
"blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
"blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
"blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
"blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
"blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
"blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
"blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
"blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
"blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
"blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
"blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
"blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
"blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
"blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
"blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
"blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
"blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
"blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
"blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
"blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
"blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
}

6
convert/testdata/gemma-2-9b-it.json vendored Normal file
View File

@@ -0,0 +1,6 @@
{
"general.architecture": "gemma2",
"gemma2.attention.sliding_window": "4096",
"gemma2.attn_logit_softcapping": "50",
"gemma2.final_logit_softcapping": "30"
}

View File

@@ -1,7 +1,6 @@
package convert
import (
"cmp"
"crypto/sha256"
"encoding/hex"
"encoding/json"
@@ -11,6 +10,8 @@ import (
"log/slog"
"os"
"slices"
"golang.org/x/exp/maps"
)
const (
@@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
return nil, err
}
var tokens []token
tokens := make(map[int]token, len(t.Model.Vocab))
for k, v := range t.Model.Vocab {
tokens = append(tokens, token{
tokens[v] = token{
ID: v,
Content: k,
})
}
}
for _, t := range t.AddedTokens {
t.UserDefined = true
tokens = append(tokens, t)
for _, token := range t.AddedTokens {
token.UserDefined = true
tokens[token.ID] = token
}
slices.SortFunc(tokens, func(i, j token) int {
return cmp.Compare(i.ID, j.ID)
})
keys := maps.Keys(tokens)
slices.Sort(keys)
v := Vocabulary{Model: "gpt2"}
for _, t := range tokens {
v.Tokens = append(v.Tokens, t.Content)
v.Scores = append(v.Scores, float32(t.ID))
for _, k := range keys {
token := tokens[k]
v.Tokens = append(v.Tokens, token.Content)
v.Scores = append(v.Scores, float32(token.ID))
switch {
case t.Special:
case token.Special:
v.Types = append(v.Types, tokenTypeControl)
case t.UserDefined:
case token.UserDefined:
v.Types = append(v.Types, tokenTypeUserDefined)
default:
v.Types = append(v.Types, tokenTypeNormal)

View File

@@ -15,6 +15,11 @@ import (
)
func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
ast, err := parseAdditionalSpecialTokens(fsys)
if err != nil {
return nil, err
}
bts, err := fs.ReadFile(fsys, "tokenizer.model")
if err != nil {
return nil, err
@@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
sentencepiece.ModelProto_SentencePiece_BYTE:
v.Types = append(v.Types, int32(t))
default:
v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL))
tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
if slices.Contains(ast, piece.GetPiece()) {
tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
}
v.Types = append(v.Types, tt)
}
}
@@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
return &v, nil
}
// parseAdditionalSpecialTokens reads the optional special_tokens_map.json from
// fsys and returns the entries of its "additional_special_tokens" list.
//
// A missing file is not an error and yields (nil, nil). Each list entry may be
// either a plain JSON string or a JSON object carrying the token text in a
// "content" field; both forms are reduced to the token text. Any other entry
// shape, or malformed JSON, results in an error.
func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
	f, err := fsys.Open("special_tokens_map.json")
	if err != nil {
		// The map file is optional; only its absence is tolerated.
		if errors.Is(err, os.ErrNotExist) {
			return nil, nil
		}
		return nil, err
	}
	defer f.Close()

	// Entries are kept raw so each one can be decoded as a string or an object.
	var m struct {
		AdditionalSpecialTokens []json.RawMessage `json:"additional_special_tokens"`
	}
	if err := json.NewDecoder(f).Decode(&m); err != nil {
		return nil, err
	}

	// Preserve the nil result when the key is absent or null.
	if m.AdditionalSpecialTokens == nil {
		return nil, nil
	}

	tokens := make([]string, 0, len(m.AdditionalSpecialTokens))
	for _, raw := range m.AdditionalSpecialTokens {
		var s string
		if err := json.Unmarshal(raw, &s); err == nil {
			tokens = append(tokens, s)
			continue
		}

		// Not a string: fall back to the object form and take its content.
		var obj struct {
			Content string `json:"content"`
		}
		if err := json.Unmarshal(raw, &obj); err != nil {
			return nil, err
		}
		tokens = append(tokens, obj.Content)
	}

	return tokens, nil
}

View File

@@ -669,7 +669,7 @@ curl http://localhost:11434/api/chat -d '{
```
curl http://localhost:11434/api/chat -d '{
"model": "mistral",
"model": "llama3.1",
"messages": [
{
"role": "user",
@@ -708,7 +708,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
"model": "mistral:7b-instruct-v0.3-q4_K_M",
"model": "llama3.1",
"created_at": "2024-07-22T20:33:28.123648Z",
"message": {
"role": "assistant",
@@ -1175,7 +1175,10 @@ curl http://localhost:11434/api/embed -d '{
"embeddings": [[
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
]]
]],
"total_duration": 14143917,
"load_duration": 1019500,
"prompt_eval_count": 8
}
```

View File

@@ -1,71 +1,71 @@
# Ollama Docker image
### CPU only
```bash
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
### Nvidia GPU
Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
#### Install with Apt
1. Configure the repository
```bash
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
| sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
```
2. Install the NVIDIA Container Toolkit packages
```bash
sudo apt-get install -y nvidia-container-toolkit
```
#### Install with Yum or Dnf
1. Configure the repository
```bash
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
```
2. Install the NVIDIA Container Toolkit packages
```bash
sudo yum install -y nvidia-container-toolkit
```
#### Configure Docker to use Nvidia driver
```
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```
#### Start the container
```bash
docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
### AMD GPU
To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
```
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
```
### Run model locally
Now you can run a model:
```
docker exec -it ollama ollama run llama3.1
```
### Try different models
More models can be found on the [Ollama library](https://ollama.com/library).
# Ollama Docker image
### CPU only
```bash
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
### Nvidia GPU
Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
#### Install with Apt
1. Configure the repository
```bash
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
| sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
```
2. Install the NVIDIA Container Toolkit packages
```bash
sudo apt-get install -y nvidia-container-toolkit
```
#### Install with Yum or Dnf
1. Configure the repository
```bash
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
```
2. Install the NVIDIA Container Toolkit packages
```bash
sudo yum install -y nvidia-container-toolkit
```
#### Configure Docker to use Nvidia driver
```
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```
#### Start the container
```bash
docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
### AMD GPU
To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
```
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
```
### Run model locally
Now you can run a model:
```
docker exec -it ollama ollama run llama3.1
```
### Try different models
More models can be found on the [Ollama library](https://ollama.com/library).

View File

@@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables.
## How do I use Ollama behind a proxy?
Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
> [!NOTE]
> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
### How do I use Ollama behind a proxy in Docker?
@@ -276,4 +279,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
## How does Ollama load models on multiple GPUs?
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.

BIN
docs/images/ollama-keys.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 141 KiB

BIN
docs/images/signup.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 80 KiB

View File

@@ -1,42 +1,129 @@
# Import
# Importing a model
GGUF models and select Safetensors models can be imported directly into Ollama.
## Table of Contents
## Import GGUF
* [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights)
* [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights)
* [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter)
* [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom)
A binary GGUF file can be imported directly into Ollama through a Modelfile.
## Importing a fine tuned adapter from Safetensors weights
First, create a `Modelfile` with a `FROM` command pointing at the base model you used for fine tuning, and an `ADAPTER` command which points to the directory with your Safetensors adapter:
```dockerfile
FROM /path/to/file.gguf
FROM <base model name>
ADAPTER /path/to/safetensors/adapter/directory
```
## Import Safetensors
Make sure that you use the same base model in the `FROM` command as you used to create the adapter, otherwise you will get erratic results. Most frameworks use different quantization methods, so it's best to use non-quantized (i.e. non-QLoRA) adapters. If your adapter is in the same directory as your `Modelfile`, use `ADAPTER .` to specify the adapter path.
If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile:
Now run `ollama create` from the directory where the `Modelfile` was created:
- LlamaForCausalLM
- MistralForCausalLM
- GemmaForCausalLM
```bash
ollama create my-model
```
Lastly, test the model:
```bash
ollama run my-model
```
Ollama supports importing adapters based on several different model architectures including:
* Llama (including Llama 2, Llama 3, and Llama 3.1);
* Mistral (including Mistral 1, Mistral 2, and Mixtral); and
* Gemma (including Gemma 1 and Gemma 2)
You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
* Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
* [Unsloth](https://github.com/unslothai/unsloth)
* [MLX](https://github.com/ml-explore/mlx)
## Importing a model from Safetensors weights
First, create a `Modelfile` with a `FROM` command which points to the directory containing your Safetensors weights:
```dockerfile
FROM /path/to/safetensors/directory
```
For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf).
If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.
## Automatic Quantization
Now run the `ollama create` command from the directory where you created the `Modelfile`:
> [!NOTE]
> Automatic quantization requires v0.1.35 or higher.
```shell
ollama create my-model
```
Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`.
Lastly, test the model:
```shell
ollama run my-model
```
Ollama supports importing models for several different architectures including:
* Llama (including Llama 2, Llama 3, and Llama 3.1);
* Mistral (including Mistral 1, Mistral 2, and Mixtral);
* Gemma (including Gemma 1 and Gemma 2); and
* Phi3
This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
## Importing a GGUF based model or adapter
If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
* converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp;
* converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
* downloading a model or adapter from a place such as HuggingFace
To import a GGUF model, create a `Modelfile` containing:
```dockerfile
FROM /path/to/file.gguf
```
For a GGUF adapter, create the `Modelfile` with:
```dockerfile
FROM <model name>
ADAPTER /path/to/file.gguf
```
When importing a GGUF adapter, it's important to use the same base model as the base model that the adapter was created with. You can use:
* a model from Ollama
* a GGUF file
* a Safetensors based model
Once you have created your `Modelfile`, use the `ollama create` command to build the model.
```shell
ollama create my-model
```
## Quantizing a Model
Quantizing a model allows you to run models faster and with less memory consumption but at reduced accuracy. This allows you to run a model on more modest hardware.
Ollama can quantize FP16 and FP32 based models into different quantization levels using the `-q/--quantize` flag with the `ollama create` command.
First, create a Modelfile with the FP16 or FP32 based model you wish to quantize.
```dockerfile
FROM /path/to/my/gemma/f16/model
```
Use `ollama create` to then create the quantized model.
```shell
$ ollama create -q Q4_K_M mymodel
$ ollama create --quantize q4_K_M mymodel
transferring model data
quantizing F16 model to Q4_K_M
creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd
@@ -47,42 +134,53 @@ success
### Supported Quantizations
- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
- `q4_0`
- `q4_1`
- `q5_0`
- `q5_1`
- `q8_0`
#### K-means Quantizations
- `Q3_K_S`
- `Q3_K_M`
- `Q3_K_L`
- `Q4_K_S`
- `Q4_K_M`
- `Q5_K_S`
- `Q5_K_M`
- `Q6_K`
- `q3_K_S`
- `q3_K_M`
- `q3_K_L`
- `q4_K_S`
- `q4_K_M`
- `q5_K_S`
- `q5_K_M`
- `q6_K`
## Template Detection
> [!NOTE]
> Template detection requires v0.1.42 or higher.
## Sharing your model on ollama.com
Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing.
You can share any model you have created by pushing it to [ollama.com](https://ollama.com) so that other users can try it out.
```dockerfile
FROM /path/to/my/gemma/model
```
First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.
![Sign-Up](images/signup.png)
The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.
Now that you have created an account and are signed-in, go to the [Ollama Keys Settings](https://ollama.com/settings/keys) page.
Follow the directions on the page to determine where your Ollama Public Key is located.
![Ollama Key](images/ollama-keys.png)
Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.
To push a model to [ollama.com](https://ollama.com), first make sure that it is named correctly with your username. You may have to use the `ollama cp` command to copy
your model to give it the correct name. Once you're happy with your model's name, use the `ollama push` command to push it to [ollama.com](https://ollama.com).
```shell
$ ollama create mymodel
transferring model data
using autodetected template gemma-instruct
creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb
writing manifest
success
ollama cp mymodel myuser/mymodel
ollama push myuser/mymodel
```
Once your model has been pushed, other users can pull and run it by using the command:
```shell
ollama run myuser/mymodel
```
Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one.

View File

@@ -20,13 +20,12 @@ GPU.
## Manual install
### Download the `ollama` binary
### Download `ollama`
Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
Download and extract the Linux package:
```bash
sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
```
### Adding Ollama as a startup service (recommended)
@@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh
Or by downloading the ollama binary:
```bash
sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
```
## Installing specific versions

View File

@@ -28,13 +28,35 @@ chat_completion = client.chat.completions.create(
model='llama3',
)
response = client.chat.completions.create(
model="llava",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDS
sxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPq
smVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
},
],
}
],
max_tokens=300,
)
completion = client.completions.create(
model="llama3",
prompt="Say this is a test",
)
list_completion = client.models.list()
model = client.models.retrieve("llama3")
embeddings = client.embeddings.create(
model="all-minilm",
input=["why is the sky blue?", "why is the grass green?"]
input=["why is the sky blue?", "why is the grass green?"],
)
```
@@ -51,23 +73,44 @@ const openai = new OpenAI({
})
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama3',
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama3',
})
const response = await openai.chat.completions.create({
model: "llava",
messages: [
{
role: "user",
content: [
{ type: "text", text: "What's in this image?" },
{
type: "image_url",
image_url: "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsx
NHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsm
VesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
},
],
},
],
})
const completion = await openai.completions.create({
model: "llama3",
prompt: "Say this is a test.",
})
const listCompletion = await openai.models.list()
const model = await openai.models.retrieve("llama3");
const model = await openai.models.retrieve("llama3")
const embedding = await openai.embeddings.create({
model: "all-minilm",
input: ["why is the sky blue?", "why is the grass green?"],
});
})
```
### `curl`
```
``` shell
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
@@ -84,6 +127,37 @@ curl http://localhost:11434/v1/chat/completions \
]
}'
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llava",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What'\''s in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHog
zFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesK
a49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"
}
}
]
}
],
"max_tokens": 300
}'
curl http://localhost:11434/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3",
"prompt": "Say this is a test"
}'
curl http://localhost:11434/v1/models
curl http://localhost:11434/v1/models/llama3
@@ -106,8 +180,8 @@ curl http://localhost:11434/v1/embeddings \
- [x] Streaming
- [x] JSON mode
- [x] Reproducible outputs
- [x] Vision
- [x] Tools (streaming support coming soon)
- [ ] Vision
- [ ] Logprobs
#### Supported request fields
@@ -115,7 +189,10 @@ curl http://localhost:11434/v1/embeddings \
- [x] `model`
- [x] `messages`
- [x] Text `content`
- [ ] Array of `content` parts
- [x] Image `content`
- [x] Base64 encoded image
- [ ] Image URL
- [x] Array of `content` parts
- [x] `frequency_penalty`
- [x] `presence_penalty`
- [x] `response_format`
@@ -131,6 +208,39 @@ curl http://localhost:11434/v1/embeddings \
- [ ] `user`
- [ ] `n`
### `/v1/completions`
#### Supported features
- [x] Completions
- [x] Streaming
- [x] JSON mode
- [x] Reproducible outputs
- [ ] Logprobs
#### Supported request fields
- [x] `model`
- [x] `prompt`
- [x] `frequency_penalty`
- [x] `presence_penalty`
- [x] `seed`
- [x] `stop`
- [x] `stream`
- [x] `temperature`
- [x] `top_p`
- [x] `max_tokens`
- [x] `suffix`
- [ ] `best_of`
- [ ] `echo`
- [ ] `logit_bias`
- [ ] `user`
- [ ] `n`
#### Notes
- `prompt` currently only accepts a string
### `/v1/models`
#### Notes

View File

@@ -112,15 +112,9 @@ Keep the following tips and best practices in mind when working with Go template
ChatML is a popular template format. It can be used for models such as Databricks' DBRX, Intel's Neural Chat, and Microsoft's Orca 2.
```gotmpl
{{- if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}
{{- range .Messages }}<|im_start|>{{ .Role }}
{{ .Content }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ else }}
{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
```
### Example Tools

View File

@@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log
On **Linux** systems with systemd, the logs can be found with this command:
```shell
journalctl -u ollama
journalctl -u ollama --no-pager
```
When you run Ollama in a **container**, the logs go to stdout/stderr in the container:

View File

@@ -174,7 +174,7 @@ func RunnersDir() (p string) {
defer func() {
if p == "" {
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
}
}()
@@ -190,17 +190,17 @@ func RunnersDir() (p string) {
}
var paths []string
for _, root := range []string{filepath.Dir(exe), cwd} {
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
paths = append(paths,
root,
filepath.Join(root, "windows-"+runtime.GOARCH),
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, path := range paths {
candidate := filepath.Join(path, "ollama_runners")
candidate := filepath.Join(path, "lib", "ollama", "runners")
if _, err := os.Stat(candidate); err == nil {
p = candidate
break

View File

@@ -3,6 +3,7 @@ package format
import (
"fmt"
"math"
"strconv"
)
const (
@@ -28,6 +29,6 @@ func HumanNumber(b uint64) string {
case b >= Thousand:
return fmt.Sprintf("%.0fK", float64(b)/Thousand)
default:
return fmt.Sprintf("%d", b)
return strconv.FormatUint(b, 10)
}
}

2
go.mod
View File

@@ -1,6 +1,6 @@
module github.com/ollama/ollama
go 1.22.0
go 1.22.5
require (
github.com/containerd/console v1.0.3

View File

@@ -3,7 +3,7 @@
package gpu
import (
"fmt"
"errors"
"log/slog"
"os"
"path/filepath"
@@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
@@ -95,5 +95,5 @@ func commonAMDValidateLibDir() (string, error) {
}
}
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
return "", errors.New("no suitable rocm found, falling back to CPU")
}

View File

@@ -1,6 +1,7 @@
package gpu
import (
"errors"
"fmt"
"log/slog"
"syscall"
@@ -76,7 +77,7 @@ func (hl *HipLib) Release() {
func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
if hl.dll == 0 {
return 0, 0, fmt.Errorf("dll has been unloaded")
return 0, 0, errors.New("dll has been unloaded")
}
var version int
status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
@@ -110,7 +111,7 @@ func (hl *HipLib) HipGetDeviceCount() int {
func (hl *HipLib) HipSetDevice(device int) error {
if hl.dll == 0 {
return fmt.Errorf("dll has been unloaded")
return errors.New("dll has been unloaded")
}
status, _, err := syscall.SyscallN(hl.hipSetDevice, uintptr(device))
if status != hipSuccess {
@@ -121,7 +122,7 @@ func (hl *HipLib) HipSetDevice(device int) error {
func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, error) {
if hl.dll == 0 {
return nil, fmt.Errorf("dll has been unloaded")
return nil, errors.New("dll has been unloaded")
}
var props hipDevicePropMinimal
status, _, err := syscall.SyscallN(hl.hipGetDeviceProperties, uintptr(unsafe.Pointer(&props)), uintptr(device))
@@ -134,7 +135,7 @@ func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, err
// free, total, err
func (hl *HipLib) HipMemGetInfo() (uint64, uint64, error) {
if hl.dll == 0 {
return 0, 0, fmt.Errorf("dll has been unloaded")
return 0, 0, errors.New("dll has been unloaded")
}
var totalMemory uint64
var freeMemory uint64

View File

@@ -393,7 +393,7 @@ func AMDValidateLibDir() (string, error) {
// If we still haven't found a usable rocm, the user will have to install it on their own
slog.Warn("amdgpu detected, but no compatible rocm library found. Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
return "", errors.New("no suitable rocm found, falling back to CPU")
}
func AMDDriverVersion() (driverMajor, driverMinor int, err error) {

View File

@@ -2,7 +2,7 @@ package gpu
import (
"bytes"
"fmt"
"errors"
"log/slog"
"os"
"path/filepath"
@@ -85,7 +85,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
n = bytes.IndexByte(props.GcnArchName[:], 0)
gfx := string(props.GcnArchName[:n])
slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
// TODO Why isn't props.iGPU accurate!?
if strings.EqualFold(name, iGPUName) {
slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
// Installer payload (if we're running from some other location)
localAppData := os.Getenv("LOCALAPPDATA")
appDir := filepath.Join(localAppData, "Programs", "Ollama")
rocmTargetDir := filepath.Join(appDir, "rocm")
rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
return rocmTargetDir, nil
@@ -161,7 +161,7 @@ func AMDValidateLibDir() (string, error) {
// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
return "", errors.New("no suitable rocm found, falling back to CPU")
}
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {

View File

@@ -42,20 +42,16 @@ func PayloadsDir() (string, error) {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
} else {
err = os.MkdirAll(tmpDir, 0755)
err = os.MkdirAll(tmpDir, 0o755)
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
}
}
// Track our pid so we can clean up orphaned tmpdirs
pidFilePath := filepath.Join(tmpDir, "ollama.pid")
pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
if err != nil {
return "", err
}
if _, err := pidFile.Write([]byte(fmt.Sprint(os.Getpid()))); err != nil {
return "", err
n := filepath.Join(tmpDir, "ollama.pid")
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
}
// We create a distinct subdirectory for payloads within the tmpdir
@@ -67,37 +63,44 @@ func PayloadsDir() (string, error) {
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*"))
matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, d := range dirs {
info, err := os.Stat(d)
if err != nil || !info.IsDir() {
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err != nil {
slog.Warn("failed to read ollama.pid", "path", d, "error", err)
// No pid, ignore this tmpdir
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("failed to parse pid", "path", d, "error", err)
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
proc, err := os.FindProcess(pid)
if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("found running ollama", "pid", pid, "path", d)
// Another running ollama, ignore this tmpdir
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(d); err != nil {
slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
}
}

View File

@@ -1,6 +1,11 @@
package gpu
import (
"os"
"path/filepath"
"runtime"
"strings"
"golang.org/x/sys/cpu"
)
@@ -14,3 +19,19 @@ func GetCPUCapability() CPUCapability {
// else LCD
return CPUCapabilityNone
}
// IsNUMA reports whether more than one distinct physical CPU package id is
// visible under /sys/devices/system/cpu. It always returns false when the
// host OS is not Linux.
func IsNUMA() bool {
	// numa support in llama.cpp is linux only
	if runtime.GOOS != "linux" {
		return false
	}

	// The glob error is intentionally ignored: no matches simply means no
	// package ids get collected below.
	topologyFiles, _ := filepath.Glob("/sys/devices/system/cpu/cpu*/topology/physical_package_id")

	// Set of distinct package ids, keyed by the trimmed file contents.
	packages := make(map[string]struct{}, len(topologyFiles))
	for _, path := range topologyFiles {
		raw, err := os.ReadFile(path)
		if err != nil {
			// Unreadable entries do not contribute an id.
			continue
		}
		packages[strings.TrimSpace(string(raw))] = struct{}{}
	}
	return len(packages) > 1
}

View File

@@ -4,9 +4,17 @@ package gpu
import (
"log/slog"
"os"
"regexp"
"runtime"
"strconv"
"strings"
)
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
@@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
}
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
}
// cudaVariant picks the CUDA runner variant name for the given GPU.
//
// On linux/arm64 it first looks for a Jetson device: a non-empty CudaTegra
// value yields "jetpack" followed by its leading version component; otherwise
// the L4T release number in /etc/nv_tegra_release is mapped to "jetpack5" (35)
// or "jetpack6" (36). In every other case the result is "v11" when the compute
// major is below 6 or the driver major is below 12, and "v12" otherwise.
func cudaVariant(gpuInfo CudaGPUInfo) string {
	if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" {
		if CudaTegra != "" {
			// Only the text before the first "." is used.
			major, _, _ := strings.Cut(CudaTegra, ".")
			return "jetpack" + major
		}

		if release, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
			match := regexp.MustCompile(` R(\d+) `).FindSubmatch(release)
			if len(match) != 2 {
				slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
			} else if l4tMajor, err := strconv.Atoi(string(match[1])); err == nil {
				// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
				// https://developer.nvidia.com/embedded/jetpack-archive
				switch l4tMajor {
				case 35:
					return "jetpack5"
				case 36:
					return "jetpack6"
				default:
					slog.Info("unsupported L4T version", "nv_tegra_release", string(release))
				}
			}
		}
	}

	// No Jetson variant was selected: choose by compute and driver major versions.
	if gpuInfo.computeMajor >= 6 && gpuInfo.DriverMajor >= 12 {
		return "v12"
	}
	return "v11"
}

View File

@@ -7,9 +7,9 @@ package gpu
#cgo windows LDFLAGS: -lpthread
#include "gpu_info.h"
*/
import "C"
import (
"fmt"
"log/slog"
@@ -64,13 +64,8 @@ var RocmComputeMin = 9
// TODO find a better way to detect iGPU instead of minimum memory
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
// Note: gpuMutex must already be held
func initCudaHandles() *cudaHandles {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
cHandles := &cudaHandles{}
@@ -211,14 +206,16 @@ func GetGPUInfo() GpuInfoList {
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
cpus = []CPUInfo{CPUInfo{
GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu",
Variant: cpuCapability,
ID: "0",
cpus = []CPUInfo{
{
GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu",
Variant: cpuCapability.String(),
ID: "0",
},
},
}}
}
// Fallback to CPU mode if we're lacking required vector extensions on x86
if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
@@ -228,11 +225,7 @@ func GetGPUInfo() GpuInfoList {
return GpuInfoList{cpus[0].GpuInfo}
}
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
}
depPath := LibraryDir()
// Load ALL libraries
cHandles = initCudaHandles()
@@ -268,11 +261,23 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.computeMajor = int(memInfo.major)
gpuInfo.computeMinor = int(memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
variant := cudaVariant(gpuInfo)
if depPath != "" {
gpuInfo.DependencyPath = depPath
// Check for variant specific directory
if variant != "" {
if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
}
}
}
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.Variant = variant
// query the management library as well so we can record any skew between the two
// which represents overhead on the GPU we must set aside on subsequent updates
@@ -304,38 +309,34 @@ func GetGPUInfo() GpuInfoList {
// Intel
if envconfig.IntelGPU() {
oHandles = initOneAPIHandles()
// On windows we bundle the oneapi library one level above the runner dir
depPath = ""
if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
}
for d := range oHandles.oneapi.num_drivers {
if oHandles.oneapi == nil {
// shouldn't happen
slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
continue
}
devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
for i := range devCount {
gpuInfo := OneapiGPUInfo{
GpuInfo: GpuInfo{
Library: "oneapi",
},
driverIndex: int(d),
gpuIndex: int(i),
if oHandles != nil && oHandles.oneapi != nil {
for d := range oHandles.oneapi.num_drivers {
if oHandles.oneapi == nil {
// shouldn't happen
slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
continue
}
devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
for i := range devCount {
gpuInfo := OneapiGPUInfo{
GpuInfo: GpuInfo{
Library: "oneapi",
},
driverIndex: int(d),
gpuIndex: int(i),
}
// TODO - split bootstrapping from updating free memory
C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
// TODO - convert this to MinimumMemory based on testing...
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
memInfo.free = C.uint64_t(totalFreeMem)
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DependencyPath = depPath
oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
// TODO - split bootstrapping from updating free memory
C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
// TODO - convert this to MinimumMemory based on testing...
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
memInfo.free = C.uint64_t(totalFreeMem)
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DependencyPath = depPath
oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
}
}
@@ -463,10 +464,12 @@ func GetGPUInfo() GpuInfoList {
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
var patterns []string
gpuLibPaths := []string{}
slog.Debug("Searching for GPU library", "name", baseLibName)
// Start with our bundled libraries
patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
switch runtime.GOOS {
case "windows":
ldPaths = strings.Split(os.Getenv("PATH"), ";")
@@ -475,13 +478,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
default:
return gpuLibPaths
}
// Start with whatever we find in the PATH/LD_LIBRARY_PATH
// Then with whatever we find in the PATH/LD_LIBRARY_PATH
for _, ldPath := range ldPaths {
d, err := filepath.Abs(ldPath)
if err != nil {
continue
}
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
patterns = append(patterns, filepath.Join(d, baseLibName))
}
patterns = append(patterns, defaultPatterns...)
slog.Debug("gpu library search", "globs", patterns)
@@ -637,3 +641,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
return "", ""
}
}
// LibraryDir returns the first existing "lib/ollama" directory found by
// probing, in order, the executable's directory, its parent, and the current
// working directory. Under each root it tries the bundled layout first, then
// the local-build layouts "<GOOS>-<GOARCH>/lib/ollama" and
// "dist/<GOOS>-<GOARCH>/lib/ollama". It returns "" (after logging a warning)
// when nothing matches.
func LibraryDir() string {
	exePath, err := os.Executable()
	if err != nil {
		slog.Warn("failed to lookup executable path", "error", err)
	}
	workDir, err := os.Getwd()
	if err != nil {
		slog.Warn("failed to lookup working directory", "error", err)
	}

	exeDir := filepath.Dir(exePath)
	platform := runtime.GOOS + "-" + runtime.GOARCH
	bundle := filepath.Join("lib", "ollama")

	// Scan for any of our dependencies and pick the first match.
	for _, root := range []string{exeDir, filepath.Join(exeDir, ".."), workDir} {
		candidates := []string{
			// Dependencies bundled relative to the root itself
			filepath.Join(root, bundle),
			// Developer mode, local build
			filepath.Join(root, platform, bundle),
			filepath.Join(root, "dist", platform, bundle),
		}
		for _, dir := range candidates {
			if _, err := os.Stat(dir); err == nil {
				return dir
			}
		}
	}

	slog.Warn("unable to locate gpu dependency libraries")
	return ""
}

View File

@@ -8,6 +8,7 @@ package gpu
#include "gpu_info_darwin.h"
*/
import "C"
import (
"runtime"
@@ -24,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{
{
Library: "cpu",
Variant: GetCPUCapability(),
Variant: GetCPUCapability().String(),
memInfo: mem,
},
}
@@ -47,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
return []GpuInfo{
{
Library: "cpu",
Variant: GetCPUCapability(),
Variant: GetCPUCapability().String(),
memInfo: mem,
},
}

View File

@@ -67,4 +67,4 @@ void cpu_check_ram(mem_info_t *resp);
#include "gpu_info_oneapi.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__
#endif // __APPLE__

View File

@@ -43,10 +43,12 @@ var OneapiGlobs = []string{
"/usr/lib*/libze_intel_gpu.so*",
}
var CudartMgmtName = "libcudart.so*"
var NvcudaMgmtName = "libcuda.so*"
var NvmlMgmtName = "" // not currently wired on linux
var OneapiMgmtName = "libze_intel_gpu.so"
var (
CudartMgmtName = "libcudart.so*"
NvcudaMgmtName = "libcuda.so*"
NvmlMgmtName = "" // not currently wired on linux
OneapiMgmtName = "libze_intel_gpu.so*"
)
func GetCPUMem() (memInfo, error) {
var mem memInfo

View File

@@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) {
}
}
// TestByLibrary verifies the number of groups ByLibrary returns for different
// combinations of Library and Variant values.
func TestByLibrary(t *testing.T) {
	scenarios := map[string]struct {
		input  []GpuInfo
		expect int
	}{
		"empty":     {input: []GpuInfo{}, expect: 0},
		"cpu":       {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
		"cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
		"cpu + 2 GPU no variant": {
			input:  []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}},
			expect: 2,
		},
		"cpu + 2 GPU same variant": {
			input: []GpuInfo{
				{Library: "cpu"},
				{Library: "cuda", Variant: "v11"},
				{Library: "cuda", Variant: "v11"},
			},
			expect: 2,
		},
		"cpu + 2 GPU diff variant": {
			input: []GpuInfo{
				{Library: "cpu"},
				{Library: "cuda", Variant: "v11"},
				{Library: "cuda", Variant: "v12"},
			},
			expect: 3,
		},
	}

	for name, sc := range scenarios {
		t.Run(name, func(t *testing.T) {
			groups := GpuInfoList(sc.input).ByLibrary()
			if got := len(groups); got != sc.expect {
				t.Fatalf("expected length %d, got %d => %+v", sc.expect, got, groups)
			}
		})
	}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected

View File

@@ -40,10 +40,12 @@ var OneapiGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}
var CudartMgmtName = "cudart64_*.dll"
var NvcudaMgmtName = "nvcuda.dll"
var NvmlMgmtName = "nvml.dll"
var OneapiMgmtName = "ze_intel_gpu64.dll"
var (
CudartMgmtName = "cudart64_*.dll"
NvcudaMgmtName = "nvcuda.dll"
NvmlMgmtName = "nvml.dll"
OneapiMgmtName = "ze_intel_gpu64.dll"
)
func GetCPUMem() (memInfo, error) {
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}

View File

@@ -19,7 +19,7 @@ type GpuInfo struct {
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant CPUCapability `json:"variant"`
Variant string `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
@@ -53,8 +53,10 @@ type CPUInfo struct {
type CudaGPUInfo struct {
GpuInfo
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
computeMajor int //nolint:unused,nolintlint
computeMinor int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo
@@ -81,8 +83,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
for _, info := range l {
found := false
requested := info.Library
if info.Variant != CPUCapabilityNone {
requested += "_" + info.Variant.String()
if info.Variant != CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
for i, lib := range libs {
if lib == requested {
@@ -92,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
}
}
if !found {
libs = append(libs, info.Library)
libs = append(libs, requested)
resp = append(resp, []GpuInfo{info})
}
}
@@ -105,6 +107,7 @@ func (l GpuInfoList) LogDetails() {
slog.Info("inference compute",
"id", g.ID,
"library", g.Library,
"variant", g.Variant,
"compute", g.Compute,
"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
"name", g.Name,

View File

@@ -5,6 +5,7 @@ package integration
import (
"context"
"log/slog"
"os"
"strconv"
"sync"
"testing"
@@ -13,7 +14,6 @@ import (
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
@@ -41,8 +41,8 @@ func TestMultiModelConcurrency(t *testing.T) {
},
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims", "british"},
{"sunlight"},
{"england", "english", "massachusetts", "pilgrims", "british"},
}
)
var wg sync.WaitGroup
@@ -71,12 +71,11 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
reqLimit := len(req)
iterLimit := 5
vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if vram != "" {
max, err := strconv.ParseUint(vram, 10, 64)
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
maxVram, err := strconv.ParseUint(s, 10, 64)
require.NoError(t, err)
// Don't hammer on small VRAM cards...
if max < 4*1024*1024*1024 {
if maxVram < 4*format.GibiByte {
reqLimit = min(reqLimit, 2)
iterLimit = 2
}
@@ -233,12 +232,12 @@ func TestMultiModelStress(t *testing.T) {
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
for i := 0; i < len(req); i++ {
// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
if i > 1 && consumed > vram {
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
if i > 1 && consumed > maxVram {
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
break
}
consumed += chosenModels[i].size
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
wg.Add(1)
go func(i int) {

View File

@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
}
if res.PromptEvalCount != 8 {
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
if res.PromptEvalCount != 6 {
t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
}
}
@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
}
if res.PromptEvalCount != 16 {
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
if res.PromptEvalCount != 12 {
t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
}
}

View File

@@ -35,8 +35,8 @@ var (
},
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims"},
{"sunlight"},
{"england", "english", "massachusetts", "pilgrims"},
}
)

View File

@@ -29,7 +29,7 @@ func TestMaxQueue(t *testing.T) {
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
threadCount := 32
if maxQueue := envconfig.MaxQueue(); maxQueue != 0 {
threadCount = maxQueue
threadCount = int(maxQueue)
} else {
t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
}

View File

@@ -162,7 +162,7 @@ func PullIfMissing(ctx context.Context, client *api.Client, modelName string) er
fn := func(resp api.ProgressResponse) error {
// fmt.Print(".")
if !stallTimer.Reset(stallDuration) {
return fmt.Errorf("stall was detected, aborting status reporting")
return errors.New("stall was detected, aborting status reporting")
}
return nil
}
@@ -180,7 +180,7 @@ func PullIfMissing(ctx context.Context, client *api.Client, modelName string) er
select {
case <-stallTimer.C:
return fmt.Errorf("download stalled")
return errors.New("download stalled")
case <-done:
return pullError
}
@@ -243,7 +243,7 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
// fmt.Print(".")
buf.Write([]byte(response.Response))
if !stallTimer.Reset(streamTimeout) {
return fmt.Errorf("stall was detected while streaming response, aborting")
return errors.New("stall was detected while streaming response, aborting")
}
return nil
}
@@ -334,10 +334,10 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
},
},
[][]string{
[]string{"sunlight"},
[]string{"soil", "organic", "earth", "black", "tan"},
[]string{"england", "english", "massachusetts", "pilgrims", "british"},
[]string{"fourth", "july", "declaration", "independence"},
[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
{"sunlight"},
{"soil", "organic", "earth", "black", "tan"},
{"england", "english", "massachusetts", "pilgrims", "british"},
{"fourth", "july", "declaration", "independence"},
{"nitrogen", "oxygen", "carbon", "dioxide"},
}
}

View File

@@ -1,13 +1,14 @@
set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -44,6 +44,7 @@
#include <errhandlingapi.h>
#endif
#include <algorithm>
#include <cstddef>
#include <thread>
#include <chrono>
@@ -402,7 +403,9 @@ struct llama_server_context
}
}
std::tie(model, ctx) = llama_init_from_gpt_params(params);
auto init_result = llama_init_from_gpt_params(params);
model = init_result.model;
ctx = init_result.context;
if (model == nullptr)
{
LOG_ERROR("unable to load model", {{"model", params.model}});
@@ -1221,7 +1224,6 @@ struct llama_server_context
res.result_json = json
{
{"embedding", std::vector<float>(embd, embd + n_embd)},
{"timings", slot.get_formated_timings()},
};
}
}
@@ -1427,7 +1429,13 @@ struct llama_server_context
switch (task.type)
{
case TASK_TYPE_COMPLETION: {
server_slot *slot = prefix_slot(task.data["prompt"]);
server_slot *slot = nullptr;
if (task.embedding_mode) {
// Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
slot = slots[0].available() ? &slots[0] : nullptr;
} else {
slot = prefix_slot(task.data["prompt"]);
}
if (slot == nullptr)
{
// if no slot is available, we defer this task for processing later
@@ -2420,7 +2428,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
params.lora_adapter.emplace_back(argv[i], 1.0f);
params.lora_adapters.push_back({
std::string(argv[i]),
1.0,
});
params.use_mmap = false;
}
else if (arg == "--lora-scaled")
@@ -2436,7 +2447,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.lora_adapters.push_back({
lora_adapter,
std::stof(argv[i])
});
params.use_mmap = false;
}
else if (arg == "-v" || arg == "--verbose")
@@ -3184,37 +3198,17 @@ int main(int argc, char **argv) {
prompt = "";
}
if (prompt.size() == 1) {
prompt = prompt[0];
}
// create and queue the task
json responses;
{
const int id_task = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(id_task);
llama.request_completion(id_task, {{"prompt", prompt}}, true, -1);
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, {{"prompt", prompt}}, true, -1);
// get the result
task_result result = llama.queue_results.recv(id_task);
llama.queue_results.remove_waiting_task_id(id_task);
if (result.error) {
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
}
// get the result
task_result result = llama.queue_results.recv(task_id);
llama.queue_results.remove_waiting_task_id(task_id);
responses = result.result_json.value("results", std::vector<json>{result.result_json});
json embeddings = json::array();
int prompt_n = 0;
for (auto & elem : responses) {
embeddings.push_back(elem.at("embedding"));
prompt_n += elem.at("timings").at("prompt_n").get<int>();
}
// send the result
json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
}
// send the result
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
});
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?

View File

@@ -9,11 +9,14 @@ init_vars() {
ARCH="arm64"
;;
*)
ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
echo "GOARCH must be set"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS=""
CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
CMAKE_TARGETS="--target ollama_llama_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@@ -27,6 +30,7 @@ init_vars() {
WHOLE_ARCHIVE="-Wl,-force_load"
NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}"
DIST_BASE=../../dist/darwin-${GOARCH}/
;;
"Linux")
LIB_EXT="so"
@@ -35,6 +39,7 @@ init_vars() {
# Cross compiling not supported on linux - Use docker
GCC_ARCH=""
DIST_BASE=../../dist/linux-${GOARCH}/
;;
*)
;;
@@ -42,6 +47,7 @@ init_vars() {
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
GZIP=$(which pigz 2>/dev/null || echo "gzip")
}
git_module_setup() {
@@ -85,26 +91,36 @@ build() {
compress() {
echo "Compressing payloads to reduce overall binary size..."
pids=""
rm -rf ${BUILD_DIR}/bin/*.gz
for f in ${BUILD_DIR}/bin/* ; do
gzip -n --best -f ${f} &
pids+=" $!"
${GZIP} -n --best -f ${f} &
compress_pids+=" $!"
done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
gzip -n --best -f ${f} &
pids+=" $!"
${GZIP} -n --best -f ${f} &
compress_pids+=" $!"
done
fi
echo
for pid in ${pids}; do
}
# wait_for_compress waits for each background job whose PID is listed in the
# space-separated compress_pids variable, then prints a completion message.
wait_for_compress() {
    # compress_pids is deliberately unquoted so it splits into individual PIDs.
    for pid in ${compress_pids}; do wait "${pid}"; done
    echo "Finished compression"
}
# install copies every file under ${BUILD_DIR} named *.${LIB_EXT} into
# ${BUILD_DIR}/bin/, replacing any copy that is already there.
#
# ${BUILD_DIR}/bin itself is pruned from the search: a library that already
# lives there would otherwise be deleted by the rm below and then fail to copy
# from its own (now removed) path, which aborts callers running under `set -e`
# and makes the outcome depend on find's traversal order.
install() {
    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
    for lib in $(find "${BUILD_DIR}" -path "${BUILD_DIR}/bin" -prune -o -name "*.${LIB_EXT}" -print); do
        # Drop any stale copy in bin/ before copying the freshly built one.
        rm -f "${BUILD_DIR}/bin/$(basename "${lib}")"
        cp -af "${lib}" "${BUILD_DIR}/bin/"
    done
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)

View File

@@ -6,6 +6,7 @@
set -ex
set -o pipefail
compress_pids=""
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
@@ -98,4 +99,5 @@ case "${GOARCH}" in
esac
cleanup
wait_for_compress
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View File

@@ -13,6 +13,7 @@
set -ex
set -o pipefail
compress_pids=""
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
@@ -51,7 +52,7 @@ if [ -z "${CUDACXX}" ]; then
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
@@ -77,10 +78,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
init_vars
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
install
compress
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -93,7 +95,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -103,6 +105,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
install
compress
fi
@@ -120,6 +123,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
install
compress
fi
@@ -133,6 +137,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
install
compress
fi
fi
@@ -160,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
if [ -n "${CUDA_MAJOR}" ]; then
if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
if [ "${ARCH}" == "arm64" ]; then
@@ -178,29 +183,19 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
echo "Building custom CUDA GPU"
else
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
fi
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
export CUDAFLAGS="-t8"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
build
# Carry the CUDA libs as payloads to help reduce dependency burden on users
#
# TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
fi
install
echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
mkdir -p "${CUDA_DIST_DIR}"
for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
cp -a "${lib}" "${CUDA_DIST_DIR}"
done
compress
@@ -218,21 +213,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
CC=icx
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
BUILD_DIR="../build/linux/${ARCH}/oneapi"
EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
build
# copy oneAPI dependencies
mkdir -p "${ONEAPI_DIST_DIR}"
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
cp "${dep}" "${BUILD_DIR}/bin/"
cp -a "${dep}" "${ONEAPI_DIST_DIR}"
done
cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
install
compress
fi
@@ -254,7 +252,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
@@ -262,23 +260,22 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
echo "Building custom ROCM GPU"
fi
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
# ROCm dependencies are too large to fit into a unified bundle
ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
# TODO figure out how to disable runpath (rpath)
# export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Record the ROCM dependencies
rm -f "${BUILD_DIR}/bin/deps.txt"
touch "${BUILD_DIR}/bin/deps.txt"
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
# copy the ROCM dependencies
mkdir -p "${ROCM_DIST_DIR}"
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
cp -a "${dep}"* "${ROCM_DIST_DIR}"
done
# bomb out if for some reason we didn't get a few deps
if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/bin/deps.txt"
echo "ERROR: deps file short"
exit 1
fi
install
compress
fi
cleanup
wait_for_compress
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View File

@@ -35,7 +35,7 @@ function init_vars {
)
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
md "$script:DIST_BASE" -ea 0 > $null
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -117,7 +117,7 @@ function build {
if ($cmakeDefs -contains "-G") {
$extra=@("-j8")
} else {
$extra= @("--", "/p:CL_MPcount=8")
$extra= @("--", "/maxCpuCount:8")
}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
@@ -261,7 +261,7 @@ function build_cuda() {
if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
# Then build cuda as a dynamically loaded library
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
$script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
@@ -273,9 +273,9 @@ function build_cuda() {
"-DGGML_CUDA=ON",
"-DGGML_AVX=on",
"-DGGML_AVX2=off",
"-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
"-DCMAKE_CUDA_FLAGS=-t8",
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
"-DCMAKE_CUDA_FLAGS=-t6",
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
"-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
)
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
@@ -286,12 +286,11 @@ function build_cuda() {
sign
install
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
} else {
write-host "Skipping CUDA generation step"
}
@@ -325,18 +324,17 @@ function build_oneapi() {
sign
install
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
} else {
Write-Host "Skipping oneAPI generation step"
}
@@ -357,7 +355,7 @@ function build_rocm() {
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DGGML_HIPBLAS=on",
"-DLLAMA_CUDA_NO_PEER_COPY=on",
"-DGGML_CUDA_NO_PEER_COPY=on",
"-DHIP_PLATFORM=amd",
"-DGGML_AVX=on",
"-DGGML_AVX2=off",
@@ -386,12 +384,11 @@ function build_rocm() {
sign
install
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
} else {
write-host "Skipping ROCm generation step"
}

View File

@@ -43,6 +43,14 @@ func (kv KV) Architecture() string {
return "unknown"
}
// Kind returns the string stored under the "general.type" key, or "unknown"
// when the key is absent or its value is not a string.
func (kv KV) Kind() string {
	kind, isString := kv["general.type"].(string)
	if !isString {
		return "unknown"
	}
	return kind
}
func (kv KV) ParameterCount() uint64 {
return kv.u64("general.parameter_count")
}
@@ -157,6 +165,14 @@ type Tensor struct {
io.WriterTo `json:"-"`
}
// block parses the tensor name against the pattern "blk.<n>." and returns the
// parsed number n. It returns -1 when the name does not match that pattern.
func (t Tensor) block() (n int) {
	_, err := fmt.Sscanf(t.Name, "blk.%d.", &n)
	if err != nil {
		n = -1
	}
	return n
}
func (t Tensor) blockSize() uint64 {
switch t.Kind {
case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16

View File

@@ -532,15 +532,14 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
}
}
slices.SortFunc(ts, func(a, b Tensor) int {
var i, j int
if n, err := fmt.Sscanf(a.Name, "blk.%d", &i); err != nil || n != 1 {
return cmp.Compare(a.Name, b.Name)
} else if n, err := fmt.Sscanf(b.Name, "blk.%d", &j); err != nil || n != 1 {
return cmp.Compare(a.Name, b.Name)
slices.SortStableFunc(ts, func(a, b Tensor) int {
// Order tensors by block index, placing tensors without a block index
// (block() < 0) after all block tensors. Block indices start at 0, so the
// checks use >= 0: with a strict > 0, a non-block tensor compared as smaller
// than blk.0 but larger than blk.1 and above, which is not a consistent
// ordering for slices.SortStableFunc.
if i, j := a.block(), b.block(); i < 0 && j >= 0 {
	return 1
} else if i >= 0 && j < 0 {
	return -1
} else {
	return cmp.Compare(i, j)
}
return cmp.Compare(i, j)
})
var s uint64

View File

@@ -11,8 +11,9 @@ package llm
// #include <stdlib.h>
// #include "llama.h"
import "C"
import (
"fmt"
"errors"
"unsafe"
)
@@ -33,7 +34,7 @@ func Quantize(infile, outfile string, ftype fileType) error {
params.ftype = ftype.Value()
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return fmt.Errorf("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
return errors.New("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
}
return nil

View File

@@ -6,10 +6,11 @@ import (
"os"
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
)
func TestEstimateGPULayers(t *testing.T) {
@@ -32,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) {
assert.Len(t, tensors, inputLayerCount+1)
err = WriteGGUF(f, KV{
"general.architecture": "llama",
"general.name": "name",
"llama.context_length": uint32(32),
"llama.embedding_length": uint32(4096),
"llama.block_count": uint32(inputLayerCount),

View File

@@ -1,60 +0,0 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index 721b8f4e..cfe7ac40 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8420,14 +8420,14 @@ struct llm_build_context {
}
struct ggml_tensor * build_inp_mean() {
- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
cb(lctx.inp_mean, "inp_mean", -1);
ggml_set_input(lctx.inp_mean);
return lctx.inp_mean;
}
struct ggml_tensor * build_inp_cls() {
- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
cb(lctx.inp_cls, "inp_cls", -1);
ggml_set_input(lctx.inp_cls);
return lctx.inp_cls;
@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
float * data = (float *) lctx.inp_mean->data;
- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
std::vector<uint64_t> sum(n_tokens, 0);
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
-
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
sum[seq_id] += 1;
}
- std::vector<float> div(n_tokens, 0.0f);
- for (int i = 0; i < n_tokens; ++i) {
+ std::vector<float> div(cparams.n_seq_max, 0.0f);
+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
const uint64_t s = sum[i];
if (s > 0) {
div[i] = 1.0f/float(s);
@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
const llama_pos pos = batch.pos[i];
-
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
-
if (pos == 0) {
data[seq_id] = i;
}

View File

@@ -1,40 +1,32 @@
diff --git a/common/common.cpp b/common/common.cpp
index dbb724fb..c26fe6ee 100644
index 2e8374d5..70d0afde 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
+
+ // try to load as gguf
auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
if (adapter == nullptr) {
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
- llama_free(lctx);
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
- return iparams;
+
+ // if that fails, try loading as ggla for compatibility
+ int err = llama_model_apply_lora_from_file(model,
+ lora_adapter.c_str(),
+ lora_scale,
+ la.path.c_str(),
+ la.scale,
+ nullptr,
+ params.n_threads);
+ if (err != 0) {
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ return iparams;
+ } else {
+ break;
+ }
+ } else {
+ llama_lora_adapter_set(lctx, adapter, lora_scale);
}
- llama_lora_adapter_set(lctx, adapter, lora_scale);
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
}
if (params.ignore_eos) {
diff --git a/include/llama.h b/include/llama.h
index 93fd77ca..b0fb37a6 100644
--- a/include/llama.h
@@ -355,4 +347,4 @@ index 80a0dd0f..9d7b0e17 100644
+ return 1;
+ }
+}
\ No newline at end of file
\ No newline at end of file

View File

@@ -1,20 +0,0 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index a207451f..fba6b175 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
hparams.attn_soft_cap = true;
switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_2B; break;
case 42: model.type = e_model::MODEL_9B; break;
case 46: model.type = e_model::MODEL_27B; break;
default: model.type = e_model::MODEL_UNKNOWN;
@@ -11736,6 +11737,7 @@ struct llm_build_context {
// ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
switch (model.type) {
+ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
default: GGML_ABORT("fatal error");

View File

@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := getAvailableServers()
requested := info.Library
if info.Variant != gpu.CPUCapabilityNone {
requested += "_" + info.Variant.String()
if info.Variant != gpu.CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
servers := []string{}

View File

@@ -33,7 +33,7 @@ type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
Embed(ctx context.Context, input []string) (*EmbedResponse, error)
Embedding(ctx context.Context, input string) ([]float32, error)
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
@@ -44,11 +44,12 @@ type LlamaServer interface {
// llmServer is an instance of the llama.cpp server
type llmServer struct {
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
numParallel int
estimate MemoryEstimate
totalLayers uint64
@@ -124,8 +125,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}
// On linux, over-allocating CPU memory will almost always result in an error
if runtime.GOOS == "linux" {
// On linux and windows, over-allocating CPU memory will almost always result in an error
// Darwin has fully dynamic swap so has no direct concept of free swap space
if runtime.GOOS != "darwin" {
systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
available := systemFreeMemory + systemSwapFreeMemory
if systemMemoryRequired > available {
@@ -184,15 +186,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params := []string{
"--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--ctx-size", strconv.Itoa(opts.NumCtx),
"--batch-size", strconv.Itoa(opts.NumBatch),
"--embedding",
}
params = append(params, "--log-disable")
if opts.NumGPU >= 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU))
}
if envconfig.Debug() {
@@ -200,7 +202,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
if opts.MainGPU > 0 {
params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU))
params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU))
}
if len(adapters) > 0 {
@@ -214,7 +216,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
if opts.NumThread > 0 {
params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
params = append(params, "--threads", strconv.Itoa(opts.NumThread))
}
if !opts.F16KV {
@@ -256,11 +258,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--mlock")
}
if opts.UseNUMA {
params = append(params, "--numa")
if gpu.IsNUMA() && gpus[0].Library == "cpu" {
numaMode := "distribute"
if runtime.GOOS == "linux" {
if _, err := exec.LookPath("numactl"); err == nil {
numaMode = "numactl"
}
}
params = append(params, "--numa", numaMode)
}
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
params = append(params, "--parallel", strconv.Itoa(numParallel))
if estimate.TensorSplit != "" {
params = append(params, "--tensor-split", estimate.TensorSplit)
@@ -298,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
libraryPaths := []string{dir, filepath.Dir(dir)}
// Start with the server directory for the LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
// This will favor system libraries over our bundled library dependencies
// favor our bundled library dependencies over system libraries
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
// Note: we always put the dependency path first
// since this was the exact version we verified for AMD GPUs
// and we favor what the user had in their path
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != "" {
// TODO refine for multi-gpu support
// assume gpus from the same library have the same dependency path
libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
}
@@ -337,6 +343,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
status: NewStatusWriter(os.Stderr),
options: opts,
estimate: estimate,
numParallel: numParallel,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1,
gpus: gpus,
@@ -425,7 +432,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if strings.Contains(s.status.LastErrMsg, "unknown model") {
s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
}
s.done <- fmt.Errorf(s.status.LastErrMsg)
s.done <- errors.New(s.status.LastErrMsg)
} else {
s.done <- err
}
@@ -874,16 +881,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
return nil
}
type EmbedRequest struct {
Content []string `json:"content"`
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbedResponse struct {
Embedding [][]float32 `json:"embedding"`
PromptEvalCount int `json:"prompt_n"`
type EmbeddingResponse struct {
Embedding []float32 `json:"embedding"`
}
func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
if err := s.sem.Acquire(ctx, 1); err != nil {
slog.Error("Failed to acquire semaphore", "error", err)
return nil, err
@@ -898,18 +904,18 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse,
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
}
data, err := json.Marshal(EmbedRequest{Content: input})
data, err := json.Marshal(EmbeddingRequest{Content: input})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("error creating embed request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
resp, err := http.DefaultClient.Do(r)
if err != nil {
return nil, fmt.Errorf("do embedding request: %w", err)
}
@@ -925,12 +931,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse,
return nil, fmt.Errorf("%s", body)
}
var e EmbedResponse
var e EmbeddingResponse
if err := json.Unmarshal(body, &e); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return &e, nil
return e.Embedding, nil
}
type TokenizeRequest struct {

View File

@@ -26,6 +26,7 @@ var errorPrefixes = []string{
"cudaMalloc failed",
"\"ERR\"",
"error loading model",
"GGML_ASSERT",
}
func (w *StatusWriter) Write(b []byte) (int, error) {

Some files were not shown because too many files have changed in this diff Show More