Compare commits


105 Commits

Author SHA1 Message Date
Jeffrey Morgan
5306b0269d Update linux.md 2023-09-25 16:10:32 -07:00
Michael Yang
7de0c8345d Merge pull request #595 from jmorganca/mxyng/install.sh
ignore systemctl is-system-running exit code
2023-09-25 15:49:47 -07:00
Michael Yang
1b9dcab3ab ignore systemctl is-system-running exit code 2023-09-25 15:47:45 -07:00
Bruce MacDonald
86279f4ae3 unbound max num gpu layers (#591)
---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-25 18:36:46 -04:00
Michael Yang
b934bf23e6 exit on unknown distro (#594) 2023-09-25 15:30:58 -07:00
Michael Yang
2b8ef455ad Merge pull request #593 from jmorganca/mxyng/install.sh
update install.sh
2023-09-25 14:09:40 -07:00
Michael Yang
0c5f47177c update install.sh 2023-09-25 14:01:44 -07:00
Michael Yang
1210db2924 Merge pull request #592 from jmorganca/mxyng/install.sh
fix dkms on debian
2023-09-25 12:59:01 -07:00
Michael Yang
d0854bf1e6 fix dkms on debian 2023-09-25 12:57:25 -07:00
Michael Yang
8396463255 Merge pull request #590 from jmorganca/mxyng/install.sh
fix dkms install
2023-09-25 12:17:31 -07:00
Michael Yang
a027bbf4d7 fix dkms install 2023-09-25 12:16:41 -07:00
Michael Yang
ed94a3dd02 Merge pull request #589 from jmorganca/mxyng/install.sh
update install.sh
2023-09-25 11:08:25 -07:00
Michael Yang
f14f62ab3b update install.sh 2023-09-25 11:05:38 -07:00
Jeffrey Morgan
0fb5268496 Update linux.md 2023-09-25 10:06:23 -07:00
Bruce MacDonald
c65edb1506 fix linux installer warning logs (#588) 2023-09-25 11:22:56 -04:00
Twan L
1605af32ec Added a new community project (#574) 2023-09-25 10:40:59 -04:00
Jeffrey Morgan
ee3032ad89 improvements to docs/linux.md 2023-09-24 21:50:07 -07:00
Jeffrey Morgan
5b7a27281d improvements to docs/linux.md 2023-09-24 21:38:23 -07:00
Jeffrey Morgan
d2a784e33e add docs/linux.md 2023-09-24 21:34:44 -07:00
Jeffrey Morgan
413a2e4f91 set DEBIAN_FRONTEND=noninteractive correctly 2023-09-24 20:35:42 -07:00
Patrick Devine
b5614f3ebc fix end-of-line issue with the new prompt (#582) 2023-09-23 17:20:30 -07:00
Jeffrey Morgan
8b2ba9cab8 minor improvements to install.sh 2023-09-23 11:20:39 -04:00
Jeffrey Morgan
e29662ab5c fix minor install script issues on debian 2023-09-23 10:25:47 -04:00
Bruce MacDonald
cbc40aa996 debian installer support (#579)
* debian installer support

- normalize os name to lowercase
- check needed commands are available
- dont check sudo when root user
- share common install commands
- support debian cuda install
- skip aarm cuda install
- system user shared home dir

* refactor and add other platforms (#580)

---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-23 09:46:47 -04:00
Jeffrey Morgan
5cb82540c9 install.sh: update install url 2023-09-23 09:35:14 -04:00
Jeffrey Morgan
d7849a1dc9 add .env to .dockerignore 2023-09-23 00:53:48 -04:00
Jeffrey Morgan
01c44d687e add multi line strings to final prompt 2023-09-23 00:27:24 -04:00
Jeffrey Morgan
9b12a511ca check other request fields before load short circuit in /api/generate 2023-09-22 23:50:55 -04:00
Jeffrey Morgan
e20362e0d5 fix multi line input in ollama run 2023-09-22 23:49:35 -04:00
Patrick Devine
c928ceb927 add word wrapping for lines which are longer than the terminal width (#553) 2023-09-22 13:36:08 -07:00
Michael Yang
e1a0846483 Merge pull request #571 from jmorganca/mxyng/update-dockerfile
update dockerfile.cuda
2023-09-22 12:34:41 -07:00
Jeffrey Morgan
f997e29e45 Add Dockerfile.build for building linux binaries (#558)
Add `Dockerfile.build` for building linux binaries

---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-22 15:20:12 -04:00
Patrick Devine
87d9efb364 switch to forked readline lib which doesn't wreck the repl prompt (#578) 2023-09-22 12:17:45 -07:00
Michael Yang
93d3a2568d replace dockerfile 2023-09-22 11:57:38 -07:00
Michael Yang
5a81390b24 update dockerfile.cuda 2023-09-22 11:57:38 -07:00
Michael Yang
a89ef99aed Merge pull request #575 from jmorganca/mxyng/fix-ipv6-only
fix ipv6 parse ip
2023-09-22 11:47:11 -07:00
Bruce MacDonald
dc0c725ceb ubuntu cuda drivers (#576) 2023-09-22 19:43:14 +01:00
Bruce MacDonald
5d71bda478 close llm on interrupt (#577) 2023-09-22 19:41:52 +01:00
Michael Yang
88897a90e4 fix ipv6 parse ip 2023-09-22 10:41:32 -07:00
Bruce MacDonald
9df31c3518 linux installer script (#534)
Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-22 17:01:03 +01:00
Michael Yang
2044f9d4da Merge pull request #570 from jmorganca/mxyng/head-request
fix HEAD request
2023-09-21 16:56:17 -07:00
Michael Yang
0d186f3b33 Merge pull request #569 from jmorganca/mxyng/update-submodules
silence warm up log
2023-09-21 16:52:42 -07:00
Michael Yang
82f5b66c01 register HEAD /api/tags 2023-09-21 16:38:03 -07:00
Michael Yang
c986694367 fix HEAD / request
HEAD request should respond like their GET counterparts except without a
response body.
2023-09-21 16:35:58 -07:00
Michael Yang
058d0cd04b silence warm up log 2023-09-21 14:53:33 -07:00
Michael Yang
ee1c994d15 update submodule (#567) 2023-09-21 16:22:23 -04:00
Bruce MacDonald
4cba75efc5 remove tmp directories created by previous servers (#559)
* remove tmp directories created by previous servers

* clean up on server stop

* Update routes.go

* Update server/routes.go

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* create top-level temp ollama dir

* check file exists before creating

---------

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-21 20:38:49 +01:00
Michael Yang
8c83701e9f Merge pull request #566 from jmorganca/mxyng/api-check-model-exists
Use API to check if model exists and pull if necessary
2023-09-21 10:35:14 -07:00
Michael Yang
6137b12799 validate existence and pull model using api 2023-09-21 09:55:34 -07:00
Michael Yang
1fabba474b refactor default allow origins
this should be less error prone
2023-09-21 09:42:25 -07:00
Michael Yang
765770efdb Merge pull request #562 from jmorganca/mxyng/fix-ollama-host
fix OLLAMA_HOST parsing for ip6
2023-09-20 19:54:47 -07:00
Michael Yang
9297ff8330 fix OLLAMA_HOST parsing for ip6 2023-09-20 18:52:57 -07:00
Michael Yang
ee4fd16f2c Merge pull request #556 from jmorganca/pack-cuda
pack in cuda libs
2023-09-20 15:02:36 -07:00
Michael Yang
a9ed7cc6aa rename generate.go 2023-09-20 14:42:17 -07:00
Michael Yang
6c6a31a1e8 embed libraries using cmake 2023-09-20 14:41:57 -07:00
Bruce MacDonald
fc6ec356fc remove libcuda.so 2023-09-20 20:36:14 +01:00
Bruce MacDonald
1255bc9b45 only package 11.8 runner 2023-09-20 20:00:41 +01:00
Michael Yang
084e4c782a Merge pull request #557 from jmorganca/mxyng/cleanup
fix impossible condition
2023-09-20 11:51:01 -07:00
Michael Yang
58ffa03d8b fix impossible condition 2023-09-20 11:27:44 -07:00
Michael Yang
637f8bc6a5 Merge pull request #536 from jmorganca/mxyng/redirect-uploads
explicitly follow upload redirects
2023-09-20 11:27:03 -07:00
Michael Yang
499e9007a5 pick chunksize based on location 2023-09-20 11:10:24 -07:00
Bruce MacDonald
b9bb5ca288 use cuda_version 2023-09-20 17:58:16 +01:00
Bruce MacDonald
4e8be787c7 pack in cuda libs 2023-09-20 17:40:42 +01:00
Michael Yang
aa45d7c1df draft: explicitly follow upload redirects 2023-09-19 13:36:58 -07:00
Michael Yang
e35565c567 Merge pull request #555 from jmorganca/mxyng/fix-windows-startup
fix build
2023-09-19 10:51:58 -07:00
Michael Yang
a5520bfb42 fix build 2023-09-19 10:42:24 -07:00
Michael Yang
2627c464ba Merge pull request #554 from jmorganca/mxyng/fix-windows-startup
fix mkdir on windows
2023-09-19 09:42:12 -07:00
Michael Yang
b58d5d16b0 fix mkdir on windows 2023-09-19 09:41:13 -07:00
Patrick Devine
24580df958 only add a layer if there is actual data (#535) 2023-09-18 13:47:45 -07:00
Patrick Devine
80dd44e80a Cmd changes (#541) 2023-09-18 12:26:56 -07:00
James Braza
94e1d96b29 Updated README section on community projects for table (#550) 2023-09-18 15:22:50 -04:00
Bruce MacDonald
66003e1d05 subprocess improvements (#524)
* subprocess improvements

- increase start-up timeout
- when runner fails to start fail rather than timing out
- try runners in order rather than choosing 1 runner
- embed metal runner in metal dir rather than gpu
- refactor logging and error messages

* Update llama.go

* Update llama.go

* simplify by using glob
2023-09-18 15:16:32 -04:00
Michael Yang
c345053a8b Merge pull request #537 from jmorganca/mxyng/upload
fix error on upload chunk
2023-09-15 17:48:39 -07:00
Michael Yang
08d7c2a944 fix error on upload chunk 2023-09-15 15:59:30 -07:00
Michael Yang
bc9573dcb1 Merge pull request #530 from jmorganca/mxyng/progresswriter
implement ProgressWriter
2023-09-15 12:43:46 -07:00
Michael Yang
e53bc57d4d split uploadBlobChunked 2023-09-14 17:22:05 -07:00
Michael Yang
f0b398d17f implement ProgressWriter 2023-09-14 17:22:04 -07:00
Patrick Devine
8efbc5df55 DRAFT: add a simple python client to access ollama (#522) 2023-09-14 16:37:38 -07:00
Michael Yang
ccc3e9ac6d Merge pull request #531 from jmorganca/mxyng/content-length
set request.ContentLength
2023-09-14 13:33:11 -07:00
Michael Yang
daa4f096f9 set request.ContentLength
This informs the HTTP client the content length is known and disables
chunked Transfer-Encoding
2023-09-14 13:32:44 -07:00
Michael Yang
3ee85f1c6c Merge pull request #526 from jmorganca/mxyng/cleanup
remove unused
2023-09-14 13:10:59 -07:00
Bruce MacDonald
2540c9181c support for packaging in multiple cuda runners (#509)
* enable packaging multiple cuda versions
* use nvcc cuda version if available

---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-14 15:08:13 -04:00
Michael Yang
83ffb154bc Merge pull request #507 from jmorganca/mxyng/build
update docker image
2023-09-14 11:25:59 -07:00
Michael Yang
9aa192c812 update cuda docker image 2023-09-14 11:25:20 -07:00
Matt Williams
fc8707686f Update API docs (#527)
* Update API docs

Signed-off-by: Matt Williams <m@technovangelist.com>

* strange TOC was getting auto generated

Signed-off-by: Matt Williams <m@technovangelist.com>

* Update docs/api.md

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update docs/api.md

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update docs/api.md

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update api.md

---------

Signed-off-by: Matt Williams <m@technovangelist.com>
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
Co-authored-by: Michael Chiang <mchiang0610@users.noreply.github.com>
2023-09-14 08:51:26 -07:00
Michael Yang
f89c23764b Merge pull request #525 from jmorganca/mxyng/falcon-decode
fix: add falcon.go
2023-09-13 15:08:47 -07:00
Michael Yang
e6881cabd0 remove unused 2023-09-13 14:48:33 -07:00
Michael Yang
d028853879 fix: add falcon.go 2023-09-13 14:47:37 -07:00
Michael Yang
949553db23 Merge pull request #519 from jmorganca/mxyng/decode
Mxyng/decode
2023-09-13 12:43:57 -07:00
Michael Yang
0c5a454361 fix model type for 70b 2023-09-12 15:12:59 -07:00
Bruce MacDonald
f59c4d03f7 fix ggml arm64 cuda build (#520) 2023-09-12 17:06:48 -04:00
Michael Yang
7dee25a07f fix falcon decode
get model and file type from bin file
2023-09-12 12:34:53 -07:00
Bruce MacDonald
f221637053 first pass at linux gpu support (#454)
* linux gpu support
* handle multiple gpus
* add cuda docker image (#488)
---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-12 11:04:35 -04:00
Patrick Devine
45ac07cd02 create the blobs directory correctly (#508) 2023-09-11 14:54:52 -07:00
Jeffrey Morgan
7d749cc787 fix darwin build script 2023-09-11 16:31:46 -04:00
Patrick Devine
e7e91cd71c add autoprune to remove unused layers (#491) 2023-09-11 11:46:35 -07:00
Jeffrey Morgan
3920e15386 add model format to config layer (#497) 2023-09-09 17:53:44 -04:00
Michael Yang
41e976edde Merge pull request #492 from jmorganca/mxyng/nil-pointer
fix nil pointer dereference
2023-09-07 17:25:23 -07:00
Michael Yang
de227b620f fix nil pointer dereference 2023-09-07 17:24:31 -07:00
Michael Yang
63def6ca49 Merge pull request #487 from jmorganca/mxyng/dockerignore
update dockerignore
2023-09-07 14:16:17 -07:00
Michael Yang
738fe9c4aa Merge pull request #486 from jmorganca/mxyng/fix-push
fix: retry push on expired token
2023-09-07 13:58:34 -07:00
Michael Yang
a8da0bacbe update dockerignore 2023-09-07 13:36:25 -07:00
Michael Yang
bf146fb072 fix retry on unauthorized chunk 2023-09-07 12:02:04 -07:00
Michael Yang
f0f4943577 fix get auth token 2023-09-07 12:01:56 -07:00
Bruce MacDonald
09dd2aeff9 GGUF support (#441) 2023-09-07 13:55:37 -04:00
41 changed files with 2213 additions and 688 deletions

@@ -1,4 +1,8 @@
 .vscode
 ollama
 app
+dist
+scripts
 llm/llama.cpp/ggml
+llm/llama.cpp/gguf
+.env

.gitmodules (vendored, 12 changed lines)

@@ -1,4 +1,10 @@
 [submodule "llm/llama.cpp/ggml"]
 	path = llm/llama.cpp/ggml
 	url = https://github.com/ggerganov/llama.cpp.git
 	ignore = dirty
+	shallow = true
+[submodule "llm/llama.cpp/gguf"]
+	path = llm/llama.cpp/gguf
+	url = https://github.com/ggerganov/llama.cpp.git
+	ignore = dirty
+	shallow = true

@@ -1,18 +1,28 @@
-FROM golang:alpine
+ARG CUDA_VERSION=12.2.0
+FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04
+ARG TARGETARCH
+ARG VERSION=0.0.0
 WORKDIR /go/src/github.com/jmorganca/ollama
-RUN apk add --no-cache git build-base cmake
+RUN apt-get update && apt-get install -y git build-essential cmake
+ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
 COPY . .
-RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .
+ENV GOARCH=$TARGETARCH
+RUN /usr/local/go/bin/go generate ./... \
+    && /usr/local/go/bin/go build -ldflags "-linkmode=external -extldflags='-static' -X=github.com/jmorganca/ollama/version.Version=$VERSION -X=github.com/jmorganca/ollama/server.mode=release" .

-FROM alpine
+FROM ubuntu:22.04
 ENV OLLAMA_HOST 0.0.0.0
-RUN apk add --no-cache libstdc++
+RUN apt-get update && apt-get install -y ca-certificates
 ARG USER=ollama
 ARG GROUP=ollama
-RUN addgroup $GROUP && adduser -D -G $GROUP $USER
+RUN groupadd $GROUP && useradd -m -g $GROUP $USER
 COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

Dockerfile.build (new file, 29 lines)

@@ -0,0 +1,29 @@
ARG VERSION=0.0.0
# centos7 amd64 dependencies
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-devel-centos7 AS base-amd64
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
yum update -y && \
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
# centos8 arm64 dependencies
FROM --platform=linux/arm64 nvidia/cuda:11.4.3-devel-centos8 AS base-arm64
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y git cmake
FROM base-${TARGETARCH}
ARG TARGETARCH
# install go
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
ENV GOARCH=$TARGETARCH
RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build -ldflags "-X=github.com/jmorganca/ollama/version.Version=$VERSION -X=github.com/jmorganca/ollama/server.mode=release" .

@@ -206,10 +206,17 @@ curl -X POST http://localhost:11434/api/generate -d '{
 ## Community Projects using Ollama

-- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with a question-answering [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa).
-- [Continue](https://github.com/continuedev/continue) - embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline.
-- [LiteLLM](https://github.com/BerriAI/litellm) a lightweight python package to simplify LLM API calls
-- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) - interact with Ollama as a chatbot on Discord.
-- [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) - Raycast extension to use Ollama for local llama inference on Raycast.
-- [Simple HTML UI for Ollama](https://github.com/rtcfirefly/ollama-ui)
-- [Emacs client](https://github.com/zweifisch/ollama) for Ollama
+| Project | Description |
+| -------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [LangChain][1] and [LangChain.js][2] | Also, there is a question-answering [example][3]. |
+| [Continue](https://github.com/continuedev/continue) | Embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline. |
+| [LiteLLM](https://github.com/BerriAI/litellm) | Lightweight Python package to simplify LLM API calls. |
+| [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) | Interact with Ollama as a chatbot on Discord. |
+| [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) | Raycast extension to use Ollama for local llama inference on Raycast. |
+| [Simple HTML UI](https://github.com/rtcfirefly/ollama-ui) | Also, there is a Chrome extension. |
+| [Ollama-GUI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file) | 🖥️ Mac Chat Interface ⚡️ |
+| [Emacs client](https://github.com/zweifisch/ollama) | |
+
+[1]: https://python.langchain.com/docs/integrations/llms/ollama
+[2]: https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama
+[3]: https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa

api/client.py (new file, 225 lines)

@@ -0,0 +1,225 @@
import os
import json
import requests

BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')


# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
# The final response object will include statistics and additional data from the request. Use the callback function to override
# the default handler.
def generate(model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
    try:
        url = f"{BASE_URL}/api/generate"
        payload = {
            "model": model_name,
            "prompt": prompt,
            "system": system,
            "template": template,
            "context": context,
            "options": options
        }

        # Remove keys with None values
        payload = {k: v for k, v in payload.items() if v is not None}

        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Creating a variable to hold the context history of the final chunk
            final_context = None

            # Variable to hold concatenated response strings if no callback is provided
            full_response = ""

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # If this is not the last chunk, add the "response" field value to full_response and print it
                        if not chunk.get("done"):
                            response_piece = chunk.get("response", "")
                            full_response += response_piece
                            print(response_piece, end="", flush=True)

                    # Check if it's the last chunk (done is true)
                    if chunk.get("done"):
                        final_context = chunk.get("context")

            # Return the full response and the final context
            return full_response, final_context
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None, None


# Create a model from a Modelfile. Use the callback function to override the default handler.
def create(model_name, model_path, callback=None):
    try:
        url = f"{BASE_URL}/api/create"
        payload = {"name": model_name, "path": model_path}

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the status
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the status
                    chunk = json.loads(line)

                    if callback:
                        callback(chunk)
                    else:
                        print(f"Status: {chunk.get('status')}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")


# Pull a model from the model registry. Cancelled pulls are resumed from where they left off, and multiple
# calls will share the same download progress. Use the callback function to override the default handler.
def pull(model_name, insecure=False, callback=None):
    try:
        url = f"{BASE_URL}/api/pull"
        payload = {
            "name": model_name,
            "insecure": insecure
        }

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # Print the status message directly to the console
                        print(chunk.get('status', ''), end='', flush=True)

                        # If there's layer data, you might also want to print that (adjust as necessary)
                        if 'digest' in chunk:
                            print(f" - Digest: {chunk['digest']}", end='', flush=True)
                            print(f" - Total: {chunk['total']}", end='', flush=True)
                            print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
                        else:
                            print()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")


# Push a model to the model registry. Use the callback function to override the default handler.
def push(model_name, insecure=False, callback=None):
    try:
        url = f"{BASE_URL}/api/push"
        payload = {
            "name": model_name,
            "insecure": insecure
        }

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # Print the status message directly to the console
                        print(chunk.get('status', ''), end='', flush=True)

                        # If there's layer data, you might also want to print that (adjust as necessary)
                        if 'digest' in chunk:
                            print(f" - Digest: {chunk['digest']}", end='', flush=True)
                            print(f" - Total: {chunk['total']}", end='', flush=True)
                            print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
                        else:
                            print()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")


# List models that are available locally.
def list():
    try:
        response = requests.get(f"{BASE_URL}/api/tags")
        response.raise_for_status()
        data = response.json()
        models = data.get('models', [])
        return models
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


# Copy a model. Creates a model with another name from an existing model.
def copy(source, destination):
    try:
        # Create the JSON payload
        payload = {
            "source": source,
            "destination": destination
        }
        response = requests.post(f"{BASE_URL}/api/copy", json=payload)
        response.raise_for_status()

        # If the request was successful, return a message indicating that the copy was successful
        return "Copy successful"
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


# Delete a model and its data.
def delete(model_name):
    try:
        url = f"{BASE_URL}/api/delete"
        payload = {"name": model_name}
        response = requests.delete(url, json=payload)
        response.raise_for_status()
        return "Delete successful"
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


# Show info about a model.
def show(model_name):
    try:
        url = f"{BASE_URL}/api/show"
        payload = {"name": model_name}
        response = requests.post(url, json=payload)
        response.raise_for_status()

        # Parse the JSON response and return it
        data = response.json()
        return data
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


def heartbeat():
    try:
        url = f"{BASE_URL}/"
        response = requests.head(url)
        response.raise_for_status()
        return "Ollama is running"
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return "Ollama is not running"

@@ -291,7 +291,7 @@ func DefaultOptions() Options {
 		NumCtx:   2048,
 		NumKeep:  -1,
 		NumBatch: 512,
-		NumGPU:   1,
+		NumGPU:   -1, // -1 here indicates that NumGPU should be set dynamically
 		NumGQA:   1,
 		LowVRAM:  false,
 		F16KV:    true,


@@ -11,20 +11,19 @@ import (
 	"io"
 	"log"
 	"net"
-	"net/http"
 	"os"
 	"os/exec"
-	"path"
 	"path/filepath"
 	"runtime"
 	"strings"
 	"time"

-	"github.com/chzyer/readline"
 	"github.com/dustin/go-humanize"
 	"github.com/olekukonko/tablewriter"
+	"github.com/pdevine/readline"
 	"github.com/spf13/cobra"
 	"golang.org/x/crypto/ssh"
+	"golang.org/x/term"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/format"
@@ -33,6 +32,26 @@ import (
 	"github.com/jmorganca/ollama/version"
 )

+type Painter struct {
+	IsMultiLine bool
+}
+
+func (p Painter) Paint(line []rune, _ int) []rune {
+	termType := os.Getenv("TERM")
+	if termType == "xterm-256color" && len(line) == 0 {
+		var prompt string
+		if p.IsMultiLine {
+			prompt = "Use \"\"\" to end multi-line input"
+		} else {
+			prompt = "Send a message (/? for help)"
+		}
+		return []rune(fmt.Sprintf("\033[38;5;245m%s\033[%dD\033[0m", prompt, len(prompt)))
+	}
+	// add a space and a backspace to prevent the cursor from walking up the screen
+	line = append(line, []rune(" \b")...)
+	return line
+}
+
 func CreateHandler(cmd *cobra.Command, args []string) error {
 	filename, _ := cmd.Flags().GetString("file")
 	filename, err := filepath.Abs(filename)
@@ -98,39 +117,28 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 }

 func RunHandler(cmd *cobra.Command, args []string) error {
-	insecure, err := cmd.Flags().GetBool("insecure")
+	client, err := api.FromEnv()
 	if err != nil {
 		return err
 	}

-	mp := server.ParseModelPath(args[0])
+	models, err := client.List(context.Background())
 	if err != nil {
 		return err
 	}

-	if mp.ProtocolScheme == "http" && !insecure {
-		return fmt.Errorf("insecure protocol http")
-	}
-
-	fp, err := mp.GetManifestPath(false)
-	if err != nil {
-		return err
-	}
-
-	_, err = os.Stat(fp)
-	switch {
-	case errors.Is(err, os.ErrNotExist):
-		if err := pull(args[0], insecure); err != nil {
-			var apiStatusError api.StatusError
-			if !errors.As(err, &apiStatusError) {
-				return err
-			}
-
-			if apiStatusError.StatusCode != http.StatusBadGateway {
-				return err
-			}
-		}
-	case err != nil:
+	modelName, modelTag, ok := strings.Cut(args[0], ":")
+	if !ok {
+		modelTag = "latest"
+	}
+
+	for _, model := range models.Models {
+		if model.Name == strings.Join([]string{modelName, modelTag}, ":") {
+			return RunGenerate(cmd, args)
+		}
+	}
+
+	if err := PullHandler(cmd, args); err != nil {
 		return err
 	}
@@ -387,71 +395,117 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 type generateContextKey string

 func generate(cmd *cobra.Command, model, prompt string) error {
-	if len(strings.TrimSpace(prompt)) > 0 {
-		client, err := api.FromEnv()
-		if err != nil {
-			return err
-		}
-
-		spinner := NewSpinner("")
-		go spinner.Spin(60 * time.Millisecond)
-
-		var latest api.GenerateResponse
-
-		generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
-		if !ok {
-			generateContext = []int{}
-		}
-
-		request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
-		fn := func(response api.GenerateResponse) error {
-			if !spinner.IsFinished() {
-				spinner.Finish()
-			}
-
-			latest = response
-
-			fmt.Print(response.Response)
-			return nil
-		}
-
-		if err := client.Generate(context.Background(), &request, fn); err != nil {
-			if strings.Contains(err.Error(), "failed to load model") {
-				// tell the user to check the server log, if it exists locally
-				home, nestedErr := os.UserHomeDir()
-				if nestedErr != nil {
-					// return the original error
-					return err
-				}
-				logPath := filepath.Join(home, ".ollama", "logs", "server.log")
-				if _, nestedErr := os.Stat(logPath); nestedErr == nil {
-					err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
-				}
-			}
-			return err
-		}
-
-		fmt.Println()
-		fmt.Println()
-
-		if !latest.Done {
-			return errors.New("unexpected end of response")
-		}
-
-		verbose, err := cmd.Flags().GetBool("verbose")
-		if err != nil {
-			return err
-		}
-
-		if verbose {
-			latest.Summary()
-		}
-
-		ctx := cmd.Context()
-		ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
-		cmd.SetContext(ctx)
-	}
+	client, err := api.FromEnv()
+	if err != nil {
+		return err
+	}
+
+	spinner := NewSpinner("")
+	go spinner.Spin(60 * time.Millisecond)
+
+	var latest api.GenerateResponse
+
+	generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
+	if !ok {
+		generateContext = []int{}
+	}
+
+	var wrapTerm bool
+	termType := os.Getenv("TERM")
+	if termType == "xterm-256color" {
+		wrapTerm = true
+	}
+
+	termWidth, _, err := term.GetSize(int(0))
+	if err != nil {
+		wrapTerm = false
+	}
+
+	// override wrapping if the user turned it off
+	nowrap, err := cmd.Flags().GetBool("nowordwrap")
+	if err != nil {
+		return err
+	}
+	if nowrap {
+		wrapTerm = false
+	}
+
+	var currentLineLength int
+	var wordBuffer string
+
+	request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
+	fn := func(response api.GenerateResponse) error {
+		if !spinner.IsFinished() {
+			spinner.Finish()
+		}
+
+		latest = response
+
+		if wrapTerm {
+			for _, ch := range response.Response {
+				if currentLineLength+1 > termWidth-5 {
+					// backtrack the length of the last word and clear to the end of the line
+					fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
+					fmt.Printf("%s%c", wordBuffer, ch)
+					currentLineLength = len(wordBuffer) + 1
+				} else {
+					fmt.Print(string(ch))
+					currentLineLength += 1
+
+					switch ch {
+					case ' ':
+						wordBuffer = ""
+					case '\n':
+						currentLineLength = 0
+					default:
+						wordBuffer += string(ch)
+					}
+				}
+			}
+		} else {
+			fmt.Print(response.Response)
+		}
+
+		return nil
+	}
+
+	if err := client.Generate(context.Background(), &request, fn); err != nil {
+		if strings.Contains(err.Error(), "failed to load model") {
+			// tell the user to check the server log, if it exists locally
+			home, nestedErr := os.UserHomeDir()
+			if nestedErr != nil {
+				// return the original error
+				return err
+			}
+			logPath := filepath.Join(home, ".ollama", "logs", "server.log")
+			if _, nestedErr := os.Stat(logPath); nestedErr == nil {
+				err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
+			}
+		}
+		return err
+	}
+
+	if prompt != "" {
+		fmt.Println()
+		fmt.Println()
+	}
+
+	if !latest.Done {
+		return errors.New("unexpected end of response")
+	}
+
+	verbose, err := cmd.Flags().GetBool("verbose")
+	if err != nil {
+		return err
+	}
+
+	if verbose {
+		latest.Summary()
+	}
+
+	ctx := cmd.Context()
+	ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
+	cmd.SetContext(ctx)

 	return nil
 }
@@ -461,19 +515,21 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		return err
 	}

-	// load the model
-	if err := generate(cmd, model, ""); err != nil {
-		return err
-	}
-
 	completer := readline.NewPrefixCompleter(
 		readline.PcItem("/help"),
 		readline.PcItem("/list"),
 		readline.PcItem("/set",
 			readline.PcItem("history"),
 			readline.PcItem("nohistory"),
+			readline.PcItem("wordwrap"),
+			readline.PcItem("nowordwrap"),
 			readline.PcItem("verbose"),
 			readline.PcItem("quiet"),
+			readline.PcItem("mode",
+				readline.PcItem("vim"),
+				readline.PcItem("emacs"),
+				readline.PcItem("default"),
+			),
 		),
 		readline.PcItem("/show",
 			readline.PcItem("license"),
@@ -491,7 +547,10 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		fmt.Fprintln(os.Stderr, completer.Tree(" "))
 	}

+	var painter Painter
+
 	config := readline.Config{
+		Painter:      &painter,
 		Prompt:       ">>> ",
 		HistoryFile:  filepath.Join(home, ".ollama", "history"),
 		AutoComplete: completer,
@@ -527,6 +586,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		case isMultiLine:
 			if strings.HasSuffix(line, `"""`) {
 				isMultiLine = false
+				painter.IsMultiLine = isMultiLine
 				multiLineBuffer += strings.TrimSuffix(line, `"""`)
 				line = multiLineBuffer
 				multiLineBuffer = ""
@@ -537,6 +597,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 			}
 		case strings.HasPrefix(line, `"""`):
 			isMultiLine = true
+			painter.IsMultiLine = isMultiLine
 			multiLineBuffer = strings.TrimPrefix(line, `"""`) + " "
 			scanner.SetPrompt("... ")
 			continue
@@ -545,45 +606,42 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 			if err := ListHandler(cmd, args[1:]); err != nil {
 				return err
 			}
-			continue
 		case strings.HasPrefix(line, "/set"):
 			args := strings.Fields(line)
 			if len(args) > 1 {
 				switch args[1] {
 				case "history":
 					scanner.HistoryEnable()
-					continue
 				case "nohistory":
 					scanner.HistoryDisable()
-					continue
+				case "wordwrap":
+					cmd.Flags().Set("nowordwrap", "false")
+					fmt.Println("Set 'wordwrap' mode.")
+				case "nowordwrap":
+					cmd.Flags().Set("nowordwrap", "true")
+					fmt.Println("Set 'nowordwrap' mode.")
 				case "verbose":
 					cmd.Flags().Set("verbose", "true")
-					continue
+					fmt.Println("Set 'verbose' mode.")
 				case "quiet":
 					cmd.Flags().Set("verbose", "false")
-					continue
+					fmt.Println("Set 'quiet' mode.")
 				case "mode":
 					if len(args) > 2 {
 						switch args[2] {
 						case "vim":
 							scanner.SetVimMode(true)
-							continue
 						case "emacs", "default":
 							scanner.SetVimMode(false)
-							continue
 						default:
 							usage()
-							continue
 						}
 					} else {
 						usage()
-						continue
 					}
 				}
 			} else {
 				usage()
-				continue
 			}
 		case strings.HasPrefix(line, "/show"):
 			args := strings.Fields(line)
@@ -591,7 +649,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				resp, err := server.GetModelInfo(model)
 				if err != nil {
 					fmt.Println("error: couldn't get model")
-					continue
 				}

 				switch args[1] {
@@ -608,21 +665,22 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				default:
 					fmt.Println("error: unknown command")
 				}
-				continue
 			} else {
 				usage()
-				continue
 			}
 		case line == "/help", line == "/?":
 			usage()
-			continue
 		case line == "/exit", line == "/bye":
 			return nil
+		case strings.HasPrefix(line, "/"):
+			args := strings.Fields(line)
+			fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
 		}

-		if err := generate(cmd, model, line); err != nil {
-			return err
+		if len(line) > 0 && line[0] != '/' {
+			if err := generate(cmd, model, line); err != nil {
+				return err
+			}
 		}
 	}
 }
@@ -641,28 +699,19 @@ func generateBatch(cmd *cobra.Command, model string) error {
 }

 func RunServer(cmd *cobra.Command, _ []string) error {
-	host, port := "127.0.0.1", "11434"
-
-	parts := strings.Split(os.Getenv("OLLAMA_HOST"), ":")
-	if ip := net.ParseIP(parts[0]); ip != nil {
-		host = ip.String()
-	}
-
-	if len(parts) > 1 {
-		port = parts[1]
-	}
-
-	// deprecated: include port in OLLAMA_HOST
-	if p := os.Getenv("OLLAMA_PORT"); p != "" {
-		port = p
-	}
-
-	err := initializeKeypair()
+	host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
 	if err != nil {
+		host, port = "127.0.0.1", "11434"
+		if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
+			host = ip.String()
+		}
+	}
+
+	if err := initializeKeypair(); err != nil {
 		return err
 	}

-	ln, err := net.Listen("tcp", fmt.Sprintf("%s:%s", host, port))
+	ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
 	if err != nil {
 		return err
 	}
@@ -672,6 +721,12 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 		origins = strings.Split(o, ",")
 	}

+	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+		if err := server.PruneLayers(); err != nil {
+			return err
+		}
+	}
+
 	return server.Serve(ln, origins)
 }
@@ -697,7 +752,7 @@ func initializeKeypair() error {
 		return err
 	}

-	err = os.MkdirAll(path.Dir(privKeyPath), 0o700)
+	err = os.MkdirAll(filepath.Dir(privKeyPath), 0o755)
 	if err != nil {
 		return fmt.Errorf("could not create directory %w", err)
 	}
@@ -825,6 +880,7 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("verbose", false, "Show timings for response")
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
+	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")

 	serveCmd := &cobra.Command{
 		Use:     "serve",


@@ -3,18 +3,21 @@
 ## Endpoints

 - [Generate a completion](#generate-a-completion)
-- [Create a model](#create-a-model)
-- [List local models](#list-local-models)
-- [Copy a model](#copy-a-model)
-- [Delete a model](#delete-a-model)
-- [Pull a model](#pull-a-model)
-- [Generate embeddings](#generate-embeddings)
+- [Create a Model](#create-a-model)
+- [List Local Models](#list-local-models)
+- [Show Model Information](#show-model-information)
+- [Copy a Model](#copy-a-model)
+- [Delete a Model](#delete-a-model)
+- [Pull a Model](#pull-a-model)
+- [Push a Model](#push-a-model)
+- [Generate Embeddings](#generate-embeddings)

 ## Conventions

 ### Model names

-Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and if not provided will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations

@@ -22,7 +25,7 @@ All durations are returned in nanoseconds.
 ## Generate a completion

-```
+```shell
 POST /api/generate
 ```

@@ -42,7 +45,7 @@ Advanced parameters:
 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/generate -d '{
   "model": "llama2:7b",
   "prompt": "Why is the sky blue?"

@@ -95,7 +98,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 ## Create a Model

-```
+```shell
 POST /api/create
 ```

@@ -108,7 +111,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/create -d '{
   "name": "mario",
   "path": "~/Modelfile"

@@ -117,7 +120,7 @@ curl -X POST http://localhost:11434/api/create -d '{
 ### Response

-A stream of JSON objects. When finished, `status` is `success`
+A stream of JSON objects. When finished, `status` is `success`.

 ```json
 {

@@ -127,7 +130,7 @@ A stream of JSON objects. When finished, `status` is `success`
 ## List Local Models

-```
+```shell
 GET /api/tags
 ```

@@ -135,7 +138,7 @@ List models that are available locally.
 ### Request

-```
+```shell
 curl http://localhost:11434/api/tags
 ```

@@ -158,9 +161,40 @@
   }
 }
 ```

+## Show Model Information
+
+```shell
+POST /api/show
+```
+
+Show details about a model including modelfile, template, parameters, license, and system prompt.
+
+### Parameters
+
+- `name`: name of the model to show
+
+### Request
+
+```shell
+curl http://localhost:11434/api/show -d '{
+  "name": "llama2:7b"
+}'
+```
+
+### Response
+
+```json
+{
+  "license": "<contents of license block>",
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
+  "parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
+  "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
+}
+```
+
 ## Copy a Model

-```
+```shell
 POST /api/copy
 ```

@@ -168,7 +202,7 @@ Copy a model. Creates a model with another name from an existing model.
 ### Request

-```
+```shell
 curl http://localhost:11434/api/copy -d '{
   "source": "llama2:7b",
   "destination": "llama2-backup"

@@ -177,7 +211,7 @@ curl http://localhost:11434/api/copy -d '{
 ## Delete a Model

-```
+```shell
 DELETE /api/delete
 ```

@@ -189,7 +223,7 @@ Delete a model and its data.
 ### Request

-```
+```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
   "name": "llama2:13b"
 }'

@@ -197,19 +231,20 @@ curl -X DELETE http://localhost:11434/api/delete -d '{
 ## Pull a Model

-```
+```shell
 POST /api/pull
 ```

-Download a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple calls to will share the same download progress.
+Download a model from the ollama library. Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress.

 ### Parameters

 - `name`: name of the model to pull
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.

 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/pull -d '{
   "name": "llama2:7b"
 }'

@@ -225,9 +260,63 @@ curl -X POST http://localhost:11434/api/pull -d '{
   }
 }
 ```

+## Push a Model
+
+```shell
+POST /api/push
+```
+
+Upload a model to a model library. Requires registering for ollama.ai and adding a public key first.
+
+### Parameters
+
+- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
+
+### Request
+
+```shell
+curl -X POST http://localhost:11434/api/push -d '{
+  "name": "mattw/pygmalion:latest"
+}'
+```
+
+### Response
+
+Streaming response that starts with:
+
+```json
+{"status":"retrieving manifest"}
+```
+
+and then:
+
+```json
+{
+  "status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total":1928429856
+}
+```
+
+Then there is a series of uploading responses:
+
+```json
+{
+  "status":"starting upload",
+  "digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total":1928429856}
+```
+
+Finally, when the upload is complete:
+
+```json
+{"status":"pushing manifest"}
+{"status":"success"}
+```
+
 ## Generate Embeddings

-```
+```shell
 POST /api/embeddings
 ```

@@ -244,7 +333,7 @@ Advanced parameters:
 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/embeddings -d '{
   "model": "llama2:7b",
   "prompt": "Here is an article about llamas..."

@@ -259,5 +348,4 @@ curl -X POST http://localhost:11434/api/embeddings -d '{
     0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
     0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
   ]
-}
-```
+}```


@@ -6,6 +6,10 @@
 Install required tools:

+- cmake version 3.24 or higher
+- go version 1.20 or higher
+- gcc version 11.4.0 or higher
+
 ```
 brew install go cmake gcc
 ```

@@ -27,3 +31,9 @@ Now you can run `ollama`:
 ```
 ./ollama
 ```
+
+## Building on Linux with GPU support
+
+- Install cmake and nvidia-cuda-toolkit
+- run `go generate ./...`
+- run `go build .`

docs/linux.md (new file, 83 lines)

@@ -0,0 +1,83 @@
# Installing Ollama on Linux
> Note: A one-line installer for Ollama is available by running:
>
> ```
> curl https://ollama.ai/install.sh | sh
> ```
## Download the `ollama` binary
Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
```
sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
```
## Start Ollama
Start Ollama by running `ollama serve`:
```
ollama serve
```
Once Ollama is running, run a model in another terminal session:
```
ollama run llama2
```
## Install CUDA drivers (optional for Nvidia GPUs)
[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
Verify that the drivers are installed by running the following command, which should print details about your GPU:
```
nvidia-smi
```
## Adding Ollama as a startup service (optional)
Create a user for Ollama:
```
sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
```
Create a service file in `/etc/systemd/system/ollama.service`:
```ini
[Unit]
Description=Ollama Service
After=network-online.target
[Service]
ExecStart=/usr/bin/ollama serve
User=ollama
Group=ollama
Restart=always
RestartSec=3
Environment="HOME=/usr/share/ollama"
[Install]
WantedBy=default.target
```
Then start the service:
```
sudo systemctl daemon-reload
sudo systemctl enable ollama
```
### Viewing logs
To view logs of Ollama running as a startup service, run:
```
journalctl -u ollama
```
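
To confirm the service configured above is actually reachable, a small check mirroring the heartbeat helper in api/client.py can be used (a sketch, not part of the diff; it assumes the server is listening on the default localhost port):

```python
# Quick reachability check for a locally running Ollama server.
# Assumes the default address; set OLLAMA_HOST (with scheme) if it binds elsewhere.
import os
import requests

base_url = os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434")

try:
    response = requests.head(base_url, timeout=5)
    response.raise_for_status()
    print("Ollama is running")
except requests.exceptions.RequestException as exc:
    print(f"Ollama is not reachable: {exc}")
```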

go.mod (2 changed lines)

@@ -8,6 +8,7 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
 	github.com/olekukonko/tablewriter v0.0.5
+	github.com/pdevine/readline v1.5.2
 	github.com/spf13/cobra v1.7.0
 )

@@ -16,7 +17,6 @@ require github.com/rivo/uniseg v0.2.0 // indirect
 require (
 	github.com/bytedance/sonic v1.9.1 // indirect
 	github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
-	github.com/chzyer/readline v1.5.1
 	github.com/gabriel-vasile/mimetype v1.4.2 // indirect
 	github.com/gin-contrib/cors v1.4.0
 	github.com/gin-contrib/sse v0.1.0 // indirect

go.sum (5 changed lines)

@@ -6,8 +6,6 @@ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhD
 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
 github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM=
 github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ=
-github.com/chzyer/readline v1.5.1 h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=
-github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk=
 github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
 github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=

@@ -80,6 +78,8 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
+github.com/pdevine/readline v1.5.2 h1:oz6Y5GdTmhPG+08hhxcAvtHitSANWuA2100Sppb38xI=
+github.com/pdevine/readline v1.5.2/go.mod h1:na/LbuE5PYwxI7GyopWdIs3U8HVe89lYlNTFTXH3wOw=
 github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
 github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
 github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=

@@ -120,7 +120,6 @@ golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
 golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
-golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
 golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=

llm/falcon.go (new file, 22 lines)

@@ -0,0 +1,22 @@
package llm

const ModelFamilyFalcon = "falcon"

const (
	falconModelType7B   = 32
	falconModelType40B  = 60
	falconModelType180B = 80
)

func falconModelType(numLayer uint32) string {
	switch numLayer {
	case 32:
		return "7B"
	case 60:
		return "40B"
	case 80:
		return "180B"
	default:
		return "Unknown"
	}
}


@@ -3,72 +3,96 @@ package llm
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )

-type ModelFamily string
-
-type ModelType uint32
-
-const (
-	ModelType3B  ModelType = 26
-	ModelType7B  ModelType = 32
-	ModelType13B ModelType = 40
-	ModelType34B ModelType = 48
-	ModelType30B ModelType = 60
-	ModelType65B ModelType = 80
-)
-
-func (mt ModelType) String() string {
-	switch mt {
-	case ModelType3B:
-		return "3B"
-	case ModelType7B:
-		return "7B"
-	case ModelType13B:
-		return "13B"
-	case ModelType34B:
-		return "34B"
-	case ModelType30B:
-		return "30B"
-	case ModelType65B:
-		return "65B"
-	default:
-		return "Unknown"
-	}
-}
-
-type FileType interface {
-	String() string
-}
-
 type GGML struct {
 	magic uint32
 	container
 	model
 }

+const (
+	fileTypeF32 uint32 = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ8_0 uint32 = iota + 2
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+)
+
+func fileType(fileType uint32) string {
+	switch fileType {
+	case fileTypeF32:
+		return "F32"
+	case fileTypeF16:
+		return "F16"
+	case fileTypeQ4_0:
+		return "Q4_0"
+	case fileTypeQ4_1:
+		return "Q4_1"
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
+		return "Q8_0"
+	case fileTypeQ5_0:
+		return "Q5_0"
+	case fileTypeQ5_1:
+		return "Q5_1"
+	case fileTypeQ2_K:
+		return "Q2_K"
+	case fileTypeQ3_K_S:
+		return "Q3_K_S"
+	case fileTypeQ3_K_M:
+		return "Q3_K_M"
+	case fileTypeQ3_K_L:
+		return "Q3_K_L"
+	case fileTypeQ4_K_S:
+		return "Q4_K_S"
+	case fileTypeQ4_K_M:
+		return "Q4_K_M"
+	case fileTypeQ5_K_S:
+		return "Q5_K_S"
+	case fileTypeQ5_K_M:
+		return "Q5_K_M"
+	case fileTypeQ6_K:
+		return "Q6_K"
+	default:
+		return "Unknown"
+	}
+}
+
 type model interface {
-	ModelFamily() ModelFamily
-	ModelType() ModelType
-	FileType() FileType
+	ModelFamily() string
+	ModelType() string
+	FileType() string
+	NumLayers() int64
 }

 type container interface {
 	Name() string
-	Decode(io.Reader) error
+	Decode(io.Reader) (model, error)
 }

-type containerGGML struct {
-}
+type containerGGML struct{}

 func (c *containerGGML) Name() string {
 	return "ggml"
 }

-func (c *containerGGML) Decode(r io.Reader) error {
-	return nil
+func (c *containerGGML) Decode(r io.Reader) (model, error) {
+	return nil, nil
 }

 type containerGGMF struct {

@@ -79,18 +103,18 @@ func (c *containerGGMF) Name() string {
 	return "ggmf"
 }

-func (c *containerGGMF) Decode(r io.Reader) error {
+func (c *containerGGMF) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}

 	c.version = version
-	return nil
+	return nil, nil
 }

 type containerGGJT struct {

@@ -101,18 +125,22 @@ func (c *containerGGJT) Name() string {
 	return "ggjt"
 }

-func (c *containerGGJT) Decode(r io.Reader) error {
+func (c *containerGGJT) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1, 2, 3:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}

 	c.version = version
-	return nil
+
+	// different model types may have different layouts for hyperparameters
+	var llama llamaModel
+	binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
+	return &llama, nil
 }

 type containerLORA struct {

@@ -123,32 +151,34 @@ func (c *containerLORA) Name() string {
 	return "ggla"
 }

-func (c *containerLORA) Decode(r io.Reader) error {
+func (c *containerLORA) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}

 	c.version = version
-	return nil
+	return nil, nil
 }

 const (
-	// / Magic constant for `ggml` files (unversioned).
+	// Magic constant for `ggml` files (unversioned).
 	FILE_MAGIC_GGML = 0x67676d6c
-	// / Magic constant for `ggml` files (versioned, ggmf).
+	// Magic constant for `ggml` files (versioned, ggmf).
 	FILE_MAGIC_GGMF = 0x67676d66
-	// / Magic constant for `ggml` files (versioned, ggjt).
+	// Magic constant for `ggml` files (versioned, ggjt).
 	FILE_MAGIC_GGJT = 0x67676a74
-	// / Magic constant for `ggla` files (LoRA adapter).
+	// Magic constant for `ggla` files (LoRA adapter).
 	FILE_MAGIC_GGLA = 0x67676C61
+	// Magic constant for `gguf` files (versioned, gguf)
+	FILE_MAGIC_GGUF = 0x46554747
 )

-func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
+func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	var ggml GGML
 	binary.Read(r, binary.LittleEndian, &ggml.magic)

@@ -161,24 +191,18 @@ func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
 		ggml.container = &containerGGJT{}
 	case FILE_MAGIC_GGLA:
 		ggml.container = &containerLORA{}
+	case FILE_MAGIC_GGUF:
ggml.container = &containerGGUF{}
default: default:
return nil, errors.New("invalid file magic") return nil, errors.New("invalid file magic")
} }
if err := ggml.Decode(r); err != nil { model, err := ggml.Decode(r)
if err != nil {
return nil, err return nil, err
} }
// different model types may have different layouts for hyperparameters ggml.model = model
switch hint {
case ModelFamilyLlama:
var llama llamaModel
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
ggml.model = &llama
// TODO: sanity check hyperparameters
default:
return nil, fmt.Errorf("unsupported model type: %s", hint)
}
// final model type // final model type
return &ggml, nil return &ggml, nil
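With the family hint gone, DecodeGGML sniffs the container from the file magic alone and each container decodes its own model metadata, so a caller only needs the file. A minimal sketch of the new surface, assuming the module path shown elsewhere in the diff and the exported methods above (Name, ModelFamily, ModelType, FileType and NumLayers are promoted from the embedded container and model):

package main

import (
    "fmt"
    "log"
    "os"

    "github.com/jmorganca/ollama/llm" // assumed package path from this diff
)

func main() {
    f, err := os.Open(os.Args[1]) // path to a ggml/ggmf/ggjt/ggla/gguf model file
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    // DecodeGGML reads the magic, picks the matching container and decodes
    // its metadata; no model-family hint is needed anymore.
    ggml, err := llm.DecodeGGML(f)
    if err != nil {
        log.Fatal(err)
    }

    fmt.Println("format:", ggml.Name())        // e.g. "gguf"
    fmt.Println("family:", ggml.ModelFamily()) // e.g. "llama"
    fmt.Println("size:", ggml.ModelType())     // e.g. "7B"
    fmt.Println("quant:", ggml.FileType())     // e.g. "Q4_0"
    fmt.Println("layers:", ggml.NumLayers())
}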

379
llm/gguf.go Normal file
View File

@@ -0,0 +1,379 @@
package llm
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
)
type containerGGUF struct {
Version uint32
V1 struct {
NumTensor uint32
NumKV uint32
}
V2 struct {
NumTensor uint64
NumKV uint64
}
}
func (c *containerGGUF) Name() string {
return "gguf"
}
func (c *containerGGUF) Decode(r io.Reader) (model, error) {
binary.Read(r, binary.LittleEndian, &c.Version)
switch c.Version {
case 1:
binary.Read(r, binary.LittleEndian, &c.V1)
case 2:
binary.Read(r, binary.LittleEndian, &c.V2)
default:
return nil, errors.New("invalid version")
}
model := newGGUFModel(c)
if err := model.Decode(r); err != nil {
return nil, err
}
return model, nil
}
const (
ggufTypeUint8 uint32 = iota
ggufTypeInt8
ggufTypeUint16
ggufTypeInt16
ggufTypeUint32
ggufTypeInt32
ggufTypeFloat32
ggufTypeBool
ggufTypeString
ggufTypeArray
ggufTypeUint64
ggufTypeInt64
ggufTypeFloat64
)
type kv map[string]any
type ggufModel struct {
*containerGGUF
kv
}
func newGGUFModel(container *containerGGUF) *ggufModel {
return &ggufModel{
containerGGUF: container,
kv: make(kv),
}
}
func (llm *ggufModel) NumKV() uint64 {
if llm.Version == 1 {
return uint64(llm.V1.NumKV)
}
return llm.V2.NumKV
}
func (llm *ggufModel) ModelFamily() string {
t, ok := llm.kv["general.architecture"].(string)
if ok {
return t
}
return "unknown"
}
func (llm *ggufModel) ModelType() string {
switch llm.ModelFamily() {
case "llama":
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
heads, headsOK := llm.kv["llama.head_count"].(uint32)
headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
if headsOK && headsKVsOK && heads/headKVs == 8 {
return "70B"
}
return llamaModelType(blocks)
}
case "falcon":
if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
return falconModelType(blocks)
}
}
return "Unknown"
}
func (llm *ggufModel) FileType() string {
t, ok := llm.kv["general.file_type"].(uint32)
if ok {
return fileType(t)
}
return "Unknown"
}
func (llm *ggufModel) Decode(r io.Reader) error {
read := llm.readString
if llm.Version == 1 {
read = llm.readStringV1
}
for i := 0; uint64(i) < llm.NumKV(); i++ {
k, err := read(r)
if err != nil {
return err
}
vtype := llm.readU32(r)
var v any
switch vtype {
case ggufTypeUint8:
v = llm.readU8(r)
case ggufTypeInt8:
v = llm.readI8(r)
case ggufTypeUint16:
v = llm.readU16(r)
case ggufTypeInt16:
v = llm.readI16(r)
case ggufTypeUint32:
v = llm.readU32(r)
case ggufTypeInt32:
v = llm.readI32(r)
case ggufTypeUint64:
v = llm.readU64(r)
case ggufTypeInt64:
v = llm.readI64(r)
case ggufTypeFloat32:
v = llm.readF32(r)
case ggufTypeFloat64:
v = llm.readF64(r)
case ggufTypeBool:
v = llm.readBool(r)
case ggufTypeString:
fn := llm.readString
if llm.Version == 1 {
fn = llm.readStringV1
}
s, err := fn(r)
if err != nil {
return err
}
v = s
case ggufTypeArray:
fn := llm.readArray
if llm.Version == 1 {
fn = llm.readArrayV1
}
a, err := fn(r)
if err != nil {
return err
}
v = a
default:
return fmt.Errorf("invalid type: %d", vtype)
}
llm.kv[k] = v
}
return nil
}
func (llm *ggufModel) NumLayers() int64 {
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
if !exists {
return 0
}
v := value.(uint32)
return int64(v)
}
func (ggufModel) readU8(r io.Reader) uint8 {
var u8 uint8
binary.Read(r, binary.LittleEndian, &u8)
return u8
}
func (ggufModel) readI8(r io.Reader) int8 {
var i8 int8
binary.Read(r, binary.LittleEndian, &i8)
return i8
}
func (ggufModel) readU16(r io.Reader) uint16 {
var u16 uint16
binary.Read(r, binary.LittleEndian, &u16)
return u16
}
func (ggufModel) readI16(r io.Reader) int16 {
var i16 int16
binary.Read(r, binary.LittleEndian, &i16)
return i16
}
func (ggufModel) readU32(r io.Reader) uint32 {
var u32 uint32
binary.Read(r, binary.LittleEndian, &u32)
return u32
}
func (ggufModel) readI32(r io.Reader) int32 {
var i32 int32
binary.Read(r, binary.LittleEndian, &i32)
return i32
}
func (ggufModel) readU64(r io.Reader) uint64 {
var u64 uint64
binary.Read(r, binary.LittleEndian, &u64)
return u64
}
func (ggufModel) readI64(r io.Reader) int64 {
var i64 int64
binary.Read(r, binary.LittleEndian, &i64)
return i64
}
func (ggufModel) readF32(r io.Reader) float32 {
var f32 float32
binary.Read(r, binary.LittleEndian, &f32)
return f32
}
func (ggufModel) readF64(r io.Reader) float64 {
var f64 float64
binary.Read(r, binary.LittleEndian, &f64)
return f64
}
func (ggufModel) readBool(r io.Reader) bool {
var b bool
binary.Read(r, binary.LittleEndian, &b)
return b
}
func (ggufModel) readStringV1(r io.Reader) (string, error) {
var nameLength uint32
binary.Read(r, binary.LittleEndian, &nameLength)
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
return "", err
}
// gguf v1 strings are null-terminated
b.Truncate(b.Len() - 1)
return b.String(), nil
}
func (llm ggufModel) readString(r io.Reader) (string, error) {
var nameLength uint64
binary.Read(r, binary.LittleEndian, &nameLength)
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
return "", err
}
return b.String(), nil
}
func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
atype := llm.readU32(r)
n := llm.readU32(r)
for i := 0; uint32(i) < n; i++ {
switch atype {
case ggufTypeUint8:
arr = append(arr, llm.readU8(r))
case ggufTypeInt8:
arr = append(arr, llm.readU8(r))
case ggufTypeUint16:
arr = append(arr, llm.readU16(r))
case ggufTypeInt16:
arr = append(arr, llm.readI16(r))
case ggufTypeUint32:
arr = append(arr, llm.readU32(r))
case ggufTypeInt32:
arr = append(arr, llm.readI32(r))
case ggufTypeFloat32:
arr = append(arr, llm.readF32(r))
case ggufTypeBool:
arr = append(arr, llm.readBool(r))
case ggufTypeString:
s, err := llm.readStringV1(r)
if err != nil {
return nil, err
}
arr = append(arr, s)
default:
return nil, fmt.Errorf("invalid array type: %d", atype)
}
}
return
}
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
atype := llm.readU32(r)
n := llm.readU64(r)
for i := 0; uint64(i) < n; i++ {
switch atype {
case ggufTypeUint8:
arr = append(arr, llm.readU8(r))
case ggufTypeInt8:
arr = append(arr, llm.readU8(r))
case ggufTypeUint16:
arr = append(arr, llm.readU16(r))
case ggufTypeInt16:
arr = append(arr, llm.readI16(r))
case ggufTypeUint32:
arr = append(arr, llm.readU32(r))
case ggufTypeInt32:
arr = append(arr, llm.readI32(r))
case ggufTypeUint64:
arr = append(arr, llm.readU64(r))
case ggufTypeInt64:
arr = append(arr, llm.readI64(r))
case ggufTypeFloat32:
arr = append(arr, llm.readF32(r))
case ggufTypeFloat64:
arr = append(arr, llm.readF64(r))
case ggufTypeBool:
arr = append(arr, llm.readBool(r))
case ggufTypeString:
s, err := llm.readString(r)
if err != nil {
return nil, err
}
arr = append(arr, s)
default:
return nil, fmt.Errorf("invalid array type: %d", atype)
}
}
return
}
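ggufModel walks the GGUF key/value section after containerGGUF reads the fixed header: a 4-byte magic (read little-endian as 0x46554747, i.e. "GGUF"), a uint32 version, then tensor and KV counts — uint32 pairs in v1, uint64 in v2 — followed by NumKV typed records. Version 1 strings carry a trailing NUL byte (hence the Truncate in readStringV1), while version 2 strings use uint64 lengths. A self-contained sketch that reads just the fixed header, for illustration only:

package main

import (
    "encoding/binary"
    "fmt"
    "log"
    "os"
)

func main() {
    f, err := os.Open(os.Args[1]) // path to a .gguf file
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    var magic, version uint32
    binary.Read(f, binary.LittleEndian, &magic)
    if magic != 0x46554747 { // "GGUF" read as a little-endian uint32
        log.Fatalf("not a gguf file: magic %#x", magic)
    }
    binary.Read(f, binary.LittleEndian, &version)

    var numTensor, numKV uint64
    switch version {
    case 1:
        var t, k uint32
        binary.Read(f, binary.LittleEndian, &t)
        binary.Read(f, binary.LittleEndian, &k)
        numTensor, numKV = uint64(t), uint64(k)
    case 2:
        binary.Read(f, binary.LittleEndian, &numTensor)
        binary.Read(f, binary.LittleEndian, &numKV)
    default:
        log.Fatalf("unsupported gguf version %d", version)
    }

    fmt.Printf("gguf v%d: %d tensors, %d metadata keys\n", version, numTensor, numKV)
}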

View File

@@ -1,13 +0,0 @@
//go:build !darwin
// +build !darwin
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release

View File

@@ -1,10 +1,16 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch //go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release //go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/cpu --target server --config Release

View File

@@ -1,10 +1,16 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch //go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/gpu --target server --config Release //go:generate cmake --build ggml/build/metal --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/metal --target server --config Release

View File

@@ -0,0 +1,22 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cuda --target server --config Release

View File

@@ -0,0 +1,14 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cpu --target server --config Release
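Each llm_*.go file is a per-platform build recipe: go generate checks out the ggml and gguf submodules, applies the local patches, and builds the llama.cpp server target into build/<variant>/bin (cpu, metal, or cuda). Those paths are what the //go:embed directive in llm/llama.go packs into the binary, and chooseRunners later globs them back out of the embedded filesystem. A small sketch of that glob pattern using an in-memory stand-in for the embedded FS (paths are illustrative):

package main

import (
    "fmt"
    "io/fs"
    "log"
    "testing/fstest"
)

func main() {
    // Stand-in for the filesystem produced by
    // //go:embed llama.cpp/*/build/*/bin/* in llm/llama.go.
    runners := fstest.MapFS{
        "llama.cpp/gguf/build/metal/bin/server": {Data: []byte("binary")},
        "llama.cpp/gguf/build/cpu/bin/server":   {Data: []byte("binary")},
        "llama.cpp/ggml/build/cpu/bin/server":   {Data: []byte("binary")},
    }

    // chooseRunners globs each candidate's bin directory like this, in
    // priority order, and copies whatever it finds into the work directory.
    matches, err := fs.Glob(runners, "llama.cpp/gguf/build/*/bin/*")
    if err != nil {
        log.Fatal(err)
    }
    for _, m := range matches {
        fmt.Println(m)
    }
}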

View File

@@ -1,32 +0,0 @@
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Tue, 5 Sep 2023 16:05:08 -0400
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
---
ggml-metal.metal | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
--
2.39.2 (Apple Git-143)

1
llm/llama.cpp/gguf Submodule

Submodule llm/llama.cpp/gguf added at bc9d3e3971

View File

@@ -0,0 +1,27 @@
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries
---
CMakeLists.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
--
2.42.0

View File

@@ -0,0 +1,25 @@
From 07993bdc35345b67b27aa649a7c099ad42d80c4c Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 21 Sep 2023 14:43:21 -0700
Subject: [PATCH] remove warm up logging
---
common/common.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 2597ba0..b56549b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -780,8 +780,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
{
- LOG("warming up the model with an empty run\n");
-
const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
llama_reset_timings(lctx);
--
2.42.0

View File

@@ -0,0 +1,32 @@
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
From: Kylin <56434533+KyL0N@users.noreply.github.com>
Date: Tue, 22 Aug 2023 15:14:23 +0800
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
* ggml: support CUDA's half type for aarch64(#1455)
support CUDA's half type for aarch64 in ggml_fp16_t definition
* ggml: use __CUDACC__ to recognise nvcc compiler
---
ggml.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml.h b/ggml.h
index 544ad2d..0ec7ec5 100644
--- a/ggml.h
+++ b/ggml.h
@@ -259,8 +259,9 @@
extern "C" {
#endif
-#ifdef __ARM_NEON
- // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+ typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
--
2.39.2 (Apple Git-143)

View File

@@ -20,127 +20,140 @@ import (
"runtime" "runtime"
"strconv" "strconv"
"strings" "strings"
"sync"
"time" "time"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
) )
const ModelFamilyLlama ModelFamily = "llama" //go:embed llama.cpp/*/build/*/bin/*
//go:embed llama.cpp/ggml/build/*/bin/*
var llamaCppEmbed embed.FS var llamaCppEmbed embed.FS
var (
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
)
var (
ggmlInit sync.Once
ggmlRunnerPath string
)
func osPath(llamaPath string) string {
if runtime.GOOS == "windows" {
return path.Join(llamaPath, "Release")
}
return llamaPath
}
func initGGML() {
ggmlInit.Do(func() {
tmpDir, err := os.MkdirTemp("", "llama-*")
if err != nil {
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
}
llamaPath := osPath(ggmlGPU)
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
llamaPath = osPath(ggmlCPU)
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
log.Fatalf("llama.cpp executable not found")
}
}
files := []string{"server"}
switch runtime.GOOS {
case "windows":
files = []string{"server.exe"}
case "darwin":
if llamaPath == osPath(ggmlGPU) {
files = append(files, "ggml-metal.metal")
}
}
for _, f := range files {
srcPath := path.Join(llamaPath, f)
destPath := filepath.Join(tmpDir, f)
srcFile, err := llamaCppEmbed.Open(srcPath)
if err != nil {
log.Fatalf("read llama.cpp %s: %v", f, err)
}
defer srcFile.Close()
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
log.Fatalf("write llama.cpp %s: %v", f, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
log.Fatalf("copy llama.cpp %s: %v", f, err)
}
}
ggmlRunnerPath = filepath.Join(tmpDir, "server")
if runtime.GOOS == "windows" {
ggmlRunnerPath = filepath.Join(tmpDir, "server.exe")
}
})
}
type ModelRunner struct { type ModelRunner struct {
Path string // path to the model runner executable Path string // path to the model runner executable
} }
func ggmlRunner() ModelRunner { func chooseRunners(workDir, runnerType string) []ModelRunner {
initGGML() buildPath := path.Join("llama.cpp", runnerType, "build")
return ModelRunner{Path: ggmlRunnerPath} var runners []string
// set the runners based on the OS
// IMPORTANT: the order of the runners in the array is the priority order
switch runtime.GOOS {
case "darwin":
runners = []string{
path.Join(buildPath, "metal", "bin", "server"),
path.Join(buildPath, "cpu", "bin", "server"),
}
case "linux":
runners = []string{
path.Join(buildPath, "cuda", "bin", "server"),
path.Join(buildPath, "cpu", "bin", "server"),
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []string{
path.Join(buildPath, "cpu", "bin", "Release", "server.exe"),
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []string{
path.Join(buildPath, "cpu", "bin", "server"),
}
}
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
for _, r := range runners {
// find all the files in the runner's bin directory
files, err := fs.Glob(llamaCppEmbed, filepath.Join(filepath.Dir(r), "*"))
if err != nil {
// this is expected, ollama may be compiled without all runners packed in
log.Printf("%s runner not found: %v", r, err)
continue
}
runnerAvailable = true
for _, f := range files {
srcFile, err := llamaCppEmbed.Open(f)
if err != nil {
log.Fatalf("read llama runner %s: %v", f, err)
}
defer srcFile.Close()
// create the directory in case it does not exist
destPath := filepath.Join(workDir, filepath.Dir(f))
if err := os.MkdirAll(destPath, 0o755); err != nil {
log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
}
destFile := filepath.Join(destPath, filepath.Base(f))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
log.Fatalf("write llama runner %s: %v", f, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
log.Fatalf("copy llama runner %s: %v", f, err)
}
case err != nil:
log.Fatalf("stat llama runner %s: %v", f, err)
}
}
}
if !runnerAvailable {
log.Fatalf("%s runner not found", runnerType)
}
// return the runners to try in priority order
localRunnersByPriority := []ModelRunner{}
for _, r := range runners {
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: path.Join(workDir, r)})
}
return localRunnersByPriority
} }
type llamaModel struct { type llamaModel struct {
hyperparameters llamaHyperparameters hyperparameters llamaHyperparameters
} }
func (llm *llamaModel) ModelFamily() ModelFamily { func (llm *llamaModel) ModelFamily() string {
return ModelFamilyLlama return "llama"
} }
func (llm *llamaModel) ModelType() ModelType { func llamaModelType(numLayer uint32) string {
switch llm.hyperparameters.NumLayer { switch numLayer {
case 26: case 26:
return ModelType3B return "3B"
case 32: case 32:
return ModelType7B return "7B"
case 40: case 40:
return ModelType13B return "13B"
case 48: case 48:
return ModelType34B return "34B"
case 60: case 60:
return ModelType30B return "30B"
case 80: case 80:
return ModelType65B return "65B"
default:
return "Unknown"
} }
// TODO: find a better default
return ModelType7B
} }
func (llm *llamaModel) FileType() FileType { func (llm *llamaModel) ModelType() string {
return llm.hyperparameters.FileType return llamaModelType(llm.hyperparameters.NumLayer)
}
func (llm *llamaModel) FileType() string {
return fileType(llm.hyperparameters.FileType)
}
func (llm *llamaModel) NumLayers() int64 {
return int64(llm.hyperparameters.NumLayer)
} }
type llamaHyperparameters struct { type llamaHyperparameters struct {
@@ -157,70 +170,7 @@ type llamaHyperparameters struct {
NumRot uint32 NumRot uint32
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc. // FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
FileType llamaFileType FileType uint32
}
type llamaFileType uint32
const (
llamaFileTypeF32 llamaFileType = iota
llamaFileTypeF16
llamaFileTypeQ4_0
llamaFileTypeQ4_1
llamaFileTypeQ4_1_F16
llamaFileTypeQ8_0 llamaFileType = iota + 2
llamaFileTypeQ5_0
llamaFileTypeQ5_1
llamaFileTypeQ2_K
llamaFileTypeQ3_K_S
llamaFileTypeQ3_K_M
llamaFileTypeQ3_K_L
llamaFileTypeQ4_K_S
llamaFileTypeQ4_K_M
llamaFileTypeQ5_K_S
llamaFileTypeQ5_K_M
llamaFileTypeQ6_K
)
func (ft llamaFileType) String() string {
switch ft {
case llamaFileTypeF32:
return "F32"
case llamaFileTypeF16:
return "F16"
case llamaFileTypeQ4_0:
return "Q4_0"
case llamaFileTypeQ4_1:
return "Q4_1"
case llamaFileTypeQ4_1_F16:
return "Q4_1_F16"
case llamaFileTypeQ8_0:
return "Q8_0"
case llamaFileTypeQ5_0:
return "Q5_0"
case llamaFileTypeQ5_1:
return "Q5_1"
case llamaFileTypeQ2_K:
return "Q2_K"
case llamaFileTypeQ3_K_S:
return "Q3_K_S"
case llamaFileTypeQ3_K_M:
return "Q3_K_M"
case llamaFileTypeQ3_K_L:
return "Q3_K_L"
case llamaFileTypeQ4_K_S:
return "Q4_K_S"
case llamaFileTypeQ4_K_M:
return "Q4_K_M"
case llamaFileTypeQ5_K_S:
return "Q5_K_S"
case llamaFileTypeQ5_K_M:
return "Q5_K_M"
case llamaFileTypeQ6_K:
return "Q6_K"
default:
return "Unknown"
}
} }
type Running struct { type Running struct {
@@ -234,12 +184,66 @@ type llama struct {
Running Running
} }
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) { var errNoGPU = errors.New("nvidia-smi command failed")
if _, err := os.Stat(model); err != nil {
return nil, err // CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
func CheckVRAM() (int, error) {
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoGPU
} }
if _, err := os.Stat(runner.Path); err != nil { var total int
scanner := bufio.NewScanner(&stdout)
for scanner.Scan() {
line := scanner.Text()
vram, err := strconv.Atoi(line)
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
total += vram
}
return total, nil
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
n := 1 // default to enable metal on macOS
if runtime.GOOS == "linux" {
vramMib, err := CheckVRAM()
if err != nil {
if err.Error() != "nvidia-smi command failed" {
log.Print(err.Error())
}
// nvidia driver not installed or no nvidia GPU found
return 0
}
totalVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
// Calculate bytes per layer
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
bytesPerLayer := fileSizeBytes / numLayer
// set n to the max number of layers we can fit in VRAM
return int(totalVramBytes / bytesPerLayer)
log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)
}
// default to enable metal on macOS
return 1
}
func newLlama(model string, adapters []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
fileInfo, err := os.Stat(model)
if err != nil {
return nil, err return nil, err
} }
@@ -250,14 +254,17 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
params := []string{ params := []string{
"--model", model, "--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx), "--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--gqa", fmt.Sprintf("%d", opts.NumGQA),
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase), "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale), "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU), "--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
"--embedding", "--embedding",
} }
if opts.NumGQA > 0 {
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
}
if len(adapters) > 0 { if len(adapters) > 0 {
// TODO: applying multiple adapters is not supported by the llama.cpp server yet // TODO: applying multiple adapters is not supported by the llama.cpp server yet
params = append(params, "--lora", adapters[0]) params = append(params, "--lora", adapters[0])
@@ -281,7 +288,12 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
} }
// start the llama.cpp server with a retry in case the port is already in use // start the llama.cpp server with a retry in case the port is already in use
for try := 0; try < 3; try++ { for _, runner := range runners {
if _, err := os.Stat(runner.Path); err != nil {
log.Printf("llama runner not found: %v", err)
continue
}
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext( cmd := exec.CommandContext(
@@ -289,67 +301,70 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
runner.Path, runner.Path,
append(params, "--port", strconv.Itoa(port))..., append(params, "--port", strconv.Itoa(port))...,
) )
cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
cmd.Stdout = os.Stderr cmd.Stdout = os.Stderr
cmd.Stderr = os.Stderr cmd.Stderr = os.Stderr
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}} llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
log.Print("starting llama runner")
if err := llm.Cmd.Start(); err != nil {
log.Printf("error starting the external llama runner: %v", err)
continue
}
// monitor the command, it is blocking, so if it exits we need to capture that
go func() {
err := llm.Cmd.Wait() // this will block until the command exits
if err != nil {
log.Printf("llama runner exited with error: %v", err)
} else {
log.Printf("llama runner exited")
}
}()
if err := waitForServer(llm); err != nil { if err := waitForServer(llm); err != nil {
log.Printf("error starting llama.cpp server: %v", err) log.Printf("error starting llama runner: %v", err)
llm.Close() llm.Close()
// try again // try again
continue continue
} }
// server started successfully // server started successfully
return llm, nil return llm, nil
} }
return nil, fmt.Errorf("max retry exceeded starting llama.cpp") return nil, fmt.Errorf("failed to start a llama runner")
} }
func waitForServer(llm *llama) error { func waitForServer(llm *llama) error {
log.Print("starting llama.cpp server")
var stderr bytes.Buffer
llm.Cmd.Stderr = &stderr
err := llm.Cmd.Start()
if err != nil {
return fmt.Errorf("error starting the external llama.cpp server: %w", err)
}
exitChan := make(chan error, 1)
// the server is a long running process, watch for it exiting to keep track of something going wrong
go func() {
err := llm.Cmd.Wait()
log.Print(stderr.String())
exitChan <- err
}()
// wait for the server to start responding // wait for the server to start responding
start := time.Now() start := time.Now()
expiresAt := time.Now().Add(30 * time.Second) expiresAt := time.Now().Add(2 * time.Minute) // be generous with timeout, large models can take a while to load
ticker := time.NewTicker(100 * time.Millisecond) ticker := time.NewTicker(200 * time.Millisecond)
log.Print("waiting for llama.cpp server to start responding") log.Print("waiting for llama runner to start responding")
for range ticker.C {
if time.Now().After(expiresAt) {
return fmt.Errorf("llama runner did not start within alloted time, retrying")
}
for { // check if the server process has terminated
select { if llm.Cmd.ProcessState != nil && llm.Cmd.ProcessState.Exited() {
case <-ticker.C: return fmt.Errorf("llama runner process has terminated")
if time.Now().After(expiresAt) { }
return fmt.Errorf("llama.cpp server did not start responding within 30 seconds, retrying")
} if err := llm.Ping(context.Background()); err == nil {
if err := llm.Ping(context.Background()); err == nil { break
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
return nil
}
case err := <-exitChan:
return fmt.Errorf("llama.cpp server exited unexpectedly: %w", err)
} }
} }
log.Printf("llama runner started in %f seconds", time.Since(start).Seconds())
return nil
} }
func (llm *llama) Close() { func (llm *llama) Close() {
llm.Running.Cmd.Cancel() llm.Cancel()
} }
func (llm *llama) SetOptions(opts api.Options) { func (llm *llama) SetOptions(opts api.Options) {
@@ -676,7 +691,7 @@ func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error
// Ping checks that the server subprocess is still running and responding to requests // Ping checks that the server subprocess is still running and responding to requests
func (llm *llama) Ping(ctx context.Context) error { func (llm *llama) Ping(ctx context.Context) error {
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Running.Port)) resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port))
if err != nil { if err != nil {
return fmt.Errorf("ping resp: %w", err) return fmt.Errorf("ping resp: %w", err)
} }
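The NumGPU heuristic introduced here estimates how many layers can be offloaded by treating fileSize/numLayers as the per-layer memory cost and dividing the VRAM reported by nvidia-smi by that figure. A standalone sketch of just the arithmetic, with made-up numbers (the real function also special-cases macOS, where it returns 1 to enable Metal, and returns 0 when nvidia-smi is unavailable):

package main

import "fmt"

// layersThatFit mirrors the heuristic in NumGPU: approximate one layer's
// memory cost as fileSize/numLayers and offload as many layers as the
// reported VRAM can hold.
func layersThatFit(numLayers, fileSizeBytes, vramMiB int64) int {
    if numLayers <= 0 || fileSizeBytes <= 0 {
        return 0
    }
    totalVRAMBytes := vramMiB * 1024 * 1024 // 1 MiB = 1024^2 bytes
    bytesPerLayer := fileSizeBytes / numLayers
    return int(totalVRAMBytes / bytesPerLayer)
}

func main() {
    // e.g. a ~3.6 GiB 7B Q4_0 file with 32 layers on an 8 GiB GPU
    fmt.Println(layersThatFit(32, 3825819904, 8192)) // 71: more than the 32 layers in the file, so all of them fit
}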

View File

@@ -21,7 +21,7 @@ type LLM interface {
Ping(context.Context) error Ping(context.Context) error
} }
func New(model string, adapters []string, opts api.Options) (LLM, error) { func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil { if _, err := os.Stat(model); err != nil {
return nil, err return nil, err
} }
@@ -32,15 +32,22 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
} }
defer f.Close() defer f.Close()
ggml, err := DecodeGGML(f, ModelFamilyLlama) ggml, err := DecodeGGML(f)
if err != nil { if err != nil {
return nil, err return nil, err
} }
switch ggml.FileType().String() { switch ggml.FileType() {
case "F32", "Q5_0", "Q5_1", "Q8_0": case "Q8_0":
if ggml.Name() != "gguf" && opts.NumGPU != 0 {
// GGML Q8_0 do not support Metal API and will
// cause the runner to segmentation fault so disable GPU
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
opts.NumGPU = 0
}
case "F32", "Q5_0", "Q5_1":
if opts.NumGPU != 0 { if opts.NumGPU != 0 {
// F32, F16, Q5_0, Q5_1, and Q8_0 do not support Metal API and will // F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
// cause the runner to segmentation fault so disable GPU // cause the runner to segmentation fault so disable GPU
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0") log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
opts.NumGPU = 0 opts.NumGPU = 0
@@ -49,35 +56,44 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
totalResidentMemory := memory.TotalMemory() totalResidentMemory := memory.TotalMemory()
switch ggml.ModelType() { switch ggml.ModelType() {
case ModelType3B, ModelType7B: case "3B", "7B":
if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 { if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 16GB of memory") return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
} else if totalResidentMemory < 8*1024*1024 { } else if totalResidentMemory < 8*1024*1024 {
return nil, fmt.Errorf("model requires at least 8GB of memory") return nil, fmt.Errorf("model requires at least 8GB of memory")
} }
case ModelType13B: case "13B":
if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 { if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 32GB of memory") return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
} else if totalResidentMemory < 16*1024*1024 { } else if totalResidentMemory < 16*1024*1024 {
return nil, fmt.Errorf("model requires at least 16GB of memory") return nil, fmt.Errorf("model requires at least 16GB of memory")
} }
case ModelType30B, ModelType34B: case "30B", "34B", "40B":
if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 { if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 64GB of memory") return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
} else if totalResidentMemory < 32*1024*1024 { } else if totalResidentMemory < 32*1024*1024 {
return nil, fmt.Errorf("model requires at least 32GB of memory") return nil, fmt.Errorf("model requires at least 32GB of memory")
} }
case ModelType65B: case "65B", "70B":
if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 { if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 128GB of memory") return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
} else if totalResidentMemory < 64*1024*1024 { } else if totalResidentMemory < 64*1024*1024 {
return nil, fmt.Errorf("model requires at least 64GB of memory") return nil, fmt.Errorf("model requires at least 64GB of memory")
} }
case "180B":
if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
} else if totalResidentMemory < 128*1024*1024 {
return nil, fmt.Errorf("model requires at least 128GB of memory")
}
} }
switch ggml.ModelFamily() { switch ggml.Name() {
case ModelFamilyLlama: case "gguf":
return newLlama(model, adapters, ggmlRunner(), opts) opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
case "ggml", "ggmf", "ggjt", "ggla":
return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
default: default:
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily()) return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
} }
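The memory guardrails in New boil down to a lookup keyed by the reported model size, with F16 weights needing roughly double the quantized requirement. A summary of the thresholds named in the error messages above, encoded as data purely for reference:

package main

import "fmt"

// Minimum host memory called out by llm.New's error messages, keyed by the
// reported model size: {quantized GB, F16 GB}.
var minMemoryGB = map[string][2]int{
    "3B":   {8, 16},
    "7B":   {8, 16},
    "13B":  {16, 32},
    "30B":  {32, 64},
    "34B":  {32, 64},
    "40B":  {32, 64},
    "65B":  {64, 128},
    "70B":  {64, 128},
    "180B": {128, 512},
}

func main() {
    req := minMemoryGB["13B"]
    fmt.Printf("13B: %dGB quantized, %dGB F16\n", req[0], req[1])
}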

View File

@@ -8,7 +8,7 @@ GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
# build universal binary # build universal binary
GOARCH=arm64 go generate ./... GOARCH=arm64 go generate ./...
GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64 GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
rm -rf llm/llama.cpp/ggml/build/*/bin rm -rf llm/llama.cpp/*/build/*/bin
GOARCH=amd64 go generate ./... GOARCH=amd64 go generate ./...
GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64 GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64 lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64

12
scripts/build_linux.sh Executable file
View File

@@ -0,0 +1,12 @@
#!/bin/bash
set -e
mkdir -p dist
for ARCH in arm64 amd64; do
docker buildx build --platform=linux/$ARCH -f Dockerfile.build . -t builder:$ARCH --load
docker create --platform linux/$ARCH --name builder builder:$ARCH
docker cp builder:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$ARCH
docker rm builder
done

227
scripts/install.sh Normal file
View File

@@ -0,0 +1,227 @@
#!/bin/sh
# This script installs Ollama on Linux.
# It detects the current operating system architecture and installs the appropriate version of Ollama.
set -eu
status() { echo ">>> $*" >&2; }
error() { echo "ERROR $*"; exit 1; }
warning() { echo "WARNING: $*"; }
TEMP_DIR=$(mktemp -d)
cleanup() { rm -rf $TEMP_DIR; }
trap cleanup EXIT
available() { command -v $1 >/dev/null; }
require() {
local MISSING=''
for TOOL in $*; do
if ! available $TOOL; then
MISSING="$MISSING $TOOL"
fi
done
echo $MISSING
}
[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
case "$(uname -m)" in
x86_64) ARCH="amd64" ;;
aarch64|arm64) ARCH="arm64" ;;
*) error "Unsupported architecture: $ARCH" ;;
esac
SUDO=
if [ "$(id -u)" -ne 0 ]; then
# Not running as root; sudo is required to install
if ! available sudo; then
error "This script requires superuser permissions. Please re-run as root."
fi
SUDO="sudo"
fi
NEEDS=$(require curl awk grep sed tee xargs)
if [ -n "$NEEDS" ]; then
status "ERROR: The following tools are required but missing:"
for NEED in $NEEDS; do
echo " - $NEED"
done
exit 1
fi
status "Downloading ollama..."
$SUDO curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.ai/download/ollama-linux-$ARCH"
status "Installing ollama to /usr/bin..."
$SUDO install -o0 -g0 -m755 -d /usr/bin
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama /usr/bin/ollama
install_success() { status 'Install complete. Run "ollama" from the command line.'; }
trap install_success EXIT
# Everything from this point onwards is optional.
configure_systemd() {
if ! id ollama >/dev/null 2>&1; then
status "Creating ollama user..."
$SUDO useradd -r -s /bin/false -m -d /usr/share/ollama ollama
fi
status "Creating ollama systemd service..."
cat <<EOF | $SUDO tee /etc/systemd/system/ollama.service >/dev/null
[Unit]
Description=Ollama Service
After=network-online.target
[Service]
ExecStart=/usr/bin/ollama serve
User=ollama
Group=ollama
Restart=always
RestartSec=3
Environment="HOME=/usr/share/ollama"
[Install]
WantedBy=default.target
EOF
SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
case $SYSTEMCTL_RUNNING in
running|degraded)
status "Enabling and starting ollama service..."
$SUDO systemctl daemon-reload
$SUDO systemctl enable ollama
$SUDO systemctl restart ollama
;;
esac
}
if available systemctl; then
configure_systemd
fi
if ! available lspci && ! available lshw; then
warning "Unable to detect NVIDIA GPU. Install lspci or lshw to automatically detect and install NVIDIA CUDA drivers."
exit 0
fi
check_gpu() {
case $1 in
lspci) available lspci && lspci -d '10de:' | grep -q 'NVIDIA' || return 1 ;;
lshw) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
nvidia-smi) available nvidia-smi || return 1 ;;
esac
}
if ! check_gpu lspci && ! check_gpu lshw; then
warning "No NVIDIA GPU detected. Ollama will run in CPU-only mode."
exit 0
fi
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
install_cuda_driver_yum() {
status 'Installing NVIDIA repository...'
case $PACKAGE_MANAGER in
yum)
$SUDO $PACKAGE_MANAGER -y install yum-utils
$SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
;;
dnf)
$SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
;;
esac
case $1 in
rhel)
status 'Installing EPEL repository...'
# EPEL is required for third-party dependencies such as dkms and libvdpau
$SUDO $PACKAGE_MANAGER -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-$2.noarch.rpm || true
;;
esac
status 'Installing CUDA driver...'
if [ "$1" = 'centos' ] || [ "$1$2" = 'rhel7' ]; then
$SUDO $PACKAGE_MANAGER -y install nvidia-driver-latest-dkms
fi
$SUDO $PACKAGE_MANAGER -y install cuda-drivers
}
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#ubuntu
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
install_cuda_driver_apt() {
status 'Installing NVIDIA repository...'
curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
case $1 in
debian)
status 'Enabling contrib sources...'
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | sudo tee /etc/apt/sources.list.d/contrib.list > /dev/null
;;
esac
status 'Installing CUDA driver...'
$SUDO dpkg -i $TEMP_DIR/cuda-keyring.deb
$SUDO apt-get update
[ -n "$SUDO" ] && SUDO_E="$SUDO -E" || SUDO_E=
DEBIAN_FRONTEND=noninteractive $SUDO_E apt-get -y install cuda-drivers -q
}
if [ ! -f "/etc/os-release" ]; then
error "Unknown distribution. Skipping CUDA installation."
fi
. /etc/os-release
OS_NAME=$ID
OS_VERSION=$VERSION_ID
PACKAGE_MANAGER=
for PACKAGE_MANAGER in dnf yum apt-get; do
if available $PACKAGE_MANAGER; then
break
fi
done
if [ -z "$PACKAGE_MANAGER" ]; then
error "Unknown package manager. Skipping CUDA installation."
fi
if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $OS_VERSION ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) install_cuda_driver_yum $OS_NAME $OS_VERSION ;;
amzn) install_cuda_driver_yum 'fedora' '35' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
*) exit ;;
esac
fi
if ! lsmod | grep -q nvidia; then
KERNEL_RELEASE="$(uname -r)"
case $OS_NAME in
centos|rhel|rocky|fedora|amzn) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE kernel-headers-$KERNEL_RELEASE ;;
debian|ubuntu) $SUDO apt-get -y install linux-headers-$KERNEL_RELEASE ;;
*) exit ;;
esac
NVIDIA_CUDA_VERSION=$($SUDO dkms status | awk -F: '/added/ { print $1 }')
if [ -n "$NVIDIA_CUDA_VERSION" ]; then
$SUDO dkms install $NVIDIA_CUDA_VERSION
fi
if lsmod | grep -q nouveau; then
status "Removing nouveau..."
$SUDO rmmod nouveau
fi
$SUDO modprobe nvidia
fi

View File

@@ -14,7 +14,7 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"path" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"time" "time"
@@ -71,7 +71,7 @@ func (r AuthRedirect) URL() (*url.URL, error) {
return redirectURL, nil return redirectURL, nil
} }
func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *RegistryOptions) (string, error) { func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
redirectURL, err := redirData.URL() redirectURL, err := redirData.URL()
if err != nil { if err != nil {
return "", err return "", err
@@ -82,7 +82,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
return "", err return "", err
} }
keyPath := path.Join(home, ".ollama", "id_ed25519") keyPath := filepath.Join(home, ".ollama", "id_ed25519")
rawKey, err := os.ReadFile(keyPath) rawKey, err := os.ReadFile(keyPath)
if err != nil { if err != nil {
@@ -103,7 +103,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
headers := make(http.Header) headers := make(http.Header)
headers.Set("Authorization", sig) headers.Set("Authorization", sig)
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, regOpts) resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, nil)
if err != nil { if err != nil {
log.Printf("couldn't get token: %q", err) log.Printf("couldn't get token: %q", err)
} }
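Swapping path.Join for filepath.Join matters on Windows: package path always joins with forward slashes, while path/filepath uses the operating system's separator, so the key path under the user's home directory resolves correctly on every platform. A two-line illustration:

package main

import (
    "fmt"
    "path"
    "path/filepath"
)

func main() {
    // package path joins with '/' regardless of OS; filepath uses the
    // platform separator ('\' on Windows).
    fmt.Println(path.Join("home", ".ollama", "id_ed25519"))     // home/.ollama/id_ed25519 everywhere
    fmt.Println(filepath.Join("home", ".ollama", "id_ed25519")) // home\.ollama\id_ed25519 on Windows
}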

View File

@@ -8,7 +8,7 @@ import (
"log" "log"
"net/http" "net/http"
"os" "os"
"path" "path/filepath"
"strconv" "strconv"
"sync" "sync"
"time" "time"
@@ -173,7 +173,7 @@ func doDownload(ctx context.Context, opts downloadOpts, f *FileDownload) error {
return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body)) return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body))
} }
err = os.MkdirAll(path.Dir(f.FilePath), 0o700) err = os.MkdirAll(filepath.Dir(f.FilePath), 0o700)
if err != nil { if err != nil {
return fmt.Errorf("make blobs directory: %w", err) return fmt.Errorf("make blobs directory: %w", err)
} }

View File

@@ -14,7 +14,6 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"path"
"path/filepath" "path/filepath"
"reflect" "reflect"
"runtime" "runtime"
@@ -114,10 +113,11 @@ type LayerReader struct {
} }
type ConfigV2 struct { type ConfigV2 struct {
ModelFamily llm.ModelFamily `json:"model_family"` ModelFormat string `json:"model_format"`
ModelType string `json:"model_type"` ModelFamily string `json:"model_family"`
FileType string `json:"file_type"` ModelType string `json:"model_type"`
RootFS RootFS `json:"rootfs"` FileType string `json:"file_type"`
RootFS RootFS `json:"rootfs"`
// required by spec // required by spec
Architecture string `json:"architecture"` Architecture string `json:"architecture"`
@@ -267,7 +267,30 @@ func filenameWithPath(path, f string) (string, error) {
return f, nil return f, nil
} }
func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error { func CreateModel(ctx context.Context, workDir, name string, path string, fn func(resp api.ProgressResponse)) error {
mp := ParseModelPath(name)
var manifest *ManifestV2
var err error
var noprune string
// build deleteMap to prune unused layers
deleteMap := make(map[string]bool)
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
if manifest != nil {
for _, l := range manifest.Layers {
deleteMap[l.Digest] = true
}
deleteMap[manifest.Config.Digest] = true
}
}
mf, err := os.Open(path) mf, err := os.Open(path)
if err != nil { if err != nil {
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)}) fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)})
@@ -328,14 +351,15 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
} }
defer file.Close() defer file.Close()
ggml, err := llm.DecodeGGML(file, llm.ModelFamilyLlama) ggml, err := llm.DecodeGGML(file)
if err != nil { if err != nil {
return err return err
} }
config.ModelFormat = ggml.Name()
config.ModelFamily = ggml.ModelFamily() config.ModelFamily = ggml.ModelFamily()
config.ModelType = ggml.ModelType().String() config.ModelType = ggml.ModelType()
config.FileType = ggml.FileType().String() config.FileType = ggml.FileType()
// reset the file // reset the file
file.Seek(0, io.SeekStart) file.Seek(0, io.SeekStart)
@@ -366,9 +390,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err return err
} }
// copie the model metadata // copy the model metadata
config.ModelFamily = source.ModelFamily config.ModelFamily = source.ModelFamily
config.ModelType = source.ModelType config.ModelType = source.ModelType
config.ModelFormat = source.ModelFormat
config.FileType = source.FileType config.FileType = source.FileType
for _, l := range mf.Layers { for _, l := range mf.Layers {
@@ -435,8 +460,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err return err
} }
layer.MediaType = mediaType if layer.Size > 0 {
layers = append(layers, layer) layer.MediaType = mediaType
layers = append(layers, layer)
}
case "template", "system", "prompt": case "template", "system", "prompt":
fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)}) fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)})
// remove the layer if one exists // remove the layer if one exists
@@ -448,8 +475,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err return err
} }
layer.MediaType = mediaType if layer.Size > 0 {
layers = append(layers, layer) layer.MediaType = mediaType
layers = append(layers, layer)
}
default: default:
// runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences) // runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences)
params[c.Name] = append(params[c.Name], c.Args) params[c.Name] = append(params[c.Name], c.Args)
@@ -472,6 +501,12 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
} }
} }
if config.ModelType == "65B" {
if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
config.ModelType = "70B"
}
}
bts, err := json.Marshal(formattedParams) bts, err := json.Marshal(formattedParams)
if err != nil { if err != nil {
return err return err
@@ -489,7 +524,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
} }
// generate the embedding layers // generate the embedding layers
embeddingLayers, err := embeddingLayers(embed) embeddingLayers, err := embeddingLayers(workDir, embed)
if err != nil { if err != nil {
return err return err
} }
@@ -503,6 +538,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
var manifestLayers []*Layer var manifestLayers []*Layer
for _, l := range layers { for _, l := range layers {
manifestLayers = append(manifestLayers, &l.Layer) manifestLayers = append(manifestLayers, &l.Layer)
delete(deleteMap, l.Layer.Digest)
} }
// Create a layer for the config object // Create a layer for the config object
@@ -512,6 +548,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err
}
layers = append(layers, cfg)
+ delete(deleteMap, cfg.Layer.Digest)
if err := SaveLayers(layers, fn, false); err != nil {
return err
@@ -524,6 +561,14 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err
}
+ if noprune == "" {
+ fn(api.ProgressResponse{Status: "removing any unused layers"})
+ err = deleteUnusedLayers(nil, deleteMap, false)
+ if err != nil {
+ return err
+ }
+ }
fn(api.ProgressResponse{Status: "success"})
return nil
}
@@ -536,7 +581,7 @@ type EmbeddingParams struct {
}
// embeddingLayers loads the associated LLM and generates the embeddings to be stored from an input file
- func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
+ func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error) {
layers := []*LayerReader{}
if len(e.files) > 0 {
// check if the model is a file path or a model name
@@ -549,7 +594,7 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
model = &Model{ModelPath: e.model}
}
- if err := load(context.Background(), model, e.opts, defaultSessionDuration); err != nil {
+ if err := load(context.Background(), workDir, model, e.opts, defaultSessionDuration); err != nil {
return nil, fmt.Errorf("load model to generate embeddings: %v", err)
}
@@ -779,14 +824,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
return nil, fmt.Errorf("invalid float value %s", vals) return nil, fmt.Errorf("invalid float value %s", vals)
} }
out[key] = floatVal out[key] = float32(floatVal)
case reflect.Int: case reflect.Int:
intVal, err := strconv.ParseInt(vals[0], 10, 0) intVal, err := strconv.ParseInt(vals[0], 10, 64)
if err != nil { if err != nil {
return nil, fmt.Errorf("invalid int value %s", vals) return nil, fmt.Errorf("invalid int value %s", vals)
} }
out[key] = intVal out[key] = int(intVal)
case reflect.Bool: case reflect.Bool:
boolVal, err := strconv.ParseBool(vals[0]) boolVal, err := strconv.ParseBool(vals[0])
if err != nil { if err != nil {
@@ -866,18 +911,7 @@ func CopyModel(src, dest string) error {
return nil
}
- func DeleteModel(name string) error {
- mp := ParseModelPath(name)
- manifest, _, err := GetManifest(mp)
- if err != nil {
- return err
- }
- deleteMap := make(map[string]bool)
- for _, layer := range manifest.Layers {
- deleteMap[layer.Digest] = true
- }
- deleteMap[manifest.Config.Digest] = true
+ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dryRun bool) error {
fp, err := GetManifestPath()
if err != nil {
return err
@@ -894,14 +928,13 @@ func DeleteModel(name string) error {
fmp := ParseModelPath(tag)
// skip the manifest we're trying to delete
- if mp.GetFullTagname() == fmp.GetFullTagname() {
+ if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
return nil
}
// save (i.e. delete from the deleteMap) any files used in other manifests
manifest, _, err := GetManifest(fmp)
if err != nil {
- log.Printf("skipping file: %s", fp)
return nil
}
@@ -925,14 +958,72 @@ func DeleteModel(name string) error {
log.Printf("couldn't get file path for '%s': %v", k, err) log.Printf("couldn't get file path for '%s': %v", k, err)
continue continue
} }
if err := os.Remove(fp); err != nil { if !dryRun {
log.Printf("couldn't remove file '%s': %v", fp, err) if err := os.Remove(fp); err != nil {
continue log.Printf("couldn't remove file '%s': %v", fp, err)
continue
}
} else {
log.Printf("wanted to remove: %s", fp)
} }
} }
} }
fp, err = mp.GetManifestPath(false) return nil
}
func PruneLayers() error {
deleteMap := make(map[string]bool)
p, err := GetBlobsPath("")
if err != nil {
return err
}
blobs, err := os.ReadDir(p)
if err != nil {
log.Printf("couldn't read dir '%s': %v", p, err)
return err
}
for _, blob := range blobs {
name := blob.Name()
if runtime.GOOS == "windows" {
name = strings.ReplaceAll(name, "-", ":")
}
deleteMap[name] = true
}
log.Printf("total blobs: %d", len(deleteMap))
err = deleteUnusedLayers(nil, deleteMap, false)
if err != nil {
return err
}
log.Printf("total unused blobs removed: %d", len(deleteMap))
return nil
}
func DeleteModel(name string) error {
mp := ParseModelPath(name)
manifest, _, err := GetManifest(mp)
if err != nil {
return err
}
deleteMap := make(map[string]bool)
for _, layer := range manifest.Layers {
deleteMap[layer.Digest] = true
}
deleteMap[manifest.Config.Digest] = true
err = deleteUnusedLayers(&mp, deleteMap, false)
if err != nil {
return err
}
fp, err := mp.GetManifestPath(false)
if err != nil { if err != nil {
return err return err
} }
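For context, a hedged sketch of how the refactored pieces above could be driven from a hypothetical caller (not part of this diff): DeleteModel now only seeds the deleteMap for a single model and hands the filesystem walk to deleteUnusedLayers, while PruneLayers seeds it from every blob on disk. Assuming these exported functions live in the repository's server package:

package main

import (
    "log"
    "os"

    "github.com/jmorganca/ollama/server"
)

func main() {
    // Unless pruning is disabled, remove any blobs no manifest references.
    if os.Getenv("OLLAMA_NOPRUNE") == "" {
        if err := server.PruneLayers(); err != nil {
            log.Printf("prune failed: %v", err)
        }
    }

    // Deleting a model reuses the same walk via deleteUnusedLayers.
    if err := server.DeleteModel("llama2:latest"); err != nil { // model name is illustrative
        log.Printf("delete failed: %v", err)
    }
}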
@@ -1063,14 +1154,14 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
Total: layer.Size,
})
- location, err := startUpload(ctx, mp, layer, regOpts)
+ location, chunkSize, err := startUpload(ctx, mp, layer, regOpts)
if err != nil {
log.Printf("couldn't start upload: %v", err)
return err
}
- if strings.HasPrefix(path.Base(location.Path), "sha256:") {
- layer.Digest = path.Base(location.Path)
+ if strings.HasPrefix(filepath.Base(location.Path), "sha256:") {
+ layer.Digest = filepath.Base(location.Path)
fn(api.ProgressResponse{
Status: "using existing layer",
Digest: layer.Digest,
@@ -1080,7 +1171,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
continue
}
- if err := uploadBlobChunked(ctx, location, layer, regOpts, fn); err != nil {
+ if err := uploadBlob(ctx, location, layer, chunkSize, regOpts, fn); err != nil {
log.Printf("error uploading blob: %v", err)
return err
}
@@ -1111,13 +1202,34 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name)
+ var manifest *ManifestV2
+ var err error
+ var noprune string
+ // build deleteMap to prune unused layers
+ deleteMap := make(map[string]bool)
+ if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+ manifest, _, err = GetManifest(mp)
+ if err != nil && !errors.Is(err, os.ErrNotExist) {
+ return err
+ }
+ if manifest != nil {
+ for _, l := range manifest.Layers {
+ deleteMap[l.Digest] = true
+ }
+ deleteMap[manifest.Config.Digest] = true
+ }
+ }
if mp.ProtocolScheme == "http" && !regOpts.Insecure {
return fmt.Errorf("insecure protocol http")
}
fn(api.ProgressResponse{Status: "pulling manifest"})
- manifest, err := pullModelManifest(ctx, mp, regOpts)
+ manifest, err = pullModelManifest(ctx, mp, regOpts)
if err != nil {
return fmt.Errorf("pull model manifest: %s", err)
}
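The block above records the digests of any already-installed manifest before pulling; whatever is still left in deleteMap after the new layers are accounted for is what the prune pass removes. A self-contained sketch of that set arithmetic (simplified types and illustrative digests, not the repository's code):

package main

import "fmt"

type manifest struct {
    Layers []string // layer digests
    Config string   // config digest
}

// prunable returns the digests referenced by the previous manifest but not by
// the newly pulled one.
func prunable(old, pulled manifest) []string {
    deleteMap := map[string]bool{}
    for _, d := range old.Layers {
        deleteMap[d] = true
    }
    deleteMap[old.Config] = true

    // anything the new manifest still uses is kept
    for _, d := range pulled.Layers {
        delete(deleteMap, d)
    }
    delete(deleteMap, pulled.Config)

    var out []string
    for d := range deleteMap {
        out = append(out, d)
    }
    return out
}

func main() {
    old := manifest{Layers: []string{"sha256:a", "sha256:b"}, Config: "sha256:c"}
    pulled := manifest{Layers: []string{"sha256:a", "sha256:d"}, Config: "sha256:e"}
    fmt.Println(prunable(old, pulled)) // [sha256:b sha256:c], order may vary
}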
@@ -1137,7 +1249,9 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
}); err != nil {
return err
}
+ delete(deleteMap, layer.Digest)
}
+ delete(deleteMap, manifest.Config.Digest)
fn(api.ProgressResponse{Status: "verifying sha256 digest"})
for _, layer := range layers {
@@ -1175,6 +1289,14 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return err
}
+ if noprune == "" {
+ fn(api.ProgressResponse{Status: "removing any unused layers"})
+ err = deleteUnusedLayers(nil, deleteMap, false)
+ if err != nil {
+ return err
+ }
+ }
fn(api.ProgressResponse{Status: "success"})
return nil
@@ -1275,7 +1397,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
- token, err := getAuthToken(ctx, authRedir, regOpts)
+ token, err := getAuthToken(ctx, authRedir)
if err != nil {
return nil, err
}
@@ -1300,7 +1422,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
}
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
- if requestURL.Scheme != "http" && regOpts.Insecure {
+ if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
requestURL.Scheme = "http"
}
@@ -1313,14 +1435,25 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
req.Header = headers
}
- if regOpts.Token != "" {
- req.Header.Set("Authorization", "Bearer "+regOpts.Token)
- } else if regOpts.Username != "" && regOpts.Password != "" {
- req.SetBasicAuth(regOpts.Username, regOpts.Password)
+ if regOpts != nil {
+ if regOpts.Token != "" {
+ req.Header.Set("Authorization", "Bearer "+regOpts.Token)
+ } else if regOpts.Username != "" && regOpts.Password != "" {
+ req.SetBasicAuth(regOpts.Username, regOpts.Password)
+ }
}
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
+ if s := req.Header.Get("Content-Length"); s != "" {
+ contentLength, err := strconv.ParseInt(s, 10, 64)
+ if err != nil {
+ return nil, err
+ }
+ req.ContentLength = contentLength
+ }
client := &http.Client{
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
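One reason for the Content-Length handling above: net/http only infers ContentLength for a few concrete body types (*bytes.Buffer, *bytes.Reader, *strings.Reader), so a section reader used for chunked uploads would otherwise be sent with chunked transfer encoding. A minimal standalone sketch of mirroring the header into the request (illustrative URL and sizes):

package main

import (
    "fmt"
    "io"
    "net/http"
    "strconv"
    "strings"
)

func main() {
    // An io.SectionReader is not one of the types net/http special-cases,
    // so req.ContentLength stays 0 unless it is set explicitly.
    body := io.NewSectionReader(strings.NewReader("hello"), 0, 5)
    req, err := http.NewRequest(http.MethodPatch, "http://registry.example/upload", body)
    if err != nil {
        panic(err)
    }
    req.Header.Set("Content-Length", strconv.Itoa(5))

    // Copy the header into the request, as makeRequest does above.
    if s := req.Header.Get("Content-Length"); s != "" {
        n, err := strconv.ParseInt(s, 10, 64)
        if err != nil {
            panic(err)
        }
        req.ContentLength = n
    }
    fmt.Println("ContentLength:", req.ContentLength)
}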

View File

@@ -133,7 +133,12 @@ func GetBlobsPath(digest string) (string, error) {
}
path := filepath.Join(home, ".ollama", "models", "blobs", digest)
- if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+ dirPath := filepath.Dir(path)
+ if digest == "" {
+ dirPath = path
+ }
+ if err := os.MkdirAll(dirPath, 0o755); err != nil {
return "", err
}

View File

@@ -12,6 +12,7 @@ import (
"os/signal" "os/signal"
"path/filepath" "path/filepath"
"reflect" "reflect"
"runtime"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -57,7 +58,7 @@ var loaded struct {
var defaultSessionDuration = 5 * time.Minute
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
- func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
+ func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
opts := api.DefaultOptions()
if err := opts.FromMap(model.Options); err != nil {
log.Printf("could not load model options: %v", err)
@@ -93,7 +94,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
loaded.Embeddings = model.Embeddings
}
- llmModel, err := llm.New(model.ModelPath, model.AdapterPaths, opts)
+ llmModel, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, opts)
if err != nil {
return err
}
@@ -129,6 +130,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
llmModel.SetOptions(opts)
}
}
loaded.expireAt = time.Now().Add(sessionDuration)
if loaded.expireTimer == nil {
@@ -149,6 +151,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
loaded.digest = ""
})
}
loaded.expireTimer.Reset(sessionDuration)
return nil
}
@@ -171,8 +174,11 @@ func GenerateHandler(c *gin.Context) {
return
}
- sessionDuration := defaultSessionDuration // TODO: set this duration from the request if specified
- if err := load(c.Request.Context(), model, req.Options, sessionDuration); err != nil {
+ workDir := c.GetString("workDir")
+ // TODO: set this duration from the request if specified
+ sessionDuration := defaultSessionDuration
+ if err := load(c.Request.Context(), workDir, model, req.Options, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -217,8 +223,13 @@ func GenerateHandler(c *gin.Context) {
ch <- r
}
- if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
- ch <- gin.H{"error": err.Error()}
+ // an empty request loads the model
+ if req.Prompt == "" && req.Template == "" && req.System == "" {
+ ch <- api.GenerateResponse{Model: req.Model, Done: true}
+ } else {
+ if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
+ ch <- gin.H{"error": err.Error()}
+ }
}
}()
@@ -240,7 +251,9 @@ func EmbeddingHandler(c *gin.Context) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
- if err := load(c.Request.Context(), model, req.Options, 5*time.Minute); err != nil {
+ workDir := c.GetString("workDir")
+ if err := load(c.Request.Context(), workDir, model, req.Options, 5*time.Minute); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
@@ -330,6 +343,8 @@ func CreateModelHandler(c *gin.Context) {
return
}
+ workDir := c.GetString("workDir")
ch := make(chan any)
go func() {
defer close(ch)
@@ -340,7 +355,7 @@ func CreateModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
- if err := CreateModel(ctx, req.Name, req.Path, fn); err != nil {
+ if err := CreateModel(ctx, workDir, req.Name, req.Path, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -363,6 +378,7 @@ func DeleteModelHandler(c *gin.Context) {
}
return
}
+ c.JSON(http.StatusOK, nil)
}
func ShowModelHandler(c *gin.Context) {
@@ -493,33 +509,40 @@ func CopyModelHandler(c *gin.Context) {
}
}
- func Serve(ln net.Listener, origins []string) error {
+ var defaultAllowOrigins = []string{
+ "localhost",
+ "127.0.0.1",
+ "0.0.0.0",
+ }
+ func Serve(ln net.Listener, allowOrigins []string) error {
config := cors.DefaultConfig()
config.AllowWildcard = true
- config.AllowOrigins = append(origins, []string{
- "http://localhost",
- "http://localhost:*",
- "https://localhost",
- "https://localhost:*",
- "http://127.0.0.1",
- "http://127.0.0.1:*",
- "https://127.0.0.1",
- "https://127.0.0.1:*",
- "http://0.0.0.0",
- "http://0.0.0.0:*",
- "https://0.0.0.0",
- "https://0.0.0.0:*",
- }...)
+ config.AllowOrigins = allowOrigins
+ for _, allowOrigin := range defaultAllowOrigins {
+ config.AllowOrigins = append(config.AllowOrigins,
+ fmt.Sprintf("http://%s", allowOrigin),
+ fmt.Sprintf("https://%s", allowOrigin),
+ fmt.Sprintf("http://%s:*", allowOrigin),
+ fmt.Sprintf("https://%s:*", allowOrigin),
+ )
+ }
+ workDir, err := os.MkdirTemp("", "ollama")
+ if err != nil {
+ return err
+ }
+ defer os.RemoveAll(workDir)
r := gin.Default()
- r.Use(cors.New(config))
- r.GET("/", func(c *gin.Context) {
- c.String(http.StatusOK, "Ollama is running")
- })
- r.HEAD("/", func(c *gin.Context) {
- c.Status(http.StatusOK)
- })
+ r.Use(
+ cors.New(config),
+ func(c *gin.Context) {
+ c.Set("workDir", workDir)
+ c.Next()
+ },
+ )
r.POST("/api/pull", PullModelHandler)
r.POST("/api/generate", GenerateHandler)
@@ -527,10 +550,17 @@ func Serve(ln net.Listener, origins []string) error {
r.POST("/api/create", CreateModelHandler) r.POST("/api/create", CreateModelHandler)
r.POST("/api/push", PushModelHandler) r.POST("/api/push", PushModelHandler)
r.POST("/api/copy", CopyModelHandler) r.POST("/api/copy", CopyModelHandler)
r.GET("/api/tags", ListModelsHandler)
r.DELETE("/api/delete", DeleteModelHandler) r.DELETE("/api/delete", DeleteModelHandler)
r.POST("/api/show", ShowModelHandler) r.POST("/api/show", ShowModelHandler)
for _, method := range []string{http.MethodGet, http.MethodHead} {
r.Handle(method, "/", func(c *gin.Context) {
c.String(http.StatusOK, "Ollama is running")
})
r.Handle(method, "/api/tags", ListModelsHandler)
}
log.Printf("Listening on %s", ln.Addr()) log.Printf("Listening on %s", ln.Addr())
s := &http.Server{ s := &http.Server{
Handler: r, Handler: r,
@@ -538,15 +568,23 @@ func Serve(ln net.Listener, origins []string) error {
// listen for a ctrl+c and stop any loaded llm
signals := make(chan os.Signal, 1)
- signal.Notify(signals, syscall.SIGINT)
+ signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-signals
if loaded.llm != nil {
loaded.llm.Close()
}
+ os.RemoveAll(workDir)
os.Exit(0)
}()
+ if runtime.GOOS == "linux" {
+ // check compatibility to log warnings
+ if _, err := llm.CheckVRAM(); err != nil {
+ log.Printf("Warning: GPU support may not be enabled, check that you have installed GPU drivers: %v", err)
+ }
+ }
return s.Serve(ln)
}

View File

@@ -14,7 +14,12 @@ import (
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
) )
func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, error) { const (
redirectChunkSize = 1024 * 1024 * 1024
regularChunkSize = 95 * 1024 * 1024
)
func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, int64, error) {
requestURL := mp.BaseURL() requestURL := mp.BaseURL()
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/") requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/")
if layer.From != "" { if layer.From != "" {
@@ -27,20 +32,26 @@ func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *Regis
resp, err := makeRequestWithRetry(ctx, "POST", requestURL, nil, nil, regOpts)
if err != nil {
log.Printf("couldn't start upload: %v", err)
- return nil, err
+ return nil, 0, err
}
defer resp.Body.Close()
- // Extract UUID location from header
- location := resp.Header.Get("Location")
+ location := resp.Header.Get("Docker-Upload-Location")
+ chunkSize := redirectChunkSize
if location == "" {
- return nil, fmt.Errorf("location header is missing in response")
+ location = resp.Header.Get("Location")
+ chunkSize = regularChunkSize
}
- return url.Parse(location)
+ locationURL, err := url.Parse(location)
+ if err != nil {
+ return nil, 0, err
+ }
+ return locationURL, int64(chunkSize), nil
}
- func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
+ func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSize int64, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
// TODO allow resumability
// TODO allow canceling uploads via DELETE
@@ -55,8 +66,12 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
}
defer f.Close()
- // 95MB chunk size
- chunkSize := 95 * 1024 * 1024
+ pw := ProgressWriter{
+ status: fmt.Sprintf("uploading %s", layer.Digest),
+ digest: layer.Digest,
+ total: layer.Size,
+ fn: fn,
+ }
for offset := int64(0); offset < int64(layer.Size); {
chunk := int64(layer.Size) - offset
@@ -64,80 +79,27 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
chunk = int64(chunkSize)
}
- sectionReader := io.NewSectionReader(f, int64(offset), chunk)
- for try := 0; try < MaxRetries; try++ {
- r, w := io.Pipe()
- defer r.Close()
- go func() {
- defer w.Close()
- for chunked := int64(0); chunked < chunk; {
- n, err := io.CopyN(w, sectionReader, 1024*1024)
- if err != nil && !errors.Is(err, io.EOF) {
- fn(api.ProgressResponse{
- Status: fmt.Sprintf("error reading chunk: %v", err),
- Digest: layer.Digest,
- Total: layer.Size,
- Completed: int(offset),
- })
- return
- }
- chunked += n
- fn(api.ProgressResponse{
- Status: fmt.Sprintf("uploading %s", layer.Digest),
- Digest: layer.Digest,
- Total: layer.Size,
- Completed: int(offset) + int(chunked),
- })
- }
- }()
- headers := make(http.Header)
- headers.Set("Content-Type", "application/octet-stream")
- headers.Set("Content-Length", strconv.Itoa(int(chunk)))
- headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
- resp, err := makeRequest(ctx, "PATCH", requestURL, headers, r, regOpts)
- if err != nil && !errors.Is(err, io.EOF) {
- fn(api.ProgressResponse{
- Status: fmt.Sprintf("error uploading chunk: %v", err),
- Digest: layer.Digest,
- Total: layer.Size,
- Completed: int(offset),
- })
- return err
- }
- defer resp.Body.Close()
- switch {
- case resp.StatusCode == http.StatusUnauthorized:
- auth := resp.Header.Get("www-authenticate")
- authRedir := ParseAuthRedirectString(auth)
- token, err := getAuthToken(ctx, authRedir, regOpts)
- if err != nil {
- return err
- }
- regOpts.Token = token
- if _, err := sectionReader.Seek(0, io.SeekStart); err != nil {
- return err
- }
- continue
- case resp.StatusCode >= http.StatusBadRequest:
- body, _ := io.ReadAll(resp.Body)
- return fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
- }
- offset += sectionReader.Size()
- requestURL, err = url.Parse(resp.Header.Get("Location"))
- if err != nil {
- return err
- }
- break
+ resp, err := uploadBlobChunk(ctx, http.MethodPatch, requestURL, f, offset, chunk, regOpts, &pw)
+ if err != nil {
+ fn(api.ProgressResponse{
+ Status: fmt.Sprintf("error uploading chunk: %v", err),
+ Digest: layer.Digest,
+ Total: layer.Size,
+ Completed: int(offset),
+ })
+ return err
+ }
+ offset += chunk
+ location := resp.Header.Get("Docker-Upload-Location")
+ if location == "" {
+ location = resp.Header.Get("Location")
+ }
+ requestURL, err = url.Parse(location)
+ if err != nil {
+ return err
+ }
}
}
@@ -163,3 +125,90 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
}
return nil
}
+ func uploadBlobChunk(ctx context.Context, method string, requestURL *url.URL, r io.ReaderAt, offset, limit int64, opts *RegistryOptions, pw *ProgressWriter) (*http.Response, error) {
+ sectionReader := io.NewSectionReader(r, int64(offset), limit)
+ headers := make(http.Header)
+ headers.Set("Content-Type", "application/octet-stream")
+ headers.Set("Content-Length", strconv.Itoa(int(limit)))
+ headers.Set("X-Redirect-Uploads", "1")
+ if method == http.MethodPatch {
+ headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
+ }
+ for try := 0; try < MaxRetries; try++ {
+ resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sectionReader, pw), opts)
+ if err != nil && !errors.Is(err, io.EOF) {
+ return nil, err
+ }
+ defer resp.Body.Close()
+ switch {
+ case resp.StatusCode == http.StatusTemporaryRedirect:
+ location, err := resp.Location()
+ if err != nil {
+ return nil, err
+ }
+ pw.completed = int(offset)
+ if _, err := uploadBlobChunk(ctx, http.MethodPut, location, r, offset, limit, nil, pw); err != nil {
+ // retry
+ log.Printf("retrying redirected upload: %v", err)
+ continue
+ }
+ return resp, nil
+ case resp.StatusCode == http.StatusUnauthorized:
+ auth := resp.Header.Get("www-authenticate")
+ authRedir := ParseAuthRedirectString(auth)
+ token, err := getAuthToken(ctx, authRedir)
+ if err != nil {
+ return nil, err
+ }
+ opts.Token = token
+ pw.completed = int(offset)
+ sectionReader = io.NewSectionReader(r, offset, limit)
+ continue
+ case resp.StatusCode >= http.StatusBadRequest:
+ body, _ := io.ReadAll(resp.Body)
+ return nil, fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
+ }
+ return resp, nil
+ }
+ return nil, fmt.Errorf("max retries exceeded")
+ }
+ type ProgressWriter struct {
+ status string
+ digest string
+ bucket int
+ completed int
+ total int
+ fn func(api.ProgressResponse)
+ }
+ func (pw *ProgressWriter) Write(b []byte) (int, error) {
+ n := len(b)
+ pw.bucket += n
+ pw.completed += n
+ // throttle status updates to not spam the client
+ if pw.bucket >= 1024*1024 || pw.completed >= pw.total {
+ pw.fn(api.ProgressResponse{
+ Status: pw.status,
+ Digest: pw.digest,
+ Total: pw.total,
+ Completed: pw.completed,
+ })
+ pw.bucket = 0
+ }
+ return n, nil
+ }
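To round out the upload changes: ProgressWriter is designed to sit behind io.TeeReader, so every byte read from the request body also passes through Write, and a status callback fires at most roughly once per mebibyte plus once at completion. A self-contained sketch of that pattern (simplified fields, not the repository's code):

package main

import (
    "bytes"
    "fmt"
    "io"
)

type progressWriter struct {
    bucket, completed, total int
    fn                       func(completed, total int)
}

func (pw *progressWriter) Write(b []byte) (int, error) {
    n := len(b)
    pw.bucket += n
    pw.completed += n
    // throttle updates to roughly one per MiB, plus the final one
    if pw.bucket >= 1024*1024 || pw.completed >= pw.total {
        pw.fn(pw.completed, pw.total)
        pw.bucket = 0
    }
    return n, nil
}

func main() {
    payload := bytes.Repeat([]byte("x"), 3*1024*1024) // stand-in for a blob
    pw := &progressWriter{total: len(payload), fn: func(done, total int) {
        fmt.Printf("uploaded %d/%d bytes\n", done, total)
    }}

    // io.TeeReader reports progress as the "upload" consumes the body.
    body := io.TeeReader(bytes.NewReader(payload), pw)
    if _, err := io.Copy(io.Discard, body); err != nil { // stands in for the HTTP request
        panic(err)
    }
}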