Compare commits

125 Commits

matt/strea...api

| SHA1 |
| --- |
| 949fc4eafa |
| 0a4f21c0a7 |
| 9abb66254a |
| 1d0ebe67e8 |
| a1b2d95f96 |
| c0b1bf7537 |
| cdfeb165ca |
| 92d454ec5f |
| 9333b0cc82 |
| 9771b1ec51 |
| 76db4a49cf |
| 4aa0976a2e |
| 92c20fdae6 |
| c951da7096 |
| 24d82a23a2 |
| f40b3de758 |
| 5f4008c296 |
| 6ae33d8141 |
| c5664c1fef |
| 958a5a8184 |
| 8608eb4760 |
| a2b210130f |
| ed20837f9a |
| 1db2a61dd0 |
| 2ded8ab206 |
| e6b3648bbf |
| 0625e805f0 |
| c38ec5befb |
| c577721a43 |
| 29c056ea39 |
| 9fc3bba9cf |
| 7774ed4ae6 |
| 11f920f209 |
| 6e6b655956 |
| 110ae89a6c |
| 5e388f931e |
| d5ad41dd7b |
| d294a11bc9 |
| 93d887e4bc |
| 5306b0269d |
| 7de0c8345d |
| 1b9dcab3ab |
| 86279f4ae3 |
| b934bf23e6 |
| 2b8ef455ad |
| 0c5f47177c |
| 1210db2924 |
| d0854bf1e6 |
| 8396463255 |
| a027bbf4d7 |
| ed94a3dd02 |
| f14f62ab3b |
| 0fb5268496 |
| c65edb1506 |
| 1605af32ec |
| ee3032ad89 |
| 5b7a27281d |
| d2a784e33e |
| 413a2e4f91 |
| b5614f3ebc |
| 8b2ba9cab8 |
| e29662ab5c |
| cbc40aa996 |
| 5cb82540c9 |
| d7849a1dc9 |
| 01c44d687e |
| 9b12a511ca |
| e20362e0d5 |
| c928ceb927 |
| e1a0846483 |
| f997e29e45 |
| 87d9efb364 |
| 93d3a2568d |
| 5a81390b24 |
| a89ef99aed |
| dc0c725ceb |
| 5d71bda478 |
| 88897a90e4 |
| 9df31c3518 |
| 2044f9d4da |
| 0d186f3b33 |
| 82f5b66c01 |
| c986694367 |
| 058d0cd04b |
| ee1c994d15 |
| 4cba75efc5 |
| 8c83701e9f |
| 6137b12799 |
| 1fabba474b |
| 765770efdb |
| 9297ff8330 |
| ee4fd16f2c |
| a9ed7cc6aa |
| 6c6a31a1e8 |
| fc6ec356fc |
| 1255bc9b45 |
| 084e4c782a |
| 58ffa03d8b |
| 637f8bc6a5 |
| 499e9007a5 |
| b9bb5ca288 |
| 4e8be787c7 |
| aa45d7c1df |
| e35565c567 |
| a5520bfb42 |
| 2627c464ba |
| b58d5d16b0 |
| 24580df958 |
| 80dd44e80a |
| 94e1d96b29 |
| 66003e1d05 |
| c345053a8b |
| 08d7c2a944 |
| bc9573dcb1 |
| e53bc57d4d |
| f0b398d17f |
| 8efbc5df55 |
| ccc3e9ac6d |
| daa4f096f9 |
| 3ee85f1c6c |
| 2540c9181c |
| 83ffb154bc |
| 9aa192c812 |
| fc8707686f |
| e6881cabd0 |
@@ -1,5 +1,8 @@
.vscode
ollama
app
dist
scripts
llm/llama.cpp/ggml
llm/llama.cpp/gguf
.env
.gitmodules (1 change, vendored)
@@ -6,4 +6,5 @@
[submodule "llm/llama.cpp/gguf"]
path = llm/llama.cpp/gguf
url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty
shallow = true
Dockerfile (28 changes)
@@ -1,21 +1,23 @@
FROM golang:alpine
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04

ARG TARGETARCH
ARG VERSION=0.0.0
ARG GOFLAGS="'-ldflags=-w -s'"

WORKDIR /go/src/github.com/jmorganca/ollama
RUN apk add --no-cache git build-base cmake
RUN apt-get update && apt-get install -y git build-essential cmake
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz

COPY . .
RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .

FROM alpine
ENV OLLAMA_HOST 0.0.0.0
RUN apk add --no-cache libstdc++

ARG USER=ollama
ARG GROUP=ollama
RUN addgroup $GROUP && adduser -D -G $GROUP $USER
ENV GOARCH=$TARGETARCH
RUN /usr/local/go/bin/go generate ./... \
&& /usr/local/go/bin/go build .

FROM ubuntu:22.04
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

USER $USER:$GROUP
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
Dockerfile.build (32 changes, new file)
@@ -0,0 +1,32 @@
# centos7 amd64 dependencies
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-devel-centos7 AS base-amd64
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
yum update -y && \
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH

# centos8 arm64 dependencies
FROM --platform=linux/arm64 nvidia/cuda:11.4.3-devel-centos8 AS base-arm64
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y git cmake

FROM base-${TARGETARCH}
ARG TARGETARCH

# install go
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz

# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
ENV GOOS=linux
ENV GOARCH=$TARGETARCH

ARG VERSION=0.0.0
ARG GOFLAGS="'-ldflags -w -s'"

RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build .
@@ -1,22 +0,0 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

WORKDIR /go/src/github.com/jmorganca/ollama
RUN apt-get update && apt-get install -y git build-essential cmake
ADD https://dl.google.com/go/go1.21.1.linux-amd64.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz

COPY . .
RUN /usr/local/go/bin/go generate ./... && /usr/local/go/bin/go build -ldflags '-linkmode external -extldflags "-static"' .

FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
ENV OLLAMA_HOST 0.0.0.0

ARG USER=ollama
ARG GROUP=ollama
RUN groupadd $GROUP && useradd -m -g $GROUP $USER

COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

USER $USER:$GROUP
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
README.md (190 changes)
@@ -9,19 +9,27 @@

[![Discord](https://dcbadge.vercel.app/api/server/ollama?style=flat&compact=true)](https://discord.gg/ollama)

Run, create, and share large language models (LLMs).
Get up and running with large language models locally.

> Note: Ollama is in early preview. Please report any issues you find.
### macOS

## Download
[Download](https://ollama.ai/download/Ollama-darwin.zip)

- [Download](https://ollama.ai/download) for macOS
- Download for Windows and Linux (coming soon)
- Build [from source](#building)
### Linux & WSL2

```
curl https://ollama.ai/install.sh | sh
```

[Manual install instructions](https://github.com/jmorganca/ollama/blob/main/docs/linux.md)

### Windows

coming soon

## Quickstart

To run and chat with [Llama 2](https://ai.meta.com/llama), the new model by Meta:
To run and chat with [Llama 2](https://ollama.ai/library/llama2):

```
ollama run llama2
@@ -29,87 +37,50 @@ ollama run llama2

## Model library

Ollama supports a list of open-source models available on [ollama.ai/library](https://ollama.ai/library 'ollama model library')
Ollama supports a list of open-source models available on [ollama.ai/library](https://ollama.ai/library "ollama model library")

Here are some example open-source models that can be downloaded:

| Model | Parameters | Size | Download |
| ------------------------ | ---------- | ----- | ------------------------------- |
| Llama2 | 7B | 3.8GB | `ollama pull llama2` |
| Llama2 13B | 13B | 7.3GB | `ollama pull llama2:13b` |
| Llama2 70B | 70B | 39GB | `ollama pull llama2:70b` |
| Llama2 Uncensored | 7B | 3.8GB | `ollama pull llama2-uncensored` |
| Code Llama | 7B | 3.8GB | `ollama pull codellama` |
| Orca Mini | 3B | 1.9GB | `ollama pull orca-mini` |
| Vicuna | 7B | 3.8GB | `ollama pull vicuna` |
| Nous-Hermes | 7B | 3.8GB | `ollama pull nous-hermes` |
| Nous-Hermes 13B | 13B | 7.3GB | `ollama pull nous-hermes:13b` |
| Wizard Vicuna Uncensored | 13B | 7.3GB | `ollama pull wizard-vicuna` |
| Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | ------------------------------ |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Llama 2 | 7B | 3.8GB | `ollama run llama2` |
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
| Llama 2 13B | 13B | 7.3GB | `ollama run llama2:13b` |
| Llama 2 70B | 70B | 39GB | `ollama run llama2:70b` |
| Orca Mini | 3B | 1.9GB | `ollama run orca-mini` |
| Vicuna | 7B | 3.8GB | `ollama run vicuna` |

> Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.

## Examples
## Customize your own model

### Pull a public model
### Import from GGUF or GGML

```
ollama pull llama2
```
Ollama supports importing GGUF and GGML file formats in the Modelfile. This means if you have a model that is not in the Ollama library, you can create it, iterate on it, and upload it to the Ollama library to share with others when you are ready.

> This command can also be used to update a local model. Only updated changes will be pulled.
1. Create a file named Modelfile, and add a `FROM` instruction with the local filepath to the model you want to import.

### Run a model interactively
```
FROM ./vicuna-33b.Q4_0.gguf
```

```
ollama run llama2
>>> hi
Hello! How can I help you today?
```
3. Create the model in Ollama

For multiline input, you can wrap text with `"""`:
```
ollama create name -f path_to_modelfile
```

```
>>> """Hello,
... world!
... """
I'm a basic program that prints the famous "Hello, world!" message to the console.
```
5. Run the model

### Run a model non-interactively
```
ollama run name
```

```
$ ollama run llama2 'tell me a joke'
Sure! Here's a quick one:
Why did the scarecrow win an award? Because he was outstanding in his field!
```
### Customize a prompt

```
$ cat <<EOF >prompts.txt
tell me a joke about llamas
tell me another one
EOF
$ ollama run llama2 <prompts.txt
>>> tell me a joke about llamas
Why did the llama refuse to play hide-and-seek?
nobody likes to be hided!

>>> tell me another one
Sure, here's another one:

Why did the llama go to the bar?
To have a hay-often good time!
```

### Run a model on contents of a text file

```
$ ollama run llama2 "summarize this file:" "$(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```

### Customize a model

Pull a base model:
Models from the Ollama library can be customized with a prompt. The example

```
ollama pull llama2
@@ -138,30 +109,61 @@ ollama run mario

Hello! It's your friend Mario.
```

For more examples, see the [examples](./examples) directory. For more information on creating a Modelfile, see the [Modelfile](./docs/modelfile.md) documentation.
For more examples, see the [examples](./examples) directory. For more information on working with a Modelfile, see the [Modelfile](./docs/modelfile.md) documentation.

### Listing local models
## CLI Reference

### Create a model

`ollama create` is used to create a model from a Modelfile.

### Pull a model

```
ollama list
ollama pull llama2
```

### Removing local models
> This command can also be used to update a local model. Only the diff will be pulled.

### Remove a model

```
ollama rm llama2
```

## Model packages
### Copy a model

### Overview
```
ollama cp llama2 my-llama2
```

Ollama bundles model weights, configurations, and data into a single package, defined by a [Modelfile](./docs/modelfile.md).
### Multiline input

<picture>
<source media="(prefers-color-scheme: dark)" height="480" srcset="https://github.com/jmorganca/ollama/assets/251292/2fd96b5f-191b-45c1-9668-941cfad4eb70">
<img alt="logo" height="480" src="https://github.com/jmorganca/ollama/assets/251292/2fd96b5f-191b-45c1-9668-941cfad4eb70">
</picture>
For multiline input, you can wrap text with `"""`:

```
>>> """Hello,
... world!
... """
I'm a basic program that prints the famous "Hello, world!" message to the console.
```

### Pass in prompt as arguments

```
$ ollama run llama2 "summarize this file:" "$(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```

### List models on your computer

```
ollama list
```

### Start Ollama

`ollama serve` is used when you want to start ollama without running the desktop application.

## Building

@@ -204,12 +206,18 @@ curl -X POST http://localhost:11434/api/generate -d '{
}'
```

## Community Projects using Ollama
## Community Integrations

- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with a question-answering [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa).
- [Continue](https://github.com/continuedev/continue) - embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline.
- [LiteLLM](https://github.com/BerriAI/litellm) a lightweight python package to simplify LLM API calls
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) - interact with Ollama as a chatbot on Discord.
- [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) - Raycast extension to use Ollama for local llama inference on Raycast.
- [Simple HTML UI for Ollama](https://github.com/rtcfirefly/ollama-ui)
- [Emacs client](https://github.com/zweifisch/ollama) for Ollama
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama)
- [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel)
- [Continue](https://github.com/continuedev/continue)
- [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
- [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Dumbar](https://github.com/JerrySievert/Dumbar)
- [Emacs client](https://github.com/zweifisch/ollama)
api/client.py (225 changes, new file)
@@ -0,0 +1,225 @@
import os
import json
import requests

BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')

# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
# The final response object will include statistics and additional data from the request. Use the callback function to override
# the default handler.
def generate(model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
    try:
        url = f"{BASE_URL}/api/generate"
        payload = {
            "model": model_name,
            "prompt": prompt,
            "system": system,
            "template": template,
            "context": context,
            "options": options
        }

        # Remove keys with None values
        payload = {k: v for k, v in payload.items() if v is not None}

        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Creating a variable to hold the context history of the final chunk
            final_context = None

            # Variable to hold concatenated response strings if no callback is provided
            full_response = ""

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # If this is not the last chunk, add the "response" field value to full_response and print it
                        if not chunk.get("done"):
                            response_piece = chunk.get("response", "")
                            full_response += response_piece
                            print(response_piece, end="", flush=True)

                    # Check if it's the last chunk (done is true)
                    if chunk.get("done"):
                        final_context = chunk.get("context")

            # Return the full response and the final context
            return full_response, final_context
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None, None

# Create a model from a Modelfile. Use the callback function to override the default handler.
def create(model_name, model_path, callback=None):
    try:
        url = f"{BASE_URL}/api/create"
        payload = {"name": model_name, "path": model_path}

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the status
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the status
                    chunk = json.loads(line)

                    if callback:
                        callback(chunk)
                    else:
                        print(f"Status: {chunk.get('status')}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# Pull a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple
# calls to will share the same download progress. Use the callback function to override the default handler.
def pull(model_name, insecure=False, callback=None):
    try:
        url = f"{BASE_URL}/api/pull"
        payload = {
            "name": model_name,
            "insecure": insecure
        }

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # Print the status message directly to the console
                        print(chunk.get('status', ''), end='', flush=True)

                        # If there's layer data, you might also want to print that (adjust as necessary)
                        if 'digest' in chunk:
                            print(f" - Digest: {chunk['digest']}", end='', flush=True)
                            print(f" - Total: {chunk['total']}", end='', flush=True)
                            print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
                        else:
                            print()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# Push a model to the model registry. Use the callback function to override the default handler.
def push(model_name, insecure=False, callback=None):
    try:
        url = f"{BASE_URL}/api/push"
        payload = {
            "name": model_name,
            "insecure": insecure
        }

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # Print the status message directly to the console
                        print(chunk.get('status', ''), end='', flush=True)

                        # If there's layer data, you might also want to print that (adjust as necessary)
                        if 'digest' in chunk:
                            print(f" - Digest: {chunk['digest']}", end='', flush=True)
                            print(f" - Total: {chunk['total']}", end='', flush=True)
                            print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
                        else:
                            print()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# List models that are available locally.
def list():
    try:
        response = requests.get(f"{BASE_URL}/api/tags")
        response.raise_for_status()
        data = response.json()
        models = data.get('models', [])
        return models

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

# Copy a model. Creates a model with another name from an existing model.
def copy(source, destination):
    try:
        # Create the JSON payload
        payload = {
            "source": source,
            "destination": destination
        }

        response = requests.post(f"{BASE_URL}/api/copy", json=payload)
        response.raise_for_status()

        # If the request was successful, return a message indicating that the copy was successful
        return "Copy successful"

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

# Delete a model and its data.
def delete(model_name):
    try:
        url = f"{BASE_URL}/api/delete"
        payload = {"name": model_name}
        response = requests.delete(url, json=payload)
        response.raise_for_status()
        return "Delete successful"
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

# Show info about a model.
def show(model_name):
    try:
        url = f"{BASE_URL}/api/show"
        payload = {"name": model_name}
        response = requests.post(url, json=payload)
        response.raise_for_status()

        # Parse the JSON response and return it
        data = response.json()
        return data
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

def heartbeat():
    try:
        url = f"{BASE_URL}/"
        response = requests.head(url)
        response.raise_for_status()
        return "Ollama is running"
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return "Ollama is not running"
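The new client module above is a set of plain functions over `requests`. A minimal usage sketch follows, assuming the file is importable as `client` (for example, run from the `api/` directory) and that a server is listening on the default `OLLAMA_HOST`; the callback function name is illustrative only.

```python
import client

# Stream a completion. Without a callback the text is printed as it arrives,
# and the full response plus the final context array are returned.
text, context = client.generate("llama2", "Why is the sky blue?")

# Override the default handler with a callback to inspect each JSON chunk.
def on_chunk(chunk):
    if not chunk.get("done"):
        print(chunk.get("response", ""), end="", flush=True)

client.generate("llama2", "Tell me a joke.", callback=on_chunk)

# List locally available models (names come from the /api/tags response).
for model in client.list() or []:
    print(model.get("name"))
```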
api/types.go (26 changes)
@@ -31,6 +31,22 @@ func (e StatusError) Error() string {
}
}

// /api/chat
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
}

type ChatRequest struct {
Model string `json:"model"`
Messages []Message `json:"messages"`
}

type ChatResponse struct {
CreatedAt time.Time `json:"created_at"`
Message Message `json:"message"`
}

type GenerateRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
@@ -81,22 +97,18 @@ type CopyRequest struct {
type PullRequest struct {
Name string `json:"name"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
}

type ProgressResponse struct {
Status string `json:"status"`
Digest string `json:"digest,omitempty"`
Total int `json:"total,omitempty"`
Completed int `json:"completed,omitempty"`
Total int64 `json:"total,omitempty"`
Completed int64 `json:"completed,omitempty"`
}

type PushRequest struct {
Name string `json:"name"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
}

type ListResponse struct {
@@ -106,7 +118,7 @@ type ListResponse struct {
type ModelResponse struct {
Name string `json:"name"`
ModifiedAt time.Time `json:"modified_at"`
Size int `json:"size"`
Size int64 `json:"size"`
Digest string `json:"digest"`
}
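The new `Message` and `ChatRequest` JSON tags imply a request body like the sketch below. This only illustrates the wire format the types describe; the diff adds the types but does not show a `/api/chat` handler, so the endpoint itself, and the conventional "user"/"assistant" role values, are assumptions.

```python
import json

# Payload shape implied by the ChatRequest/Message struct tags above.
chat_request = {
    "model": "llama2",
    "messages": [
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi there, how can I help?"},
        {"role": "user", "content": "Why is the sky blue?"},
    ],
}
print(json.dumps(chat_request, indent=2))

# Per ChatResponse, a reply would carry "created_at" plus a "message" object,
# e.g. {"created_at": "...", "message": {"role": "assistant", "content": "..."}}.
```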
@@ -5,7 +5,7 @@ import winston from 'winston'
import 'winston-daily-rotate-file'
import * as path from 'path'

import { analytics, id } from './telemetry'
import { v4 as uuidv4 } from 'uuid'
import { installed } from './install'

require('@electron/remote/main').initialize()
@@ -164,11 +164,11 @@ app.on('before-quit', () => {

function init() {
if (app.isPackaged) {
heartbeat()
autoUpdater.checkForUpdates()
setInterval(() => {
heartbeat()
autoUpdater.checkForUpdates()
if (!updateAvailable) {
autoUpdater.checkForUpdates()
}
}, 60 * 60 * 1000)
}

@@ -234,28 +234,26 @@ app.on('window-all-closed', () => {
}
})

// In this file you can include the rest of your app's specific main process
// code. You can also put them in separate files and import them here.
let aid = ''
try {
aid = id()
} catch (e) {}
function id(): string {
const id = store.get('id') as string

autoUpdater.setFeedURL({
url: `https://ollama.ai/api/update?os=${process.platform}&arch=${process.arch}&version=${app.getVersion()}&id=${aid}`,
})
if (id) {
return id
}

async function heartbeat() {
analytics.track({
anonymousId: aid,
event: 'heartbeat',
properties: {
version: app.getVersion(),
},
})
const uuid = uuidv4()
store.set('id', uuid)
return uuid
}

autoUpdater.setFeedURL({
url: `https://ollama.ai/api/update?os=${process.platform}&arch=${
process.arch
}&version=${app.getVersion()}&id=${id()}`,
})

autoUpdater.on('error', e => {
logger.error(`update check failed - ${e.message}`)
console.error(`update check failed - ${e.message}`)
})
@@ -1,19 +0,0 @@
import { Analytics } from '@segment/analytics-node'
import { v4 as uuidv4 } from 'uuid'
import Store from 'electron-store'

const store = new Store()

export const analytics = new Analytics({ writeKey: process.env.TELEMETRY_WRITE_KEY || '<empty>' })

export function id(): string {
const id = store.get('id') as string

if (id) {
return id
}

const uuid = uuidv4()
store.set('id', uuid)
return uuid
}
cmd/cmd.go (353 changes)
@@ -11,20 +11,21 @@ import (
"io"
"log"
"net"
"net/http"
"os"
"os/exec"
"path"
"os/signal"
"path/filepath"
"runtime"
"strings"
"syscall"
"time"

"github.com/chzyer/readline"
"github.com/dustin/go-humanize"
"github.com/olekukonko/tablewriter"
"github.com/pdevine/readline"
"github.com/spf13/cobra"
"golang.org/x/crypto/ssh"
"golang.org/x/term"

"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
@@ -33,6 +34,26 @@ import (
"github.com/jmorganca/ollama/version"
)

type Painter struct {
IsMultiLine bool
}

func (p Painter) Paint(line []rune, _ int) []rune {
termType := os.Getenv("TERM")
if termType == "xterm-256color" && len(line) == 0 {
var prompt string
if p.IsMultiLine {
prompt = "Use \"\"\" to end multi-line input"
} else {
prompt = "Send a message (/? for help)"
}
return []rune(fmt.Sprintf("\033[38;5;245m%s\033[%dD\033[0m", prompt, len(prompt)))
}
// add a space and a backspace to prevent the cursor from walking up the screen
line = append(line, []rune(" \b")...)
return line
}
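The `Painter` hint works by printing the placeholder in a dim 256-color grey and then moving the cursor back over it, so anything the user types overwrites the hint. A rough terminal demo of the same escape sequence, assuming an xterm-style terminal; this is an illustration, not part of the project code.

```python
# ESC[38;5;245m sets a grey foreground, ESC[nD moves the cursor n columns left,
# ESC[0m resets the color. Typed characters then overwrite the grey hint.
hint = "Send a message (/? for help)"
print(f"\033[38;5;245m{hint}\033[{len(hint)}D\033[0m", end="", flush=True)
input()  # start typing and press Enter; the hint is overwritten as you type
```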
func CreateHandler(cmd *cobra.Command, args []string) error {
filename, _ := cmd.Flags().GetString("file")
filename, err := filepath.Abs(filename)
@@ -59,18 +80,18 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
currentDigest = resp.Digest
switch {
case strings.Contains(resp.Status, "embeddings"):
bar = progressbar.Default(int64(resp.Total), resp.Status)
bar.Set(resp.Completed)
bar = progressbar.Default(resp.Total, resp.Status)
bar.Set64(resp.Completed)
default:
// pulling
bar = progressbar.DefaultBytes(
int64(resp.Total),
resp.Total,
resp.Status,
)
bar.Set(resp.Completed)
bar.Set64(resp.Completed)
}
} else if resp.Digest == currentDigest && resp.Digest != "" {
bar.Set(resp.Completed)
bar.Set64(resp.Completed)
} else {
currentDigest = ""
if spinner != nil {
@@ -98,39 +119,24 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
}

func RunHandler(cmd *cobra.Command, args []string) error {
insecure, err := cmd.Flags().GetBool("insecure")
client, err := api.FromEnv()
if err != nil {
return err
}

mp := server.ParseModelPath(args[0])
models, err := client.List(context.Background())
if err != nil {
return err
}

if mp.ProtocolScheme == "http" && !insecure {
return fmt.Errorf("insecure protocol http")
}

fp, err := mp.GetManifestPath(false)
if err != nil {
return err
}

_, err = os.Stat(fp)
switch {
case errors.Is(err, os.ErrNotExist):
if err := pull(args[0], insecure); err != nil {
var apiStatusError api.StatusError
if !errors.As(err, &apiStatusError) {
return err
}

if apiStatusError.StatusCode != http.StatusBadGateway {
return err
}
canonicalModelPath := server.ParseModelPath(args[0])
for _, model := range models.Models {
if model.Name == canonicalModelPath.GetShortTagname() {
return RunGenerate(cmd, args)
}
case err != nil:
}

if err := PullHandler(cmd, args); err != nil {
return err
}

@@ -156,13 +162,13 @@ func PushHandler(cmd *cobra.Command, args []string) error {
if resp.Digest != currentDigest && resp.Digest != "" {
currentDigest = resp.Digest
bar = progressbar.DefaultBytes(
int64(resp.Total),
resp.Total,
fmt.Sprintf("pushing %s...", resp.Digest[7:19]),
)

bar.Set(resp.Completed)
bar.Set64(resp.Completed)
} else if resp.Digest == currentDigest && resp.Digest != "" {
bar.Set(resp.Completed)
bar.Set64(resp.Completed)
} else {
currentDigest = ""
fmt.Println(resp.Status)
@@ -345,13 +351,13 @@ func pull(model string, insecure bool) error {
if resp.Digest != currentDigest && resp.Digest != "" {
currentDigest = resp.Digest
bar = progressbar.DefaultBytes(
int64(resp.Total),
resp.Total,
fmt.Sprintf("pulling %s...", resp.Digest[7:19]),
)

bar.Set(resp.Completed)
bar.Set64(resp.Completed)
} else if resp.Digest == currentDigest && resp.Digest != "" {
bar.Set(resp.Completed)
bar.Set64(resp.Completed)
} else {
currentDigest = ""
fmt.Println(resp.Status)
@@ -387,70 +393,135 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
type generateContextKey string

func generate(cmd *cobra.Command, model, prompt string) error {
if len(strings.TrimSpace(prompt)) > 0 {
client, err := api.FromEnv()
if err != nil {
return err
client, err := api.FromEnv()
if err != nil {
return err
}

spinner := NewSpinner("")
go spinner.Spin(60 * time.Millisecond)

var latest api.GenerateResponse

generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
if !ok {
generateContext = []int{}
}

var wrapTerm bool
termType := os.Getenv("TERM")
if termType == "xterm-256color" {
wrapTerm = true
}

termWidth, _, err := term.GetSize(int(0))
if err != nil {
wrapTerm = false
}

// override wrapping if the user turned it off
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
if nowrap {
wrapTerm = false
}

cancelCtx, cancel := context.WithCancel(context.Background())
defer cancel()

sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT)
var abort bool

go func() {
<-sigChan
cancel()
abort = true
}()

var currentLineLength int
var wordBuffer string

request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
fn := func(response api.GenerateResponse) error {
if !spinner.IsFinished() {
spinner.Finish()
}

spinner := NewSpinner("")
go spinner.Spin(60 * time.Millisecond)
latest = response

var latest api.GenerateResponse
if wrapTerm {
for _, ch := range response.Response {
if currentLineLength+1 > termWidth-5 {
// backtrack the length of the last word and clear to the end of the line
fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
fmt.Printf("%s%c", wordBuffer, ch)
currentLineLength = len(wordBuffer) + 1
} else {
fmt.Print(string(ch))
currentLineLength += 1

generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
if !ok {
generateContext = []int{}
}

request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
fn := func(response api.GenerateResponse) error {
if !spinner.IsFinished() {
spinner.Finish()
switch ch {
case ' ':
wordBuffer = ""
case '\n':
currentLineLength = 0
default:
wordBuffer += string(ch)
}
}
}

latest = response

} else {
fmt.Print(response.Response)
}

return nil
}

if err := client.Generate(cancelCtx, &request, fn); err != nil {
if strings.Contains(err.Error(), "failed to load model") {
// tell the user to check the server log, if it exists locally
home, nestedErr := os.UserHomeDir()
if nestedErr != nil {
// return the original error
return err
}
logPath := filepath.Join(home, ".ollama", "logs", "server.log")
if _, nestedErr := os.Stat(logPath); nestedErr == nil {
err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
}
} else if strings.Contains(err.Error(), "context canceled") && abort {
spinner.Finish()
return nil
}

if err := client.Generate(context.Background(), &request, fn); err != nil {
if strings.Contains(err.Error(), "failed to load model") {
// tell the user to check the server log, if it exists locally
home, nestedErr := os.UserHomeDir()
if nestedErr != nil {
// return the original error
return err
}
logPath := filepath.Join(home, ".ollama", "logs", "server.log")
if _, nestedErr := os.Stat(logPath); nestedErr == nil {
err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
}
}
return err
}

fmt.Println()
fmt.Println()

if !latest.Done {
return errors.New("unexpected end of response")
}

verbose, err := cmd.Flags().GetBool("verbose")
if err != nil {
return err
}

if verbose {
latest.Summary()
}

ctx := cmd.Context()
ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
cmd.SetContext(ctx)
return err
}
if prompt != "" {
fmt.Println()
fmt.Println()
}

if !latest.Done {
if abort {
return nil
}
return errors.New("unexpected end of response")
}

verbose, err := cmd.Flags().GetBool("verbose")
if err != nil {
return err
}

if verbose {
latest.Summary()
}

ctx := cmd.Context()
ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
cmd.SetContext(ctx)

return nil
}
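The new wrapping loop buffers the current word and, when the next character would pass the right margin, erases the partially printed word (cursor-left plus clear-to-end-of-line escapes) and reprints it on a fresh line. A standalone Python sketch of the same idea, for illustration only; the function name and the sample chunks are invented.

```python
import shutil
import sys

def stream_with_word_wrap(chunks, margin=5):
    """Echo streamed text, moving a partially printed word to the next line
    instead of splitting it at the terminal edge (mirrors the Go logic above)."""
    term_width = shutil.get_terminal_size().columns
    line_len = 0
    word = ""
    for text in chunks:
        for ch in text:
            if line_len + 1 > term_width - margin:
                # erase the partial word at the end of the line, then reprint
                # it at the start of the next line together with the new char
                sys.stdout.write(f"\x1b[{len(word)}D\x1b[K\n")
                sys.stdout.write(word + ch)
                line_len = len(word) + 1
            else:
                sys.stdout.write(ch)
                line_len += 1
            if ch == " ":
                word = ""
            elif ch == "\n":
                line_len = 0
            else:
                word += ch
        sys.stdout.flush()
    print()

stream_with_word_wrap(["a stream of generated tokens ", "arriving in small pieces, ",
                       "wrapped without breaking words at the right margin"])
```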
@@ -461,19 +532,21 @@ func generateInteractive(cmd *cobra.Command, model string) error {
return err
}

// load the model
if err := generate(cmd, model, ""); err != nil {
return err
}

completer := readline.NewPrefixCompleter(
readline.PcItem("/help"),
readline.PcItem("/list"),
readline.PcItem("/set",
readline.PcItem("history"),
readline.PcItem("nohistory"),
readline.PcItem("wordwrap"),
readline.PcItem("nowordwrap"),
readline.PcItem("verbose"),
readline.PcItem("quiet"),
readline.PcItem("mode",
readline.PcItem("vim"),
readline.PcItem("emacs"),
readline.PcItem("default"),
),
),
readline.PcItem("/show",
readline.PcItem("license"),
@@ -491,7 +564,10 @@ func generateInteractive(cmd *cobra.Command, model string) error {
fmt.Fprintln(os.Stderr, completer.Tree(" "))
}

var painter Painter

config := readline.Config{
Painter: &painter,
Prompt: ">>> ",
HistoryFile: filepath.Join(home, ".ollama", "history"),
AutoComplete: completer,
@@ -513,7 +589,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
return nil
case errors.Is(err, readline.ErrInterrupt):
if line == "" {
return nil
fmt.Println("Use Ctrl-D or /bye to exit.")
}

continue
@@ -527,6 +603,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
case isMultiLine:
if strings.HasSuffix(line, `"""`) {
isMultiLine = false
painter.IsMultiLine = isMultiLine
multiLineBuffer += strings.TrimSuffix(line, `"""`)
line = multiLineBuffer
multiLineBuffer = ""
@@ -537,6 +614,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
}
case strings.HasPrefix(line, `"""`):
isMultiLine = true
painter.IsMultiLine = isMultiLine
multiLineBuffer = strings.TrimPrefix(line, `"""`) + " "
scanner.SetPrompt("... ")
continue
@@ -545,45 +623,44 @@ func generateInteractive(cmd *cobra.Command, model string) error {
if err := ListHandler(cmd, args[1:]); err != nil {
return err
}

continue
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "history":
scanner.HistoryEnable()
continue
case "nohistory":
scanner.HistoryDisable()
continue
case "wordwrap":
cmd.Flags().Set("nowordwrap", "false")
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
cmd.Flags().Set("nowordwrap", "true")
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
cmd.Flags().Set("verbose", "true")
continue
fmt.Println("Set 'verbose' mode.")
case "quiet":
cmd.Flags().Set("verbose", "false")
continue
fmt.Println("Set 'quiet' mode.")
case "mode":
if len(args) > 2 {
switch args[2] {
case "vim":
scanner.SetVimMode(true)
continue
case "emacs", "default":
scanner.SetVimMode(false)
continue
default:
usage()
continue
}
} else {
usage()
continue
}
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
}
} else {
usage()
continue
}
case strings.HasPrefix(line, "/show"):
args := strings.Fields(line)
@@ -591,7 +668,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
resp, err := server.GetModelInfo(model)
if err != nil {
fmt.Println("error: couldn't get model")
continue
return err
}

switch args[1] {
@@ -606,23 +683,24 @@ func generateInteractive(cmd *cobra.Command, model string) error {
case "template":
fmt.Println(resp.Template)
default:
fmt.Println("error: unknown command")
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
}

continue
} else {
usage()
continue
}
case line == "/help", line == "/?":
usage()
continue
case line == "/exit", line == "/bye":
return nil
case strings.HasPrefix(line, "/"):
args := strings.Fields(line)
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
}

if err := generate(cmd, model, line); err != nil {
return err
if len(line) > 0 && line[0] != '/' {
if err := generate(cmd, model, line); err != nil {
return err
}
}
}
}
@@ -641,28 +719,19 @@ func generateBatch(cmd *cobra.Command, model string) error {
}

func RunServer(cmd *cobra.Command, _ []string) error {
host, port := "127.0.0.1", "11434"

parts := strings.Split(os.Getenv("OLLAMA_HOST"), ":")
if ip := net.ParseIP(parts[0]); ip != nil {
host = ip.String()
}

if len(parts) > 1 {
port = parts[1]
}

// deprecated: include port in OLLAMA_HOST
if p := os.Getenv("OLLAMA_PORT"); p != "" {
port = p
}

err := initializeKeypair()
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
if err != nil {
host, port = "127.0.0.1", "11434"
if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
host = ip.String()
}
}

if err := initializeKeypair(); err != nil {
return err
}

ln, err := net.Listen("tcp", fmt.Sprintf("%s:%s", host, port))
ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
if err != nil {
return err
}
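The replacement `RunServer` logic first tries to split `OLLAMA_HOST` into host and port, and only falls back to `127.0.0.1:11434`, keeping a bare IP (brackets stripped) if one was given. A rough Python rendering of that precedence; the helper name and the exact edge-case handling are assumptions, not the project's code.

```python
import ipaddress
import os

def resolve_listen_address(default_host="127.0.0.1", default_port="11434"):
    """Roughly mirrors the new OLLAMA_HOST handling: accept "host:port"
    (IPv6 hosts in brackets), a bare IP address, or an empty value."""
    raw = os.environ.get("OLLAMA_HOST", "")
    host, sep, port = raw.rpartition(":")
    # a usable host:port pair has digits after the last colon and, for IPv6,
    # the host wrapped in brackets so the last colon is not part of the address
    if sep and port.isdigit() and not (host.count(":") and not host.startswith("[")):
        return host.strip("[]") or default_host, port
    try:
        # bare IP with no port: keep the address, use the default port
        return str(ipaddress.ip_address(raw.strip("[]"))), default_port
    except ValueError:
        return default_host, default_port

print(resolve_listen_address())
```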
@@ -676,6 +745,15 @@ func RunServer(cmd *cobra.Command, _ []string) error {
if err := server.PruneLayers(); err != nil {
return err
}

manifestsPath, err := server.GetManifestPath()
if err != nil {
return err
}

if err := server.PruneDirectory(manifestsPath); err != nil {
return err
}
}

return server.Serve(ln, origins)
@@ -703,7 +781,7 @@ func initializeKeypair() error {
return err
}

err = os.MkdirAll(path.Dir(privKeyPath), 0o700)
err = os.MkdirAll(filepath.Dir(privKeyPath), 0o755)
if err != nil {
return fmt.Errorf("could not create directory %w", err)
}
@@ -831,6 +909,7 @@ func NewCLI() *cobra.Command {

runCmd.Flags().Bool("verbose", false, "Show timings for response")
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")

serveCmd := &cobra.Command{
Use: "serve",
docs/api.md (142 changes)
@@ -3,26 +3,33 @@
## Endpoints

- [Generate a completion](#generate-a-completion)
- [Create a model](#create-a-model)
- [List local models](#list-local-models)
- [Copy a model](#copy-a-model)
- [Delete a model](#delete-a-model)
- [Pull a model](#pull-a-model)
- [Generate embeddings](#generate-embeddings)
- [Create a Model](#create-a-model)
- [List Local Models](#list-local-models)
- [Show Model Information](#show-model-information)
- [Copy a Model](#copy-a-model)
- [Delete a Model](#delete-a-model)
- [Pull a Model](#pull-a-model)
- [Push a Model](#push-a-model)
- [Generate Embeddings](#generate-embeddings)

## Conventions

### Model names

Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and if not provided will default to `latest`. The tag is used to identify a specific version.
Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

### Durations

All durations are returned in nanoseconds.

### Streaming responses

Certain endpoints stream responses as JSON objects delineated with the newline (`\n`) character.

## Generate a completion

```
```shell
POST /api/generate
```

@@ -42,7 +49,7 @@ Advanced parameters:

### Request

```
```shell
curl -X POST http://localhost:11434/api/generate -d '{
"model": "llama2:7b",
"prompt": "Why is the sky blue?"
@@ -95,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),

## Create a Model

```
```shell
POST /api/create
```

@@ -108,7 +115,7 @@ Create a model from a [`Modelfile`](./modelfile.md)

### Request

```
```shell
curl -X POST http://localhost:11434/api/create -d '{
"name": "mario",
"path": "~/Modelfile"
@@ -117,7 +124,7 @@ curl -X POST http://localhost:11434/api/create -d '{

### Response

A stream of JSON objects. When finished, `status` is `success`
A stream of JSON objects. When finished, `status` is `success`.

```json
{
@@ -127,7 +134,7 @@ A stream of JSON objects. When finished, `status` is `success`

## List Local Models

```
```shell
GET /api/tags
```

@@ -135,7 +142,7 @@ List models that are available locally.

### Request

```
```shell
curl http://localhost:11434/api/tags
```

@@ -158,9 +165,40 @@ curl http://localhost:11434/api/tags
}
```

## Show Model Information

```shell
POST /api/show
```

Show details about a model including modelfile, template, parameters, license, and system prompt.

### Parameters

- `name`: name of the model to show

### Request

```shell
curl http://localhost:11434/api/show -d '{
"name": "llama2:7b"
}'
```

### Response

```json
{
"license": "<contents of license block>",
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
"parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
"template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
}
```

## Copy a Model

```
```shell
POST /api/copy
```

@@ -168,7 +206,7 @@ Copy a model. Creates a model with another name from an existing model.

### Request

```
```shell
curl http://localhost:11434/api/copy -d '{
"source": "llama2:7b",
"destination": "llama2-backup"
@@ -177,7 +215,7 @@ curl http://localhost:11434/api/copy -d '{

## Delete a Model

```
```shell
DELETE /api/delete
```

@@ -189,7 +227,7 @@ Delete a model and its data.

### Request

```
```shell
curl -X DELETE http://localhost:11434/api/delete -d '{
"name": "llama2:13b"
}'
@@ -197,19 +235,20 @@ curl -X DELETE http://localhost:11434/api/delete -d '{

## Pull a Model

```
```shell
POST /api/pull
```

Download a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple calls to will share the same download progress.
Download a model from the ollama library. Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress.

### Parameters

- `name`: name of the model to pull
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.

### Request

```
```shell
curl -X POST http://localhost:11434/api/pull -d '{
"name": "llama2:7b"
}'
@@ -225,9 +264,63 @@ curl -X POST http://localhost:11434/api/pull -d '{
}
```

## Push a Model

```shell
POST /api/push
```

Upload a model to a model library. Requires registering for ollama.ai and adding a public key first.

### Parameters

- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.

### Request

```shell
curl -X POST http://localhost:11434/api/push -d '{
"name": "mattw/pygmalion:latest"
}'
```

### Response

Streaming response that starts with:

```json
{"status":"retrieving manifest"}
```

and then:

```json
{
"status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
"total":1928429856
}
```

Then there is a series of uploading responses:

```json
{
"status":"starting upload",
"digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
"total":1928429856}
```

Finally, when the upload is complete:

```json
{"status":"pushing manifest"}
{"status":"success"}
```

## Generate Embeddings

```
```shell
POST /api/embeddings
```

@@ -244,7 +337,7 @@ Advanced parameters:

### Request

```
```shell
curl -X POST http://localhost:11434/api/embeddings -d '{
"model": "llama2:7b",
"prompt": "Here is an article about llamas..."
@@ -259,5 +352,4 @@ curl -X POST http://localhost:11434/api/embeddings -d '{
0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
]
}
```
}```
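As a companion to the curl requests above, a small Python sketch of calling the embeddings endpoint; it assumes a local server, an already-pulled `llama2:7b`, and that the response carries the vector under an `embedding` key as in the documented output.

```python
import requests

# POST /api/embeddings with the model and prompt, mirroring the curl example.
resp = requests.post(
    "http://localhost:11434/api/embeddings",
    json={"model": "llama2:7b", "prompt": "Here is an article about llamas..."},
)
resp.raise_for_status()
embedding = resp.json()["embedding"]
print(len(embedding), embedding[:5])
```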
@@ -14,4 +14,6 @@ OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve

## Where are models stored?

Raw model data is stored under `~/.ollama/models`.
* macOS: Raw model data is stored under `~/.ollama/models`.
* Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`
83 docs/linux.md Normal file
@@ -0,0 +1,83 @@
# Installing Ollama on Linux

> Note: A one line installer for Ollama is available by running:
>
> ```
> curl https://ollama.ai/install.sh | sh
> ```

## Download the `ollama` binary

Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:

```
sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
```

## Start Ollama

Start Ollama by running `ollama serve`:

```
ollama serve
```

Once Ollama is running, run a model in another terminal session:

```
ollama run llama2
```

## Install CUDA drivers (optional – for Nvidia GPUs)

[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.

Verify that the drivers are installed by running the following command, which should print details about your GPU:

```
nvidia-smi
```

## Adding Ollama as a startup service (optional)

Create a user for Ollama:

```
sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
```

Create a service file in `/etc/systemd/system/ollama.service`:

```ini
[Unit]
Description=Ollama Service
After=network-online.target

[Service]
ExecStart=/usr/bin/ollama serve
User=ollama
Group=ollama
Restart=always
RestartSec=3
Environment="HOME=/usr/share/ollama"

[Install]
WantedBy=default.target
```

Then start the service:

```
sudo systemctl daemon-reload
sudo systemctl enable ollama
```
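Note that `systemctl enable` only registers the service to start at boot; to launch it immediately in the current session, start it explicitly as well:

```
sudo systemctl start ollama
```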
### Viewing logs

To view logs of Ollama running as a startup service, run:

```
journalctl -u ollama
```
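To follow the log output live while debugging, the same command can be used with the follow flag:

```
journalctl -u ollama -f
```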
@@ -94,6 +94,7 @@ This bin file location should be specified as an absolute path or relative to th
### EMBED

The EMBED instruction is used to add embeddings of files to a model. This is useful for adding custom data that the model can reference when generating an answer. Note that currently only text files are supported, formatted with each line as one embedding.

```
FROM <model name>:<tag>
EMBED <file path>.txt
@@ -118,13 +119,14 @@ PARAMETER <parameter> <parametervalue>
| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
| num_gqa | The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for llama2:70b | int | num_gqa 1 |
| num_gpu | The number of GPUs to use. On macOS it defaults to 1 to enable metal support, 0 to disable. | int | num_gpu 1 |
| num_gpu | The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable. | int | num_gpu 50 |
| num_thread | Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). | int | num_thread 8 |
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
| stop | Sets the stop sequences to use. | string | stop "AI assistant:" |
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 |
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |
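As a rough illustration of how the parameters in the table combine, a Modelfile might set several of them at once; the values below are arbitrary examples rather than recommended defaults:

```
FROM llama2:7b
PARAMETER temperature 0.7
PARAMETER num_ctx 4096
PARAMETER repeat_penalty 1.1
PARAMETER stop "AI assistant:"
```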
2 go.mod
@@ -8,6 +8,7 @@ require (
|
||||
github.com/mattn/go-runewidth v0.0.14
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
|
||||
github.com/olekukonko/tablewriter v0.0.5
|
||||
github.com/pdevine/readline v1.5.2
|
||||
github.com/spf13/cobra v1.7.0
|
||||
)
|
||||
|
||||
@@ -16,7 +17,6 @@ require github.com/rivo/uniseg v0.2.0 // indirect
|
||||
require (
|
||||
github.com/bytedance/sonic v1.9.1 // indirect
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
|
||||
github.com/chzyer/readline v1.5.1
|
||||
github.com/gabriel-vasile/mimetype v1.4.2 // indirect
|
||||
github.com/gin-contrib/cors v1.4.0
|
||||
github.com/gin-contrib/sse v0.1.0 // indirect
|
||||
|
5 go.sum
@@ -6,8 +6,6 @@ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhD
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
|
||||
github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM=
|
||||
github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ=
|
||||
github.com/chzyer/readline v1.5.1 h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=
|
||||
github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk=
|
||||
github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
|
||||
github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
@@ -80,6 +78,8 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N
|
||||
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
|
||||
github.com/pdevine/readline v1.5.2 h1:oz6Y5GdTmhPG+08hhxcAvtHitSANWuA2100Sppb38xI=
|
||||
github.com/pdevine/readline v1.5.2/go.mod h1:na/LbuE5PYwxI7GyopWdIs3U8HVe89lYlNTFTXH3wOw=
|
||||
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
|
||||
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
|
||||
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
|
||||
@@ -120,7 +120,6 @@ golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
|
||||
golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
|
||||
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
|
||||
golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
|
20 llm/ggml.go
@@ -4,8 +4,6 @@ import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"io"
|
||||
"path"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type GGML struct {
|
||||
@@ -79,6 +77,7 @@ type model interface {
|
||||
ModelFamily() string
|
||||
ModelType() string
|
||||
FileType() string
|
||||
NumLayers() int64
|
||||
}
|
||||
|
||||
type container interface {
|
||||
@@ -166,23 +165,6 @@ func (c *containerLORA) Decode(r io.Reader) (model, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var (
|
||||
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
|
||||
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
|
||||
)
|
||||
|
||||
var (
|
||||
ggmlInit sync.Once
|
||||
ggmlRunnerPath string
|
||||
)
|
||||
|
||||
func ggmlRunner() ModelRunner {
|
||||
ggmlInit.Do(func() {
|
||||
ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU)
|
||||
})
|
||||
return ModelRunner{Path: ggmlRunnerPath}
|
||||
}
|
||||
|
||||
const (
|
||||
// Magic constant for `ggml` files (unversioned).
|
||||
FILE_MAGIC_GGML = 0x67676d6c
|
||||
|
30 llm/gguf.go
@@ -6,8 +6,6 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"path"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type containerGGUF struct {
|
||||
@@ -197,6 +195,16 @@ func (llm *ggufModel) Decode(r io.Reader) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumLayers() int64 {
|
||||
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
|
||||
if !exists {
|
||||
return 0
|
||||
}
|
||||
|
||||
v := value.(uint32)
|
||||
return int64(v)
|
||||
}
|
||||
|
||||
func (ggufModel) readU8(r io.Reader) uint8 {
|
||||
var u8 uint8
|
||||
binary.Read(r, binary.LittleEndian, &u8)
|
||||
@@ -369,21 +377,3 @@ func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
var (
|
||||
ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
|
||||
ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
|
||||
)
|
||||
|
||||
var (
|
||||
ggufInit sync.Once
|
||||
ggufRunnerPath string
|
||||
)
|
||||
|
||||
func ggufRunner() ModelRunner {
|
||||
ggufInit.Do(func() {
|
||||
ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU)
|
||||
})
|
||||
|
||||
return ModelRunner{Path: ggufRunnerPath}
|
||||
}
|
||||
|
@@ -3,14 +3,14 @@ package llm
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate -command git-apply git -C ggml apply
|
||||
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||
|
@@ -3,14 +3,14 @@ package llm
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate -command git-apply git -C ggml apply
|
||||
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/metal --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build gguf/build/gpu --target server --config Release
|
||||
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build gguf/build/metal --target server --config Release
|
||||
|
@@ -3,13 +3,20 @@ package llm
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate -command git-apply git -C ggml apply
|
||||
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
|
||||
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/gpu --target server --config Release
|
||||
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
||||
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||
|
||||
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cuda --target server --config Release
|
||||
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/cuda --target server --config Release
|
||||
|
@@ -1,17 +1,14 @@
|
||||
//go:build !darwin
|
||||
// +build !darwin
|
||||
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate -command git-apply git -C ggml apply
|
||||
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
@@ -1,32 +0,0 @@
|
||||
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
|
||||
From: Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||
Date: Tue, 5 Sep 2023 16:05:08 -0400
|
||||
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
|
||||
|
||||
---
|
||||
ggml-metal.metal | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
||||
index 3f31252..ce3541f 100644
|
||||
--- a/ggml-metal.metal
|
||||
+++ b/ggml-metal.metal
|
||||
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
//load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
#pragma unroll(16)
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
--
|
||||
2.39.2 (Apple Git-143)
|
||||
|
Submodule llm/llama.cpp/gguf updated: 53885d7256...bc9d3e3971
27 llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch Normal file
@@ -0,0 +1,27 @@
|
||||
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <mxyng@pm.me>
|
||||
Date: Wed, 20 Sep 2023 14:19:52 -0700
|
||||
Subject: [PATCH] copy cuda runtime libraries
|
||||
|
||||
---
|
||||
CMakeLists.txt | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 824d9f2..dd24137 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
||||
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
|
||||
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
|
||||
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
|
||||
+
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
# 60 == f16 CUDA intrinsics
|
||||
--
|
||||
2.42.0
|
||||
|
25 llm/llama.cpp/patches/0001-remove-warm-up-logging.patch Normal file
@@ -0,0 +1,25 @@
|
||||
From 07993bdc35345b67b27aa649a7c099ad42d80c4c Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <mxyng@pm.me>
|
||||
Date: Thu, 21 Sep 2023 14:43:21 -0700
|
||||
Subject: [PATCH] remove warm up logging
|
||||
|
||||
---
|
||||
common/common.cpp | 2 --
|
||||
1 file changed, 2 deletions(-)
|
||||
|
||||
diff --git a/common/common.cpp b/common/common.cpp
|
||||
index 2597ba0..b56549b 100644
|
||||
--- a/common/common.cpp
|
||||
+++ b/common/common.cpp
|
||||
@@ -780,8 +780,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
}
|
||||
|
||||
{
|
||||
- LOG("warming up the model with an empty run\n");
|
||||
-
|
||||
const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
|
||||
llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
|
||||
llama_reset_timings(lctx);
|
||||
--
|
||||
2.42.0
|
||||
|
226 llm/llama.go
@@ -28,71 +28,96 @@ import (
|
||||
//go:embed llama.cpp/*/build/*/bin/*
|
||||
var llamaCppEmbed embed.FS
|
||||
|
||||
func osPath(llamaPath string) string {
|
||||
if runtime.GOOS == "windows" {
|
||||
return path.Join(llamaPath, "Release")
|
||||
}
|
||||
|
||||
return llamaPath
|
||||
type ModelRunner struct {
|
||||
Path string // path to the model runner executable
|
||||
}
|
||||
|
||||
func chooseRunner(gpuPath, cpuPath string) string {
|
||||
tmpDir, err := os.MkdirTemp("", "llama-*")
|
||||
if err != nil {
|
||||
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
|
||||
}
|
||||
func chooseRunners(workDir, runnerType string) []ModelRunner {
|
||||
buildPath := path.Join("llama.cpp", runnerType, "build")
|
||||
var runners []string
|
||||
|
||||
llamaPath := osPath(gpuPath)
|
||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||
llamaPath = osPath(cpuPath)
|
||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||
log.Fatalf("llama.cpp executable not found")
|
||||
}
|
||||
}
|
||||
|
||||
files := []string{"server"}
|
||||
// set the runners based on the OS
|
||||
// IMPORTANT: the order of the runners in the array is the priority order
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
files = []string{"server.exe"}
|
||||
case "darwin":
|
||||
if llamaPath == osPath(gpuPath) {
|
||||
files = append(files, "ggml-metal.metal")
|
||||
runners = []string{
|
||||
path.Join(buildPath, "metal", "bin", "server"),
|
||||
path.Join(buildPath, "cpu", "bin", "server"),
|
||||
}
|
||||
case "linux":
|
||||
// check if there is a GPU available
|
||||
if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
|
||||
// this error was logged on start-up, so we don't need to log it again
|
||||
llamaPath = osPath(cpuPath)
|
||||
runners = []string{
|
||||
path.Join(buildPath, "cuda", "bin", "server"),
|
||||
path.Join(buildPath, "cpu", "bin", "server"),
|
||||
}
|
||||
case "windows":
|
||||
// TODO: select windows GPU runner here when available
|
||||
runners = []string{
|
||||
path.Join(buildPath, "cpu", "bin", "Release", "server.exe"),
|
||||
}
|
||||
default:
|
||||
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
|
||||
runners = []string{
|
||||
path.Join(buildPath, "cpu", "bin", "server"),
|
||||
}
|
||||
}
|
||||
|
||||
for _, f := range files {
|
||||
srcPath := path.Join(llamaPath, f)
|
||||
destPath := filepath.Join(tmpDir, f)
|
||||
|
||||
srcFile, err := llamaCppEmbed.Open(srcPath)
|
||||
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
|
||||
for _, r := range runners {
|
||||
// find all the files in the runner's bin directory
|
||||
files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r), "*"))
|
||||
if err != nil {
|
||||
log.Fatalf("read llama.cpp %s: %v", f, err)
|
||||
// this is expected, ollama may be compiled without all runners packed in
|
||||
log.Printf("%s runner not found: %v", r, err)
|
||||
continue
|
||||
}
|
||||
defer srcFile.Close()
|
||||
|
||||
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
log.Fatalf("write llama.cpp %s: %v", f, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
for _, f := range files {
|
||||
runnerAvailable = true
|
||||
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
log.Fatalf("copy llama.cpp %s: %v", f, err)
|
||||
srcFile, err := llamaCppEmbed.Open(f)
|
||||
if err != nil {
|
||||
log.Fatalf("read llama runner %s: %v", f, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
|
||||
// create the directory in case it does not exist, filepath.Dir() converts the file path to the OS's format
|
||||
destPath := filepath.Join(workDir, filepath.Dir(f))
|
||||
if err := os.MkdirAll(destPath, 0o755); err != nil {
|
||||
log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
|
||||
}
|
||||
|
||||
// create the path to the destination file, filepath.Base() converts the file path to the OS's format
|
||||
destFile := filepath.Join(destPath, filepath.Base(f))
|
||||
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
log.Fatalf("write llama runner %s: %v", f, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
log.Fatalf("copy llama runner %s: %v", f, err)
|
||||
}
|
||||
case err != nil:
|
||||
log.Fatalf("stat llama runner %s: %v", f, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
runPath := filepath.Join(tmpDir, "server")
|
||||
if runtime.GOOS == "windows" {
|
||||
runPath = filepath.Join(tmpDir, "server.exe")
|
||||
if !runnerAvailable {
|
||||
log.Fatalf("%s runner not found", runnerType)
|
||||
}
|
||||
|
||||
return runPath
|
||||
// return the runners to try in priority order
|
||||
localRunnersByPriority := []ModelRunner{}
|
||||
for _, r := range runners {
|
||||
// clean the ModelRunner paths so that they match the OS we are running on
|
||||
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: filepath.Clean(path.Join(workDir, r))})
|
||||
}
|
||||
|
||||
return localRunnersByPriority
|
||||
}
|
||||
|
||||
type llamaModel struct {
|
||||
@@ -130,6 +155,10 @@ func (llm *llamaModel) FileType() string {
|
||||
return fileType(llm.hyperparameters.FileType)
|
||||
}
|
||||
|
||||
func (llm *llamaModel) NumLayers() int64 {
|
||||
return int64(llm.hyperparameters.NumLayer)
|
||||
}
|
||||
|
||||
type llamaHyperparameters struct {
|
||||
// NumVocab is the size of the model's vocabulary.
|
||||
NumVocab uint32
|
||||
@@ -153,10 +182,6 @@ type Running struct {
|
||||
Cancel context.CancelFunc
|
||||
}
|
||||
|
||||
type ModelRunner struct {
|
||||
Path string // path to the model runner executable
|
||||
}
|
||||
|
||||
type llama struct {
|
||||
api.Options
|
||||
Running
|
||||
@@ -165,7 +190,7 @@ type llama struct {
|
||||
var errNoGPU = errors.New("nvidia-smi command failed")
|
||||
|
||||
// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
|
||||
func CheckVRAM() (int, error) {
|
||||
func CheckVRAM() (int64, error) {
|
||||
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
|
||||
var stdout bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
@@ -174,11 +199,11 @@ func CheckVRAM() (int, error) {
|
||||
return 0, errNoGPU
|
||||
}
|
||||
|
||||
var total int
|
||||
var total int64
|
||||
scanner := bufio.NewScanner(&stdout)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
vram, err := strconv.Atoi(line)
|
||||
vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
|
||||
}
|
||||
@@ -189,13 +214,13 @@ func CheckVRAM() (int, error) {
|
||||
return total, nil
|
||||
}
|
||||
|
||||
func NumGPU(opts api.Options) int {
|
||||
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
|
||||
if opts.NumGPU != -1 {
|
||||
return opts.NumGPU
|
||||
}
|
||||
n := 1 // default to enable metal on macOS
|
||||
if runtime.GOOS == "linux" {
|
||||
vram, err := CheckVRAM()
|
||||
vramMib, err := CheckVRAM()
|
||||
if err != nil {
|
||||
if err.Error() != "nvidia-smi command failed" {
|
||||
log.Print(err.Error())
|
||||
@@ -203,37 +228,25 @@ func NumGPU(opts api.Options) int {
|
||||
// nvidia driver not installed or no nvidia GPU found
|
||||
return 0
|
||||
}
|
||||
// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
|
||||
switch {
|
||||
case vram < 500:
|
||||
log.Printf("WARNING: Low VRAM detected, disabling GPU")
|
||||
n = 0
|
||||
case vram < 1000:
|
||||
n = 4
|
||||
case vram < 2000:
|
||||
n = 8
|
||||
case vram < 4000:
|
||||
n = 12
|
||||
case vram < 8000:
|
||||
n = 16
|
||||
case vram < 12000:
|
||||
n = 24
|
||||
case vram < 16000:
|
||||
n = 32
|
||||
default:
|
||||
n = 48
|
||||
}
|
||||
log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
|
||||
|
||||
totalVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
|
||||
|
||||
// Calculate bytes per layer
|
||||
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
|
||||
bytesPerLayer := fileSizeBytes / numLayer
|
||||
|
||||
// set n to the max number of layers we can fit in VRAM
|
||||
return int(totalVramBytes / bytesPerLayer)
|
||||
|
||||
log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)
|
||||
}
|
||||
return n
|
||||
// default to enable metal on macOS
|
||||
return 1
|
||||
}
|
||||
|
||||
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if _, err := os.Stat(runner.Path); err != nil {
|
||||
func newLlama(model string, adapters []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
|
||||
fileInfo, err := os.Stat(model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -247,7 +260,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
|
||||
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
|
||||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
|
||||
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
|
||||
"--embedding",
|
||||
}
|
||||
|
||||
@@ -278,7 +291,12 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
}
|
||||
|
||||
// start the llama.cpp server with a retry in case the port is already in use
|
||||
for try := 0; try < 3; try++ {
|
||||
for _, runner := range runners {
|
||||
if _, err := os.Stat(runner.Path); err != nil {
|
||||
log.Printf("llama runner not found: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cmd := exec.CommandContext(
|
||||
@@ -286,20 +304,30 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
runner.Path,
|
||||
append(params, "--port", strconv.Itoa(port))...,
|
||||
)
|
||||
|
||||
cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
|
||||
cmd.Stdout = os.Stderr
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
|
||||
|
||||
log.Print("starting llama.cpp server")
|
||||
log.Print("starting llama runner")
|
||||
if err := llm.Cmd.Start(); err != nil {
|
||||
log.Printf("error starting the external llama.cpp server: %v", err)
|
||||
log.Printf("error starting the external llama runner: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// monitor the command, it is blocking, so if it exits we need to capture that
|
||||
go func() {
|
||||
err := llm.Cmd.Wait() // this will block until the command exits
|
||||
if err != nil {
|
||||
log.Printf("llama runner exited with error: %v", err)
|
||||
} else {
|
||||
log.Printf("llama runner exited")
|
||||
}
|
||||
}()
|
||||
|
||||
if err := waitForServer(llm); err != nil {
|
||||
log.Printf("error starting llama.cpp server: %v", err)
|
||||
log.Printf("error starting llama runner: %v", err)
|
||||
llm.Close()
|
||||
// try again
|
||||
continue
|
||||
@@ -309,19 +337,24 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
return llm, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("max retry exceeded starting llama.cpp")
|
||||
return nil, fmt.Errorf("failed to start a llama runner")
|
||||
}
|
||||
|
||||
func waitForServer(llm *llama) error {
|
||||
// wait for the server to start responding
|
||||
start := time.Now()
|
||||
expiresAt := time.Now().Add(45 * time.Second)
|
||||
expiresAt := time.Now().Add(2 * time.Minute) // be generous with timeout, large models can take a while to load
|
||||
ticker := time.NewTicker(200 * time.Millisecond)
|
||||
|
||||
log.Print("waiting for llama.cpp server to start responding")
|
||||
log.Print("waiting for llama runner to start responding")
|
||||
for range ticker.C {
|
||||
if time.Now().After(expiresAt) {
|
||||
return fmt.Errorf("llama.cpp server did not start within alloted time, retrying")
|
||||
return fmt.Errorf("llama runner did not start within alloted time, retrying")
|
||||
}
|
||||
|
||||
// check if the server process has terminated
|
||||
if llm.Cmd.ProcessState != nil && llm.Cmd.ProcessState.Exited() {
|
||||
return fmt.Errorf("llama runner process has terminated")
|
||||
}
|
||||
|
||||
if err := llm.Ping(context.Background()); err == nil {
|
||||
@@ -329,15 +362,12 @@ func waitForServer(llm *llama) error {
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
|
||||
log.Printf("llama runner started in %f seconds", time.Since(start).Seconds())
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *llama) Close() {
|
||||
llm.Cancel()
|
||||
if err := llm.Cmd.Wait(); err != nil {
|
||||
log.Printf("llama.cpp server exited with error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (llm *llama) SetOptions(opts api.Options) {
|
||||
|
@@ -21,7 +21,7 @@ type LLM interface {
|
||||
Ping(context.Context) error
|
||||
}
|
||||
|
||||
func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -91,9 +91,9 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
switch ggml.Name() {
|
||||
case "gguf":
|
||||
opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
|
||||
return newLlama(model, adapters, ggufRunner(), opts)
|
||||
return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
|
||||
case "ggml", "ggmf", "ggjt", "ggla":
|
||||
return newLlama(model, adapters, ggmlRunner(), opts)
|
||||
return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
|
||||
}
|
||||
|
21 scripts/build.sh Normal file
@@ -0,0 +1,21 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -eu
|
||||
|
||||
usage() {
|
||||
echo "usage: $(basename $0) VERSION"
|
||||
exit 1
|
||||
}
|
||||
|
||||
[ "$#" -eq 1 ] || usage
|
||||
|
||||
export VERSION="$1"
|
||||
|
||||
# build universal MacOS binary
|
||||
sh $(dirname $0)/build_darwin.sh
|
||||
|
||||
# # build arm64 and amd64 Linux binaries
|
||||
sh $(dirname $0)/build_linux.sh
|
||||
|
||||
# # build arm64 and amd64 Docker images
|
||||
sh $(dirname $0)/build_docker.sh
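A hedged usage note: per its `usage` message the script expects a single VERSION argument, so a full release build could be invoked roughly as follows from the repository root (the version shown is an arbitrary example):

```shell
sh scripts/build.sh 0.1.0
```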
|
@@ -1,29 +1,30 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
|
||||
set -eu
|
||||
|
||||
export VERSION=${VERSION:-0.0.0}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
|
||||
|
||||
mkdir -p dist
|
||||
|
||||
GO_LDFLAGS="-X github.com/jmorganca/ollama/version.Version=$VERSION"
|
||||
GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
|
||||
for TARGETARCH in arm64 amd64; do
|
||||
GOOS=darwin GOARCH=$TARGETARCH go generate ./...
|
||||
GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
|
||||
done
|
||||
|
||||
# build universal binary
|
||||
GOARCH=arm64 go generate ./...
|
||||
GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
|
||||
rm -rf llm/llama.cpp/*/build/*/bin
|
||||
GOARCH=amd64 go generate ./...
|
||||
GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
|
||||
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
|
||||
rm dist/ollama-darwin-amd64 dist/ollama-darwin-arm64
|
||||
lipo -create -output dist/ollama dist/ollama-darwin-*
|
||||
rm -f dist/ollama-darwin-*
|
||||
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
|
||||
chmod +x dist/ollama
|
||||
|
||||
# build and sign the mac app
|
||||
npm install --prefix app
|
||||
npm run --prefix app make:sign
|
||||
cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-${VERSION:-0.0.0}.zip dist/Ollama-darwin.zip
|
||||
cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
|
||||
|
||||
# sign the binary and rename it
|
||||
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
|
||||
ditto -c -k --keepParent dist/ollama dist/temp.zip
|
||||
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
|
||||
mv dist/ollama dist/ollama-darwin
|
||||
rm dist/temp.zip
|
||||
rm -f dist/temp.zip
|
||||
|
15 scripts/build_docker.sh Normal file
@@ -0,0 +1,15 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -eu
|
||||
|
||||
export VERSION=${VERSION:-0.0.0}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
|
||||
|
||||
docker buildx build \
|
||||
--load \
|
||||
--platform=linux/arm64,linux/amd64 \
|
||||
--build-arg=VERSION \
|
||||
--build-arg=GOFLAGS \
|
||||
-f Dockerfile \
|
||||
-t ollama \
|
||||
.
|
15 scripts/build_linux.sh Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -eu
|
||||
|
||||
export VERSION=${VERSION:-0.0.0}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
|
||||
|
||||
mkdir -p dist
|
||||
|
||||
for TARGETARCH in arm64 amd64; do
|
||||
docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
|
||||
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
|
||||
docker rm builder-$TARGETARCH
|
||||
done
|
243 scripts/install.sh Normal file
@@ -0,0 +1,243 @@
|
||||
#!/bin/sh
|
||||
# This script installs Ollama on Linux.
|
||||
# It detects the current operating system architecture and installs the appropriate version of Ollama.
|
||||
|
||||
set -eu
|
||||
|
||||
status() { echo ">>> $*" >&2; }
|
||||
error() { echo "ERROR $*"; exit 1; }
|
||||
warning() { echo "WARNING: $*"; }
|
||||
|
||||
TEMP_DIR=$(mktemp -d)
|
||||
cleanup() { rm -rf $TEMP_DIR; }
|
||||
trap cleanup EXIT
|
||||
|
||||
available() { command -v $1 >/dev/null; }
|
||||
require() {
|
||||
local MISSING=''
|
||||
for TOOL in $*; do
|
||||
if ! available $TOOL; then
|
||||
MISSING="$MISSING $TOOL"
|
||||
fi
|
||||
done
|
||||
|
||||
echo $MISSING
|
||||
}
|
||||
|
||||
[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
|
||||
|
||||
case "$(uname -m)" in
|
||||
x86_64) ARCH="amd64" ;;
|
||||
aarch64|arm64) ARCH="arm64" ;;
|
||||
*) error "Unsupported architecture: $ARCH" ;;
|
||||
esac
|
||||
|
||||
SUDO=
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
# Running as root, no need for sudo
|
||||
if ! available sudo; then
|
||||
error "This script requires superuser permissions. Please re-run as root."
|
||||
fi
|
||||
|
||||
SUDO="sudo"
|
||||
fi
|
||||
|
||||
NEEDS=$(require curl awk grep sed tee xargs)
|
||||
if [ -n "$NEEDS" ]; then
|
||||
status "ERROR: The following tools are required but missing:"
|
||||
for NEED in $NEEDS; do
|
||||
echo " - $NEED"
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
|
||||
status "Downloading ollama..."
|
||||
curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.ai/download/ollama-linux-$ARCH"
|
||||
|
||||
for BINDIR in /usr/local/bin /usr/bin /bin; do
|
||||
echo $PATH | grep -q $BINDIR && break || continue
|
||||
done
|
||||
|
||||
status "Installing ollama to $BINDIR..."
|
||||
$SUDO install -o0 -g0 -m755 -d $BINDIR
|
||||
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
|
||||
|
||||
install_success() { status 'Install complete. Run "ollama" from the command line.'; }
|
||||
trap install_success EXIT
|
||||
|
||||
# Everything from this point onwards is optional.
|
||||
|
||||
configure_systemd() {
|
||||
if ! id ollama >/dev/null 2>&1; then
|
||||
status "Creating ollama user..."
|
||||
$SUDO useradd -r -s /bin/false -m -d /usr/share/ollama ollama
|
||||
fi
|
||||
|
||||
status "Creating ollama systemd service..."
|
||||
cat <<EOF | $SUDO tee /etc/systemd/system/ollama.service >/dev/null
|
||||
[Unit]
|
||||
Description=Ollama Service
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
ExecStart=$BINDIR/ollama serve
|
||||
User=ollama
|
||||
Group=ollama
|
||||
Restart=always
|
||||
RestartSec=3
|
||||
Environment="HOME=/usr/share/ollama"
|
||||
Environment="PATH=$PATH"
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
EOF
|
||||
SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
|
||||
case $SYSTEMCTL_RUNNING in
|
||||
running|degraded)
|
||||
status "Enabling and starting ollama service..."
|
||||
$SUDO systemctl daemon-reload
|
||||
$SUDO systemctl enable ollama
|
||||
|
||||
start_service() { $SUDO systemctl restart ollama; }
|
||||
trap start_service EXIT
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
if available systemctl; then
|
||||
configure_systemd
|
||||
fi
|
||||
|
||||
if ! available lspci && ! available lshw; then
|
||||
warning "Unable to detect NVIDIA GPU. Install lspci or lshw to automatically detect and install NVIDIA CUDA drivers."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
check_gpu() {
|
||||
case $1 in
|
||||
lspci) available lspci && lspci -d '10de:' | grep -q 'NVIDIA' || return 1 ;;
|
||||
lshw) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
|
||||
nvidia-smi) available nvidia-smi || return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
if check_gpu nvidia-smi; then
|
||||
status "NVIDIA GPU installed."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if ! check_gpu lspci && ! check_gpu lshw; then
|
||||
warning "No NVIDIA GPU detected. Ollama will run in CPU-only mode."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
|
||||
install_cuda_driver_yum() {
|
||||
status 'Installing NVIDIA repository...'
|
||||
case $PACKAGE_MANAGER in
|
||||
yum)
|
||||
$SUDO $PACKAGE_MANAGER -y install yum-utils
|
||||
$SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
|
||||
;;
|
||||
dnf)
|
||||
$SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
|
||||
;;
|
||||
esac
|
||||
|
||||
case $1 in
|
||||
rhel)
|
||||
status 'Installing EPEL repository...'
|
||||
# EPEL is required for third-party dependencies such as dkms and libvdpau
|
||||
$SUDO $PACKAGE_MANAGER -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-$2.noarch.rpm || true
|
||||
;;
|
||||
esac
|
||||
|
||||
status 'Installing CUDA driver...'
|
||||
|
||||
if [ "$1" = 'centos' ] || [ "$1$2" = 'rhel7' ]; then
|
||||
$SUDO $PACKAGE_MANAGER -y install nvidia-driver-latest-dkms
|
||||
fi
|
||||
|
||||
$SUDO $PACKAGE_MANAGER -y install cuda-drivers
|
||||
}
|
||||
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#ubuntu
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
|
||||
install_cuda_driver_apt() {
|
||||
status 'Installing NVIDIA repository...'
|
||||
curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
|
||||
|
||||
case $1 in
|
||||
debian)
|
||||
status 'Enabling contrib sources...'
|
||||
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | sudo tee /etc/apt/sources.list.d/contrib.list > /dev/null
|
||||
;;
|
||||
esac
|
||||
|
||||
status 'Installing CUDA driver...'
|
||||
$SUDO dpkg -i $TEMP_DIR/cuda-keyring.deb
|
||||
$SUDO apt-get update
|
||||
|
||||
[ -n "$SUDO" ] && SUDO_E="$SUDO -E" || SUDO_E=
|
||||
DEBIAN_FRONTEND=noninteractive $SUDO_E apt-get -y install cuda-drivers -q
|
||||
}
|
||||
|
||||
if [ ! -f "/etc/os-release" ]; then
|
||||
error "Unknown distribution. Skipping CUDA installation."
|
||||
fi
|
||||
|
||||
. /etc/os-release
|
||||
|
||||
OS_NAME=$ID
|
||||
OS_VERSION=$VERSION_ID
|
||||
|
||||
PACKAGE_MANAGER=
|
||||
for PACKAGE_MANAGER in dnf yum apt-get; do
|
||||
if available $PACKAGE_MANAGER; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -z "$PACKAGE_MANAGER" ]; then
|
||||
error "Unknown package manager. Skipping CUDA installation."
|
||||
fi
|
||||
|
||||
if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
|
||||
case $OS_NAME in
|
||||
centos|rhel) install_cuda_driver_yum 'rhel' $OS_VERSION ;;
|
||||
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
|
||||
fedora) install_cuda_driver_yum $OS_NAME $OS_VERSION ;;
|
||||
amzn) install_cuda_driver_yum 'fedora' '35' ;;
|
||||
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
|
||||
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
|
||||
*) exit ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
if ! lsmod | grep -q nvidia; then
|
||||
KERNEL_RELEASE="$(uname -r)"
|
||||
case $OS_NAME in
|
||||
centos|rhel|rocky|amzn) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE kernel-headers-$KERNEL_RELEASE ;;
|
||||
fedora) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE ;;
|
||||
debian|ubuntu) $SUDO apt-get -y install linux-headers-$KERNEL_RELEASE ;;
|
||||
*) exit ;;
|
||||
esac
|
||||
|
||||
NVIDIA_CUDA_VERSION=$($SUDO dkms status | awk -F: '/added/ { print $1 }')
|
||||
if [ -n "$NVIDIA_CUDA_VERSION" ]; then
|
||||
$SUDO dkms install $NVIDIA_CUDA_VERSION
|
||||
fi
|
||||
|
||||
if lsmod | grep -q nouveau; then
|
||||
status 'Reboot to complete NVIDIA CUDA driver install.'
|
||||
exit 0
|
||||
fi
|
||||
|
||||
$SUDO modprobe nvidia
|
||||
fi
|
||||
|
||||
|
||||
status "NVIDIA CUDA drivers installed."
|
@@ -14,7 +14,7 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -71,7 +71,7 @@ func (r AuthRedirect) URL() (*url.URL, error) {
|
||||
return redirectURL, nil
|
||||
}
|
||||
|
||||
func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *RegistryOptions) (string, error) {
|
||||
func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
|
||||
redirectURL, err := redirData.URL()
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -82,7 +82,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
|
||||
return "", err
|
||||
}
|
||||
|
||||
keyPath := path.Join(home, ".ollama", "id_ed25519")
|
||||
keyPath := filepath.Join(home, ".ollama", "id_ed25519")
|
||||
|
||||
rawKey, err := os.ReadFile(keyPath)
|
||||
if err != nil {
|
||||
|
@@ -8,7 +8,7 @@ import (
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -46,8 +46,8 @@ func downloadBlob(ctx context.Context, opts downloadOpts) error {
|
||||
// we already have the file, so return
|
||||
opts.fn(api.ProgressResponse{
|
||||
Digest: opts.digest,
|
||||
Total: int(fi.Size()),
|
||||
Completed: int(fi.Size()),
|
||||
Total: fi.Size(),
|
||||
Completed: fi.Size(),
|
||||
})
|
||||
|
||||
return nil
|
||||
@@ -93,8 +93,8 @@ func monitorDownload(ctx context.Context, opts downloadOpts, f *FileDownload) er
|
||||
// successful download while monitoring
|
||||
opts.fn(api.ProgressResponse{
|
||||
Digest: f.Digest,
|
||||
Total: int(fi.Size()),
|
||||
Completed: int(fi.Size()),
|
||||
Total: fi.Size(),
|
||||
Completed: fi.Size(),
|
||||
})
|
||||
return true, false, nil
|
||||
}
|
||||
@@ -109,8 +109,8 @@ func monitorDownload(ctx context.Context, opts downloadOpts, f *FileDownload) er
|
||||
opts.fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("downloading %s", f.Digest),
|
||||
Digest: f.Digest,
|
||||
Total: int(f.Total),
|
||||
Completed: int(f.Completed),
|
||||
Total: f.Total,
|
||||
Completed: f.Completed,
|
||||
})
|
||||
return false, false, nil
|
||||
}()
|
||||
@@ -129,8 +129,8 @@ func monitorDownload(ctx context.Context, opts downloadOpts, f *FileDownload) er
|
||||
}
|
||||
|
||||
var (
|
||||
chunkSize = 1024 * 1024 // 1 MiB in bytes
|
||||
errDownload = fmt.Errorf("download failed")
|
||||
chunkSize int64 = 1024 * 1024 // 1 MiB in bytes
|
||||
errDownload = fmt.Errorf("download failed")
|
||||
)
|
||||
|
||||
// doDownload downloads a blob from the registry and stores it in the blobs directory
|
||||
@@ -147,7 +147,7 @@ func doDownload(ctx context.Context, opts downloadOpts, f *FileDownload) error {
|
||||
default:
|
||||
size = fi.Size()
|
||||
// Ensure the size is divisible by the chunk size by removing excess bytes
|
||||
size -= size % int64(chunkSize)
|
||||
size -= size % chunkSize
|
||||
|
||||
err := os.Truncate(f.FilePath+"-partial", size)
|
||||
if err != nil {
|
||||
@@ -173,7 +173,7 @@ func doDownload(ctx context.Context, opts downloadOpts, f *FileDownload) error {
|
||||
return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
err = os.MkdirAll(path.Dir(f.FilePath), 0o700)
|
||||
err = os.MkdirAll(filepath.Dir(f.FilePath), 0o700)
|
||||
if err != nil {
|
||||
return fmt.Errorf("make blobs directory: %w", err)
|
||||
}
|
||||
@@ -200,8 +200,8 @@ outerLoop:
|
||||
opts.fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("downloading %s", f.Digest),
|
||||
Digest: f.Digest,
|
||||
Total: int(f.Total),
|
||||
Completed: int(f.Completed),
|
||||
Total: f.Total,
|
||||
Completed: f.Completed,
|
||||
})
|
||||
|
||||
if f.Completed >= f.Total {
|
||||
@@ -213,8 +213,8 @@ outerLoop:
|
||||
opts.fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("error renaming file: %v", err),
|
||||
Digest: f.Digest,
|
||||
Total: int(f.Total),
|
||||
Completed: int(f.Completed),
|
||||
Total: f.Total,
|
||||
Completed: f.Completed,
|
||||
})
|
||||
return err
|
||||
}
|
||||
@@ -223,7 +223,7 @@ outerLoop:
|
||||
}
|
||||
}
|
||||
|
||||
n, err := io.CopyN(out, resp.Body, int64(chunkSize))
|
||||
n, err := io.CopyN(out, resp.Body, chunkSize)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
return fmt.Errorf("%w: %w", errDownload, err)
|
||||
}
|
||||
|
139 server/images.go
@@ -14,7 +14,6 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"runtime"
|
||||
@@ -55,6 +54,54 @@ type Model struct {
|
||||
Embeddings []vector.Embedding
|
||||
}
|
||||
|
||||
func (m *Model) ChatPrompt(messages []api.Message) (string, error) {
|
||||
tmpl, err := template.New("").Parse(m.Template)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var vars struct {
|
||||
System string
|
||||
Prompt string
|
||||
First bool
|
||||
}
|
||||
|
||||
vars.First = true
|
||||
|
||||
var sb strings.Builder
|
||||
flush := func() {
|
||||
tmpl.Execute(&sb, vars)
|
||||
vars.System = ""
|
||||
vars.Prompt = ""
|
||||
}
|
||||
|
||||
// build the chat history from messages
|
||||
for _, m := range messages {
|
||||
if m.Role == "system" {
|
||||
if vars.System != "" {
|
||||
flush()
|
||||
}
|
||||
vars.System = m.Content
|
||||
}
|
||||
|
||||
if m.Role == "user" {
|
||||
if vars.Prompt != "" {
|
||||
flush()
|
||||
}
|
||||
vars.Prompt = m.Content
|
||||
}
|
||||
|
||||
if m.Role == "assistant" {
|
||||
flush()
|
||||
sb.Write([]byte(m.Content))
|
||||
}
|
||||
}
|
||||
|
||||
flush()
|
||||
|
||||
return sb.String(), nil
|
||||
}
|
||||
|
||||
func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) {
|
||||
t := m.Template
|
||||
if request.Template != "" {
|
||||
@@ -104,7 +151,7 @@ type ManifestV2 struct {
|
||||
type Layer struct {
|
||||
MediaType string `json:"mediaType"`
|
||||
Digest string `json:"digest"`
|
||||
Size int `json:"size"`
|
||||
Size int64 `json:"size"`
|
||||
From string `json:"from,omitempty"`
|
||||
}
|
||||
|
||||
@@ -130,11 +177,11 @@ type RootFS struct {
|
||||
DiffIDs []string `json:"diff_ids"`
|
||||
}
|
||||
|
||||
func (m *ManifestV2) GetTotalSize() int {
|
||||
var total int
|
||||
func (m *ManifestV2) GetTotalSize() (total int64) {
|
||||
for _, layer := range m.Layers {
|
||||
total += layer.Size
|
||||
}
|
||||
|
||||
total += m.Config.Size
|
||||
return total
|
||||
}
|
||||
@@ -268,7 +315,7 @@ func filenameWithPath(path, f string) (string, error) {
return f, nil
}

func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error {
func CreateModel(ctx context.Context, workDir, name string, path string, fn func(resp api.ProgressResponse)) error {
mp := ParseModelPath(name)

var manifest *ManifestV2

@@ -391,7 +438,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err
}

// copie the model metadata
// copy the model metadata
config.ModelFamily = source.ModelFamily
config.ModelType = source.ModelType
config.ModelFormat = source.ModelFormat

@@ -461,8 +508,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err
}

layer.MediaType = mediaType
layers = append(layers, layer)
if layer.Size > 0 {
layer.MediaType = mediaType
layers = append(layers, layer)
}
case "template", "system", "prompt":
fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)})
// remove the layer if one exists

@@ -474,8 +523,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err
}

layer.MediaType = mediaType
layers = append(layers, layer)
if layer.Size > 0 {
layer.MediaType = mediaType
layers = append(layers, layer)
}
default:
// runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences)
params[c.Name] = append(params[c.Name], c.Args)

@@ -521,7 +572,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
}

// generate the embedding layers
embeddingLayers, err := embeddingLayers(embed)
embeddingLayers, err := embeddingLayers(workDir, embed)
if err != nil {
return err
}

@@ -578,7 +629,7 @@ type EmbeddingParams struct {
}

// embeddingLayers loads the associated LLM and generates the embeddings to be stored from an input file
func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error) {
layers := []*LayerReader{}
if len(e.files) > 0 {
// check if the model is a file path or a model name

@@ -591,7 +642,7 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
model = &Model{ModelPath: e.model}
}

if err := load(context.Background(), model, e.opts, defaultSessionDuration); err != nil {
if err := load(context.Background(), workDir, model, e.opts, defaultSessionDuration); err != nil {
return nil, fmt.Errorf("load model to generate embeddings: %v", err)
}

@@ -646,8 +697,8 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
e.fn(api.ProgressResponse{
Status: fmt.Sprintf("creating embeddings for file %s", filePath),
Digest: fileDigest,
Total: len(data) - 1,
Completed: i,
Total: int64(len(data) - 1),
Completed: int64(i),
})
if len(existing[d]) > 0 {
// already have an embedding for this line

@@ -672,7 +723,7 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
Layer: Layer{
MediaType: "application/vnd.ollama.image.embed",
Digest: digest,
Size: r.Len(),
Size: r.Size(),
},
Reader: r,
}

@@ -1002,6 +1053,39 @@ func PruneLayers() error {
return nil
}

func PruneDirectory(path string) error {
info, err := os.Lstat(path)
if err != nil {
return err
}

if info.IsDir() && info.Mode()&os.ModeSymlink == 0 {
entries, err := os.ReadDir(path)
if err != nil {
return err
}

for _, entry := range entries {
if err := PruneDirectory(filepath.Join(path, entry.Name())); err != nil {
return err
}
}

entries, err = os.ReadDir(path)
if err != nil {
return err
}

if len(entries) > 0 {
return nil
}

return os.Remove(path)
}

return nil
}

func DeleteModel(name string) error {
mp := ParseModelPath(name)
manifest, _, err := GetManifest(mp)
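PruneDirectory, added above, removes a directory tree only when every nested directory turns out to be empty: it recurses into children first, re-reads the directory, and deletes it only if nothing remains. A minimal self-contained sketch that exercises the same idea against a throwaway temp directory; the local pruneDirectory here is a copy for illustration, not an import of the server package.

package main

import (
    "fmt"
    "os"
    "path/filepath"
)

// pruneDirectory mirrors the recursive empty-directory pruning shown in the diff.
func pruneDirectory(path string) error {
    info, err := os.Lstat(path)
    if err != nil {
        return err
    }

    if info.IsDir() && info.Mode()&os.ModeSymlink == 0 {
        entries, err := os.ReadDir(path)
        if err != nil {
            return err
        }

        // prune children first
        for _, entry := range entries {
            if err := pruneDirectory(filepath.Join(path, entry.Name())); err != nil {
                return err
            }
        }

        // re-read: only remove this directory if it is now empty
        entries, err = os.ReadDir(path)
        if err != nil {
            return err
        }
        if len(entries) > 0 {
            return nil
        }
        return os.Remove(path)
    }

    return nil
}

func main() {
    root, err := os.MkdirTemp("", "prune-demo")
    if err != nil {
        panic(err)
    }
    if err := os.MkdirAll(filepath.Join(root, "a", "b", "c"), 0o755); err != nil {
        panic(err)
    }

    if err := pruneDirectory(root); err != nil {
        panic(err)
    }

    _, err = os.Stat(root)
    fmt.Println("root removed:", os.IsNotExist(err)) // true: the empty tree was pruned
}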
@@ -1151,14 +1235,14 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
Total: layer.Size,
})

location, err := startUpload(ctx, mp, layer, regOpts)
location, chunkSize, err := startUpload(ctx, mp, layer, regOpts)
if err != nil {
log.Printf("couldn't start upload: %v", err)
return err
}

if strings.HasPrefix(path.Base(location.Path), "sha256:") {
layer.Digest = path.Base(location.Path)
if strings.HasPrefix(filepath.Base(location.Path), "sha256:") {
layer.Digest = filepath.Base(location.Path)
fn(api.ProgressResponse{
Status: "using existing layer",
Digest: layer.Digest,

@@ -1168,7 +1252,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
continue
}

if err := uploadBlobChunked(ctx, location, layer, regOpts, fn); err != nil {
if err := uploadBlob(ctx, location, layer, chunkSize, regOpts, fn); err != nil {
log.Printf("error uploading blob: %v", err)
return err
}

@@ -1353,14 +1437,14 @@ func createConfigLayer(config ConfigV2, layers []string) (*LayerReader, error) {
}

// GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer
func GetSHA256Digest(r io.Reader) (string, int) {
func GetSHA256Digest(r io.Reader) (string, int64) {
h := sha256.New()
n, err := io.Copy(h, r)
if err != nil {
log.Fatal(err)
}

return fmt.Sprintf("sha256:%x", h.Sum(nil)), int(n)
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
}

// Function to check if a blob already exists in the Docker registry
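GetSHA256Digest now reports the byte count as int64, matching Layer.Size. A small self-contained sketch of the same digest-plus-size computation over an in-memory reader; this is an illustration, not the server package itself.

package main

import (
    "crypto/sha256"
    "fmt"
    "io"
    "strings"
)

// sha256Digest mirrors GetSHA256Digest: hash the reader and report how many bytes were read.
func sha256Digest(r io.Reader) (string, int64, error) {
    h := sha256.New()
    n, err := io.Copy(h, r)
    if err != nil {
        return "", 0, err
    }
    return fmt.Sprintf("sha256:%x", h.Sum(nil)), n, nil
}

func main() {
    digest, size, err := sha256Digest(strings.NewReader("hello world"))
    if err != nil {
        panic(err)
    }
    fmt.Println(digest, size) // sha256:b94d27b9... 11
}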
@@ -1394,7 +1478,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir, regOpts)
token, err := getAuthToken(ctx, authRedir)
if err != nil {
return nil, err
}

@@ -1442,6 +1526,15 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

if s := req.Header.Get("Content-Length"); s != "" {
contentLength, err := strconv.ParseInt(s, 10, 64)
if err != nil {
return nil, err
}

req.ContentLength = contentLength
}

client := &http.Client{
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
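The new block in makeRequest promotes an explicit Content-Length header into http.Request.ContentLength. That matters because net/http writes the Content-Length of an outgoing request from the struct field, not from the header map. A small sketch of that promotion; the URL and payload are dummies for illustration only, nothing is sent.

package main

import (
    "fmt"
    "net/http"
    "strconv"
    "strings"
)

func main() {
    body := strings.NewReader("chunk payload")
    req, err := http.NewRequest(http.MethodPatch, "http://registry.invalid/v2/blobs/uploads/abc", body)
    if err != nil {
        panic(err)
    }

    // simulate a caller that only set the header, not the struct field
    req.Header.Set("Content-Length", strconv.Itoa(body.Len()))
    req.ContentLength = 0

    // the fix from the diff: copy the header value into req.ContentLength
    if s := req.Header.Get("Content-Length"); s != "" {
        n, err := strconv.ParseInt(s, 10, 64)
        if err != nil {
            panic(err)
        }
        req.ContentLength = n
    }

    fmt.Println(req.ContentLength) // 13
}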
@@ -4,9 +4,9 @@ import "testing"

func TestParseModelPath(t *testing.T) {
tests := []struct {
name string
arg string
want ModelPath
name string
arg string
want ModelPath
}{
{
"full path https",

163  server/routes.go
@@ -58,7 +58,7 @@ var loaded struct {
var defaultSessionDuration = 5 * time.Minute

// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
opts := api.DefaultOptions()
if err := opts.FromMap(model.Options); err != nil {
log.Printf("could not load model options: %v", err)

@@ -94,7 +94,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
loaded.Embeddings = model.Embeddings
}

llmModel, err := llm.New(model.ModelPath, model.AdapterPaths, opts)
llmModel, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, opts)
if err != nil {
return err
}

@@ -130,6 +130,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
llmModel.SetOptions(opts)
}
}

loaded.expireAt = time.Now().Add(sessionDuration)

if loaded.expireTimer == nil {

@@ -150,10 +151,59 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
loaded.digest = ""
})
}

loaded.expireTimer.Reset(sessionDuration)
return nil
}

func ChatModelHandler(c *gin.Context) {
loaded.mu.Lock()
defer loaded.mu.Unlock()

var req api.ChatRequest
if err := c.ShouldBindJSON(&req); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}

model, err := GetModel(req.Model)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}

prompt, err := model.ChatPrompt(req.Messages)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}

var response string
fn := func(r api.GenerateResponse) {
response += r.Response
}

workDir := c.GetString("workDir")
if err := load(c.Request.Context(), workDir, model, nil, defaultSessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}

fmt.Println(prompt)

if err := loaded.llm.Predict(c.Request.Context(), []int{}, prompt, fn); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
}

c.JSON(http.StatusOK, api.ChatResponse{
Message: api.Message{
Role: "assistant",
Content: response,
},
CreatedAt: time.Now().UTC(),
})
}

func GenerateHandler(c *gin.Context) {
loaded.mu.Lock()
defer loaded.mu.Unlock()
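ChatModelHandler is wired to POST /api/chat later in this diff: it binds an api.ChatRequest, builds the prompt with ChatPrompt, runs Predict, and returns a single assistant message. A hedged client-side sketch follows; the JSON field names are assumed from the request and response types used in the handler, and localhost:11434 is assumed as the usual default server address.

package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

type chatMessage struct {
    Role    string `json:"role"`
    Content string `json:"content"`
}

type chatRequest struct {
    Model    string        `json:"model"`
    Messages []chatMessage `json:"messages"`
}

type chatResponse struct {
    Message chatMessage `json:"message"`
}

func main() {
    body, _ := json.Marshal(chatRequest{
        Model: "llama2",
        Messages: []chatMessage{
            {Role: "system", Content: "You are concise."},
            {Role: "user", Content: "Why is the sky blue?"},
        },
    })

    resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    var out chatResponse
    if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
        panic(err)
    }
    fmt.Println(out.Message.Role + ": " + out.Message.Content)
}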
@@ -172,8 +222,11 @@ func GenerateHandler(c *gin.Context) {
return
}

sessionDuration := defaultSessionDuration // TODO: set this duration from the request if specified
if err := load(c.Request.Context(), model, req.Options, sessionDuration); err != nil {
workDir := c.GetString("workDir")

// TODO: set this duration from the request if specified
sessionDuration := defaultSessionDuration
if err := load(c.Request.Context(), workDir, model, req.Options, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}

@@ -218,8 +271,13 @@ func GenerateHandler(c *gin.Context) {
ch <- r
}

if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
ch <- gin.H{"error": err.Error()}
// an empty request loads the model
if req.Prompt == "" && req.Template == "" && req.System == "" {
ch <- api.GenerateResponse{Model: req.Model, Done: true}
} else {
if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}
}()
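With the change above, a generate request whose prompt, template, and system fields are all empty simply loads the model and replies with a single done response, which gives clients a cheap way to warm a model up. A hedged sketch of such a preload call; the field names follow the handler above and the address is assumed to be the usual local default.

package main

import (
    "fmt"
    "io"
    "net/http"
    "strings"
)

func main() {
    // an empty prompt: the server loads the model and replies with a single "done" message
    resp, err := http.Post(
        "http://localhost:11434/api/generate",
        "application/json",
        strings.NewReader(`{"model": "llama2"}`),
    )
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    out, _ := io.ReadAll(resp.Body)
    fmt.Println(string(out)) // e.g. {"model":"llama2","done":true,...}
}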
@@ -241,7 +299,9 @@ func EmbeddingHandler(c *gin.Context) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if err := load(c.Request.Context(), model, req.Options, 5*time.Minute); err != nil {

workDir := c.GetString("workDir")
if err := load(c.Request.Context(), workDir, model, req.Options, 5*time.Minute); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}

@@ -280,8 +340,6 @@ func PullModelHandler(c *gin.Context) {

regOpts := &RegistryOptions{
Insecure: req.Insecure,
Username: req.Username,
Password: req.Password,
}

ctx, cancel := context.WithCancel(c.Request.Context())

@@ -311,8 +369,6 @@ func PushModelHandler(c *gin.Context) {

regOpts := &RegistryOptions{
Insecure: req.Insecure,
Username: req.Username,
Password: req.Password,
}

ctx := context.Background()

@@ -331,6 +387,8 @@ func CreateModelHandler(c *gin.Context) {
return
}

workDir := c.GetString("workDir")

ch := make(chan any)
go func() {
defer close(ch)

@@ -341,7 +399,7 @@ func CreateModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()

if err := CreateModel(ctx, req.Name, req.Path, fn); err != nil {
if err := CreateModel(ctx, workDir, req.Name, req.Path, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()

@@ -364,6 +422,18 @@ func DeleteModelHandler(c *gin.Context) {
}
return
}

manifestsPath, err := GetManifestPath()
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}

if err := PruneDirectory(manifestsPath); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}

c.JSON(http.StatusOK, nil)
}

@@ -495,44 +565,59 @@ func CopyModelHandler(c *gin.Context) {
}
}

func Serve(ln net.Listener, origins []string) error {
var defaultAllowOrigins = []string{
"localhost",
"127.0.0.1",
"0.0.0.0",
}

func Serve(ln net.Listener, allowOrigins []string) error {
config := cors.DefaultConfig()
config.AllowWildcard = true
config.AllowOrigins = append(origins, []string{
"http://localhost",
"http://localhost:*",
"https://localhost",
"https://localhost:*",
"http://127.0.0.1",
"http://127.0.0.1:*",
"https://127.0.0.1",
"https://127.0.0.1:*",
"http://0.0.0.0",
"http://0.0.0.0:*",
"https://0.0.0.0",
"https://0.0.0.0:*",
}...)

config.AllowOrigins = allowOrigins
for _, allowOrigin := range defaultAllowOrigins {
config.AllowOrigins = append(config.AllowOrigins,
fmt.Sprintf("http://%s", allowOrigin),
fmt.Sprintf("https://%s", allowOrigin),
fmt.Sprintf("http://%s:*", allowOrigin),
fmt.Sprintf("https://%s:*", allowOrigin),
)
}

workDir, err := os.MkdirTemp("", "ollama")
if err != nil {
return err
}
defer os.RemoveAll(workDir)

r := gin.Default()
r.Use(cors.New(config))

r.GET("/", func(c *gin.Context) {
c.String(http.StatusOK, "Ollama is running")
})
r.HEAD("/", func(c *gin.Context) {
c.Status(http.StatusOK)
})
r.Use(
cors.New(config),
func(c *gin.Context) {
c.Set("workDir", workDir)
c.Next()
},
)

r.POST("/api/chat", ChatModelHandler)
r.POST("/api/pull", PullModelHandler)
r.POST("/api/generate", GenerateHandler)
r.POST("/api/embeddings", EmbeddingHandler)
r.POST("/api/create", CreateModelHandler)
r.POST("/api/push", PushModelHandler)
r.POST("/api/copy", CopyModelHandler)
r.GET("/api/tags", ListModelsHandler)
r.DELETE("/api/delete", DeleteModelHandler)
r.POST("/api/show", ShowModelHandler)

for _, method := range []string{http.MethodGet, http.MethodHead} {
r.Handle(method, "/", func(c *gin.Context) {
c.String(http.StatusOK, "Ollama is running")
})

r.Handle(method, "/api/tags", ListModelsHandler)
}

log.Printf("Listening on %s", ln.Addr())
s := &http.Server{
Handler: r,
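The Serve change above seeds config.AllowOrigins with the caller-supplied list and then expands a small set of default hosts into http and https origins, with and without a port wildcard. A self-contained sketch that prints the expanded list; the host slice mirrors defaultAllowOrigins from the diff and the extra origin is made up.

package main

import "fmt"

func main() {
    defaultAllowOrigins := []string{"localhost", "127.0.0.1", "0.0.0.0"}

    // caller-supplied origins passed to Serve (example value)
    allowOrigins := []string{"https://example.com"}
    for _, allowOrigin := range defaultAllowOrigins {
        allowOrigins = append(allowOrigins,
            fmt.Sprintf("http://%s", allowOrigin),
            fmt.Sprintf("https://%s", allowOrigin),
            fmt.Sprintf("http://%s:*", allowOrigin),
            fmt.Sprintf("https://%s:*", allowOrigin),
        )
    }

    for _, o := range allowOrigins {
        fmt.Println(o)
    }
}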
@@ -540,19 +625,20 @@ func Serve(ln net.Listener, origins []string) error {

// listen for a ctrl+c and stop any loaded llm
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-signals
if loaded.llm != nil {
loaded.llm.Close()
}
os.RemoveAll(workDir)
os.Exit(0)
}()

if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err)
log.Printf("Warning: GPU support may not be enabled, check that you have installed GPU drivers: %v", err)
}
}

@@ -573,6 +659,7 @@ func streamResponse(c *gin.Context, ch chan any) {
return false
}

// Delineate chunks with new-line delimiter
bts = append(bts, '\n')
if _, err := w.Write(bts); err != nil {
log.Printf("streamResponse: w.Write failed with %s", err)

224  server/upload.go
@@ -14,7 +14,12 @@ import (
"github.com/jmorganca/ollama/api"
)

func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, error) {
const (
redirectChunkSize int64 = 1024 * 1024 * 1024
regularChunkSize int64 = 95 * 1024 * 1024
)

func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, int64, error) {
requestURL := mp.BaseURL()
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/")
if layer.From != "" {

@@ -27,20 +32,26 @@ func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *Regis
resp, err := makeRequestWithRetry(ctx, "POST", requestURL, nil, nil, regOpts)
if err != nil {
log.Printf("couldn't start upload: %v", err)
return nil, err
return nil, 0, err
}
defer resp.Body.Close()

// Extract UUID location from header
location := resp.Header.Get("Location")
location := resp.Header.Get("Docker-Upload-Location")
chunkSize := redirectChunkSize
if location == "" {
return nil, fmt.Errorf("location header is missing in response")
location = resp.Header.Get("Location")
chunkSize = regularChunkSize
}

return url.Parse(location)
locationURL, err := url.Parse(location)
if err != nil {
return nil, 0, err
}

return locationURL, chunkSize, nil
}

func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSize int64, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
// TODO allow resumability
// TODO allow canceling uploads via DELETE
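startUpload now derives the chunk size from the response headers: a Docker-Upload-Location header indicates a redirected upload and allows 1 GiB chunks, otherwise the regular Location header is used with 95 MiB chunks. A small sketch of just that selection logic; the header value is fabricated for illustration.

package main

import (
    "fmt"
    "net/http"
)

const (
    redirectChunkSize int64 = 1024 * 1024 * 1024 // 1 GiB when the registry redirects uploads
    regularChunkSize  int64 = 95 * 1024 * 1024   // 95 MiB otherwise
)

// chooseUpload mirrors the header inspection in startUpload.
func chooseUpload(h http.Header) (location string, chunkSize int64) {
    location = h.Get("Docker-Upload-Location")
    chunkSize = redirectChunkSize
    if location == "" {
        location = h.Get("Location")
        chunkSize = regularChunkSize
    }
    return location, chunkSize
}

func main() {
    h := http.Header{}
    h.Set("Location", "https://registry.example/v2/demo/blobs/uploads/123") // fabricated
    loc, size := chooseUpload(h)
    fmt.Println(loc, size) // regular path: 99614720-byte chunks
}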
@@ -55,96 +66,40 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
}
defer f.Close()

// 95MB chunk size
chunkSize := 95 * 1024 * 1024
pw := ProgressWriter{
status: fmt.Sprintf("uploading %s", layer.Digest),
digest: layer.Digest,
total: layer.Size,
fn: fn,
}

for offset := int64(0); offset < int64(layer.Size); {
chunk := int64(layer.Size) - offset
if chunk > int64(chunkSize) {
chunk = int64(chunkSize)
for offset := int64(0); offset < layer.Size; {
chunk := layer.Size - offset
if chunk > chunkSize {
chunk = chunkSize
}

sectionReader := io.NewSectionReader(f, int64(offset), chunk)
for try := 0; try < MaxRetries; try++ {
ch := make(chan error, 1)
resp, err := uploadBlobChunk(ctx, http.MethodPatch, requestURL, f, offset, chunk, regOpts, &pw)
if err != nil {
fn(api.ProgressResponse{
Status: fmt.Sprintf("error uploading chunk: %v", err),
Digest: layer.Digest,
Total: layer.Size,
Completed: offset,
})

r, w := io.Pipe()
defer r.Close()
go func() {
defer w.Close()
return err
}

for chunked := int64(0); chunked < chunk; {
select {
case err := <-ch:
log.Printf("chunk interrupted: %v", err)
return
default:
n, err := io.CopyN(w, sectionReader, 1024*1024)
if err != nil && !errors.Is(err, io.EOF) {
fn(api.ProgressResponse{
Status: fmt.Sprintf("error reading chunk: %v", err),
Digest: layer.Digest,
Total: layer.Size,
Completed: int(offset),
})
offset += chunk
location := resp.Header.Get("Docker-Upload-Location")
if location == "" {
location = resp.Header.Get("Location")
}

return
}

chunked += n
fn(api.ProgressResponse{
Status: fmt.Sprintf("uploading %s", layer.Digest),
Digest: layer.Digest,
Total: layer.Size,
Completed: int(offset) + int(chunked),
})
}
}
}()

headers := make(http.Header)
headers.Set("Content-Type", "application/octet-stream")
headers.Set("Content-Length", strconv.Itoa(int(chunk)))
headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
resp, err := makeRequest(ctx, "PATCH", requestURL, headers, r, regOpts)
if err != nil && !errors.Is(err, io.EOF) {
fn(api.ProgressResponse{
Status: fmt.Sprintf("error uploading chunk: %v", err),
Digest: layer.Digest,
Total: layer.Size,
Completed: int(offset),
})

return err
}
defer resp.Body.Close()

switch {
case resp.StatusCode == http.StatusUnauthorized:
ch <- errors.New("unauthorized")

auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir, regOpts)
if err != nil {
return err
}

regOpts.Token = token
sectionReader = io.NewSectionReader(f, int64(offset), chunk)
continue
case resp.StatusCode >= http.StatusBadRequest:
body, _ := io.ReadAll(resp.Body)
return fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
}

offset += sectionReader.Size()
requestURL, err = url.Parse(resp.Header.Get("Location"))
if err != nil {
return err
}

break
requestURL, err = url.Parse(location)
if err != nil {
return err
}
}
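The rewritten loop above walks the blob in fixed-size chunks, handing each bounded slice to uploadBlobChunk and advancing the offset by the chunk it just sent. A self-contained sketch of the same offset arithmetic over an in-memory buffer; the 10-byte chunk size is only for demonstration.

package main

import (
    "bytes"
    "fmt"
    "io"
    "strings"
)

func main() {
    blob := strings.NewReader("0123456789abcdefghijklmnopqrstuvwxyz") // stands in for the layer file
    size := blob.Size()
    const chunkSize int64 = 10

    for offset := int64(0); offset < size; {
        chunk := size - offset
        if chunk > chunkSize {
            chunk = chunkSize
        }

        // each chunk gets its own bounded reader, like the PATCH body in uploadBlob
        section := io.NewSectionReader(blob, offset, chunk)
        var buf bytes.Buffer
        if _, err := io.Copy(&buf, section); err != nil {
            panic(err)
        }
        fmt.Printf("range %d-%d: %q\n", offset, offset+chunk-1, buf.String())

        offset += chunk
    }
}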
@@ -170,3 +125,90 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
}
return nil
}

func uploadBlobChunk(ctx context.Context, method string, requestURL *url.URL, r io.ReaderAt, offset, limit int64, opts *RegistryOptions, pw *ProgressWriter) (*http.Response, error) {
sectionReader := io.NewSectionReader(r, offset, limit)

headers := make(http.Header)
headers.Set("Content-Type", "application/octet-stream")
headers.Set("Content-Length", strconv.Itoa(int(limit)))
headers.Set("X-Redirect-Uploads", "1")

if method == http.MethodPatch {
headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
}

for try := 0; try < MaxRetries; try++ {
resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sectionReader, pw), opts)
if err != nil && !errors.Is(err, io.EOF) {
return nil, err
}
defer resp.Body.Close()

switch {
case resp.StatusCode == http.StatusTemporaryRedirect:
location, err := resp.Location()
if err != nil {
return nil, err
}

pw.completed = offset
if _, err := uploadBlobChunk(ctx, http.MethodPut, location, r, offset, limit, nil, pw); err != nil {
// retry
log.Printf("retrying redirected upload: %v", err)
continue
}

return resp, nil
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir)
if err != nil {
return nil, err
}

opts.Token = token

pw.completed = offset
sectionReader = io.NewSectionReader(r, offset, limit)
continue
case resp.StatusCode >= http.StatusBadRequest:
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
}

return resp, nil
}

return nil, fmt.Errorf("max retries exceeded")
}

type ProgressWriter struct {
status string
digest string
bucket int64
completed int64
total int64
fn func(api.ProgressResponse)
}

func (pw *ProgressWriter) Write(b []byte) (int, error) {
n := len(b)
pw.bucket += int64(n)

// throttle status updates to not spam the client
if pw.bucket >= 1024*1024 || pw.completed+pw.bucket >= pw.total {
pw.completed += pw.bucket
pw.fn(api.ProgressResponse{
Status: pw.status,
Digest: pw.digest,
Total: pw.total,
Completed: pw.completed,
})

pw.bucket = 0
}

return n, nil
}
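ProgressWriter is fed through io.TeeReader in uploadBlobChunk, so every byte that leaves the section reader also bumps the progress counters, and the callback fires at most once per mebibyte plus once at the end. A self-contained sketch of the same throttled-progress idea; the callback signature and payload are illustrative, while the 1 MiB threshold matches the diff.

package main

import (
    "bytes"
    "fmt"
    "io"
)

// progressWriter is a pared-down copy of the ProgressWriter pattern above.
type progressWriter struct {
    bucket, completed, total int64
    fn                       func(completed, total int64)
}

func (pw *progressWriter) Write(b []byte) (int, error) {
    n := len(b)
    pw.bucket += int64(n)

    // report at most once per MiB, and always on the final bytes
    if pw.bucket >= 1024*1024 || pw.completed+pw.bucket >= pw.total {
        pw.completed += pw.bucket
        pw.fn(pw.completed, pw.total)
        pw.bucket = 0
    }
    return n, nil
}

func main() {
    payload := bytes.Repeat([]byte("x"), 3*1024*1024+512) // ~3 MiB of fake upload data
    pw := &progressWriter{
        total: int64(len(payload)),
        fn: func(completed, total int64) {
            fmt.Printf("uploaded %d/%d bytes\n", completed, total)
        },
    }

    // mirror uploadBlobChunk: the tee counts bytes as the "upload" consumes them
    tee := io.TeeReader(bytes.NewReader(payload), pw)
    if _, err := io.Copy(io.Discard, tee); err != nil {
        panic(err)
    }
}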