Compare commits
105 Commits
format-con
...
v0.1.0
Author | SHA1 | Date | |
---|---|---|---|
![]() |
5306b0269d | ||
![]() |
7de0c8345d | ||
![]() |
1b9dcab3ab | ||
![]() |
86279f4ae3 | ||
![]() |
b934bf23e6 | ||
![]() |
2b8ef455ad | ||
![]() |
0c5f47177c | ||
![]() |
1210db2924 | ||
![]() |
d0854bf1e6 | ||
![]() |
8396463255 | ||
![]() |
a027bbf4d7 | ||
![]() |
ed94a3dd02 | ||
![]() |
f14f62ab3b | ||
![]() |
0fb5268496 | ||
![]() |
c65edb1506 | ||
![]() |
1605af32ec | ||
![]() |
ee3032ad89 | ||
![]() |
5b7a27281d | ||
![]() |
d2a784e33e | ||
![]() |
413a2e4f91 | ||
![]() |
b5614f3ebc | ||
![]() |
8b2ba9cab8 | ||
![]() |
e29662ab5c | ||
![]() |
cbc40aa996 | ||
![]() |
5cb82540c9 | ||
![]() |
d7849a1dc9 | ||
![]() |
01c44d687e | ||
![]() |
9b12a511ca | ||
![]() |
e20362e0d5 | ||
![]() |
c928ceb927 | ||
![]() |
e1a0846483 | ||
![]() |
f997e29e45 | ||
![]() |
87d9efb364 | ||
![]() |
93d3a2568d | ||
![]() |
5a81390b24 | ||
![]() |
a89ef99aed | ||
![]() |
dc0c725ceb | ||
![]() |
5d71bda478 | ||
![]() |
88897a90e4 | ||
![]() |
9df31c3518 | ||
![]() |
2044f9d4da | ||
![]() |
0d186f3b33 | ||
![]() |
82f5b66c01 | ||
![]() |
c986694367 | ||
![]() |
058d0cd04b | ||
![]() |
ee1c994d15 | ||
![]() |
4cba75efc5 | ||
![]() |
8c83701e9f | ||
![]() |
6137b12799 | ||
![]() |
1fabba474b | ||
![]() |
765770efdb | ||
![]() |
9297ff8330 | ||
![]() |
ee4fd16f2c | ||
![]() |
a9ed7cc6aa | ||
![]() |
6c6a31a1e8 | ||
![]() |
fc6ec356fc | ||
![]() |
1255bc9b45 | ||
![]() |
084e4c782a | ||
![]() |
58ffa03d8b | ||
![]() |
637f8bc6a5 | ||
![]() |
499e9007a5 | ||
![]() |
b9bb5ca288 | ||
![]() |
4e8be787c7 | ||
![]() |
aa45d7c1df | ||
![]() |
e35565c567 | ||
![]() |
a5520bfb42 | ||
![]() |
2627c464ba | ||
![]() |
b58d5d16b0 | ||
![]() |
24580df958 | ||
![]() |
80dd44e80a | ||
![]() |
94e1d96b29 | ||
![]() |
66003e1d05 | ||
![]() |
c345053a8b | ||
![]() |
08d7c2a944 | ||
![]() |
bc9573dcb1 | ||
![]() |
e53bc57d4d | ||
![]() |
f0b398d17f | ||
![]() |
8efbc5df55 | ||
![]() |
ccc3e9ac6d | ||
![]() |
daa4f096f9 | ||
![]() |
3ee85f1c6c | ||
![]() |
2540c9181c | ||
![]() |
83ffb154bc | ||
![]() |
9aa192c812 | ||
![]() |
fc8707686f | ||
![]() |
f89c23764b | ||
![]() |
e6881cabd0 | ||
![]() |
d028853879 | ||
![]() |
949553db23 | ||
![]() |
0c5a454361 | ||
![]() |
f59c4d03f7 | ||
![]() |
7dee25a07f | ||
![]() |
f221637053 | ||
![]() |
45ac07cd02 | ||
![]() |
7d749cc787 | ||
![]() |
e7e91cd71c | ||
![]() |
3920e15386 | ||
![]() |
41e976edde | ||
![]() |
de227b620f | ||
![]() |
63def6ca49 | ||
![]() |
738fe9c4aa | ||
![]() |
a8da0bacbe | ||
![]() |
bf146fb072 | ||
![]() |
f0f4943577 | ||
![]() |
09dd2aeff9 |
@@ -1,4 +1,8 @@
|
||||
.vscode
|
||||
ollama
|
||||
app
|
||||
dist
|
||||
scripts
|
||||
llm/llama.cpp/ggml
|
||||
llm/llama.cpp/gguf
|
||||
.env
|
||||
|
12
.gitmodules
vendored
12
.gitmodules
vendored
@@ -1,4 +1,10 @@
|
||||
[submodule "llm/llama.cpp/ggml"]
|
||||
path = llm/llama.cpp/ggml
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
ignore = dirty
|
||||
path = llm/llama.cpp/ggml
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
ignore = dirty
|
||||
shallow = true
|
||||
[submodule "llm/llama.cpp/gguf"]
|
||||
path = llm/llama.cpp/gguf
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
ignore = dirty
|
||||
shallow = true
|
||||
|
22
Dockerfile
22
Dockerfile
@@ -1,18 +1,28 @@
|
||||
FROM golang:alpine
|
||||
ARG CUDA_VERSION=12.2.0
|
||||
|
||||
FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04
|
||||
|
||||
ARG TARGETARCH
|
||||
ARG VERSION=0.0.0
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
RUN apk add --no-cache git build-base cmake
|
||||
RUN apt-get update && apt-get install -y git build-essential cmake
|
||||
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
|
||||
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
|
||||
|
||||
COPY . .
|
||||
RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .
|
||||
ENV GOARCH=$TARGETARCH
|
||||
RUN /usr/local/go/bin/go generate ./... \
|
||||
&& /usr/local/go/bin/go build -ldflags "-linkmode=external -extldflags='-static' -X=github.com/jmorganca/ollama/version.Version=$VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
|
||||
|
||||
FROM alpine
|
||||
FROM ubuntu:22.04
|
||||
ENV OLLAMA_HOST 0.0.0.0
|
||||
RUN apk add --no-cache libstdc++
|
||||
|
||||
RUN apt-get update && apt-get install -y ca-certificates
|
||||
|
||||
ARG USER=ollama
|
||||
ARG GROUP=ollama
|
||||
RUN addgroup $GROUP && adduser -D -G $GROUP $USER
|
||||
RUN groupadd $GROUP && useradd -m -g $GROUP $USER
|
||||
|
||||
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
|
||||
|
||||
|
29
Dockerfile.build
Normal file
29
Dockerfile.build
Normal file
@@ -0,0 +1,29 @@
|
||||
ARG VERSION=0.0.0
|
||||
|
||||
# centos7 amd64 dependencies
|
||||
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-devel-centos7 AS base-amd64
|
||||
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
|
||||
yum update -y && \
|
||||
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
|
||||
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
|
||||
# centos8 arm64 dependencies
|
||||
FROM --platform=linux/arm64 nvidia/cuda:11.4.3-devel-centos8 AS base-arm64
|
||||
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
|
||||
RUN yum install -y git cmake
|
||||
|
||||
FROM base-${TARGETARCH}
|
||||
ARG TARGETARCH
|
||||
|
||||
# install go
|
||||
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
|
||||
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
|
||||
|
||||
# build the final binary
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
ENV GOARCH=$TARGETARCH
|
||||
|
||||
RUN /usr/local/go/bin/go generate ./... && \
|
||||
/usr/local/go/bin/go build -ldflags "-X=github.com/jmorganca/ollama/version.Version=$VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
|
21
README.md
21
README.md
@@ -206,10 +206,17 @@ curl -X POST http://localhost:11434/api/generate -d '{
|
||||
|
||||
## Community Projects using Ollama
|
||||
|
||||
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with a question-answering [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa).
|
||||
- [Continue](https://github.com/continuedev/continue) - embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline.
|
||||
- [LiteLLM](https://github.com/BerriAI/litellm) a lightweight python package to simplify LLM API calls
|
||||
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) - interact with Ollama as a chatbot on Discord.
|
||||
- [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) - Raycast extension to use Ollama for local llama inference on Raycast.
|
||||
- [Simple HTML UI for Ollama](https://github.com/rtcfirefly/ollama-ui)
|
||||
- [Emacs client](https://github.com/zweifisch/ollama) for Ollama
|
||||
| Project | Description |
|
||||
| -------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| [LangChain][1] and [LangChain.js][2] | Also, there is a question-answering [example][3]. |
|
||||
| [Continue](https://github.com/continuedev/continue) | Embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline. |
|
||||
| [LiteLLM](https://github.com/BerriAI/litellm) | Lightweight Python package to simplify LLM API calls. |
|
||||
| [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) | Interact with Ollama as a chatbot on Discord. |
|
||||
| [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) | Raycast extension to use Ollama for local llama inference on Raycast. |
|
||||
| [Simple HTML UI](https://github.com/rtcfirefly/ollama-ui) | Also, there is a Chrome extension. |
|
||||
| [Ollama-GUI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file) | 🖥️ Mac Chat Interface ⚡️ |
|
||||
| [Emacs client](https://github.com/zweifisch/ollama) | |
|
||||
|
||||
[1]: https://python.langchain.com/docs/integrations/llms/ollama
|
||||
[2]: https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama
|
||||
[3]: https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa
|
||||
|
225
api/client.py
Normal file
225
api/client.py
Normal file
@@ -0,0 +1,225 @@
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
|
||||
BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
|
||||
|
||||
# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
|
||||
# The final response object will include statistics and additional data from the request. Use the callback function to override
|
||||
# the default handler.
|
||||
def generate(model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
|
||||
try:
|
||||
url = f"{BASE_URL}/api/generate"
|
||||
payload = {
|
||||
"model": model_name,
|
||||
"prompt": prompt,
|
||||
"system": system,
|
||||
"template": template,
|
||||
"context": context,
|
||||
"options": options
|
||||
}
|
||||
|
||||
# Remove keys with None values
|
||||
payload = {k: v for k, v in payload.items() if v is not None}
|
||||
|
||||
with requests.post(url, json=payload, stream=True) as response:
|
||||
response.raise_for_status()
|
||||
|
||||
# Creating a variable to hold the context history of the final chunk
|
||||
final_context = None
|
||||
|
||||
# Variable to hold concatenated response strings if no callback is provided
|
||||
full_response = ""
|
||||
|
||||
# Iterating over the response line by line and displaying the details
|
||||
for line in response.iter_lines():
|
||||
if line:
|
||||
# Parsing each line (JSON chunk) and extracting the details
|
||||
chunk = json.loads(line)
|
||||
|
||||
# If a callback function is provided, call it with the chunk
|
||||
if callback:
|
||||
callback(chunk)
|
||||
else:
|
||||
# If this is not the last chunk, add the "response" field value to full_response and print it
|
||||
if not chunk.get("done"):
|
||||
response_piece = chunk.get("response", "")
|
||||
full_response += response_piece
|
||||
print(response_piece, end="", flush=True)
|
||||
|
||||
# Check if it's the last chunk (done is true)
|
||||
if chunk.get("done"):
|
||||
final_context = chunk.get("context")
|
||||
|
||||
# Return the full response and the final context
|
||||
return full_response, final_context
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
return None, None
|
||||
|
||||
# Create a model from a Modelfile. Use the callback function to override the default handler.
|
||||
def create(model_name, model_path, callback=None):
|
||||
try:
|
||||
url = f"{BASE_URL}/api/create"
|
||||
payload = {"name": model_name, "path": model_path}
|
||||
|
||||
# Making a POST request with the stream parameter set to True to handle streaming responses
|
||||
with requests.post(url, json=payload, stream=True) as response:
|
||||
response.raise_for_status()
|
||||
|
||||
# Iterating over the response line by line and displaying the status
|
||||
for line in response.iter_lines():
|
||||
if line:
|
||||
# Parsing each line (JSON chunk) and extracting the status
|
||||
chunk = json.loads(line)
|
||||
|
||||
if callback:
|
||||
callback(chunk)
|
||||
else:
|
||||
print(f"Status: {chunk.get('status')}")
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
# Pull a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple
|
||||
# calls to will share the same download progress. Use the callback function to override the default handler.
|
||||
def pull(model_name, insecure=False, callback=None):
|
||||
try:
|
||||
url = f"{BASE_URL}/api/pull"
|
||||
payload = {
|
||||
"name": model_name,
|
||||
"insecure": insecure
|
||||
}
|
||||
|
||||
# Making a POST request with the stream parameter set to True to handle streaming responses
|
||||
with requests.post(url, json=payload, stream=True) as response:
|
||||
response.raise_for_status()
|
||||
|
||||
# Iterating over the response line by line and displaying the details
|
||||
for line in response.iter_lines():
|
||||
if line:
|
||||
# Parsing each line (JSON chunk) and extracting the details
|
||||
chunk = json.loads(line)
|
||||
|
||||
# If a callback function is provided, call it with the chunk
|
||||
if callback:
|
||||
callback(chunk)
|
||||
else:
|
||||
# Print the status message directly to the console
|
||||
print(chunk.get('status', ''), end='', flush=True)
|
||||
|
||||
# If there's layer data, you might also want to print that (adjust as necessary)
|
||||
if 'digest' in chunk:
|
||||
print(f" - Digest: {chunk['digest']}", end='', flush=True)
|
||||
print(f" - Total: {chunk['total']}", end='', flush=True)
|
||||
print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
|
||||
else:
|
||||
print()
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
# Push a model to the model registry. Use the callback function to override the default handler.
|
||||
def push(model_name, insecure=False, callback=None):
|
||||
try:
|
||||
url = f"{BASE_URL}/api/push"
|
||||
payload = {
|
||||
"name": model_name,
|
||||
"insecure": insecure
|
||||
}
|
||||
|
||||
# Making a POST request with the stream parameter set to True to handle streaming responses
|
||||
with requests.post(url, json=payload, stream=True) as response:
|
||||
response.raise_for_status()
|
||||
|
||||
# Iterating over the response line by line and displaying the details
|
||||
for line in response.iter_lines():
|
||||
if line:
|
||||
# Parsing each line (JSON chunk) and extracting the details
|
||||
chunk = json.loads(line)
|
||||
|
||||
# If a callback function is provided, call it with the chunk
|
||||
if callback:
|
||||
callback(chunk)
|
||||
else:
|
||||
# Print the status message directly to the console
|
||||
print(chunk.get('status', ''), end='', flush=True)
|
||||
|
||||
# If there's layer data, you might also want to print that (adjust as necessary)
|
||||
if 'digest' in chunk:
|
||||
print(f" - Digest: {chunk['digest']}", end='', flush=True)
|
||||
print(f" - Total: {chunk['total']}", end='', flush=True)
|
||||
print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
|
||||
else:
|
||||
print()
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
# List models that are available locally.
|
||||
def list():
|
||||
try:
|
||||
response = requests.get(f"{BASE_URL}/api/tags")
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
models = data.get('models', [])
|
||||
return models
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
return None
|
||||
|
||||
# Copy a model. Creates a model with another name from an existing model.
|
||||
def copy(source, destination):
|
||||
try:
|
||||
# Create the JSON payload
|
||||
payload = {
|
||||
"source": source,
|
||||
"destination": destination
|
||||
}
|
||||
|
||||
response = requests.post(f"{BASE_URL}/api/copy", json=payload)
|
||||
response.raise_for_status()
|
||||
|
||||
# If the request was successful, return a message indicating that the copy was successful
|
||||
return "Copy successful"
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
return None
|
||||
|
||||
# Delete a model and its data.
|
||||
def delete(model_name):
|
||||
try:
|
||||
url = f"{BASE_URL}/api/delete"
|
||||
payload = {"name": model_name}
|
||||
response = requests.delete(url, json=payload)
|
||||
response.raise_for_status()
|
||||
return "Delete successful"
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
return None
|
||||
|
||||
# Show info about a model.
|
||||
def show(model_name):
|
||||
try:
|
||||
url = f"{BASE_URL}/api/show"
|
||||
payload = {"name": model_name}
|
||||
response = requests.post(url, json=payload)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse the JSON response and return it
|
||||
data = response.json()
|
||||
return data
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
return None
|
||||
|
||||
def heartbeat():
|
||||
try:
|
||||
url = f"{BASE_URL}/"
|
||||
response = requests.head(url)
|
||||
response.raise_for_status()
|
||||
return "Ollama is running"
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An error occurred: {e}")
|
||||
return "Ollama is not running"
|
||||
|
||||
|
@@ -291,7 +291,7 @@ func DefaultOptions() Options {
|
||||
NumCtx: 2048,
|
||||
NumKeep: -1,
|
||||
NumBatch: 512,
|
||||
NumGPU: 1,
|
||||
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
|
||||
NumGQA: 1,
|
||||
LowVRAM: false,
|
||||
F16KV: true,
|
||||
|
316
cmd/cmd.go
316
cmd/cmd.go
@@ -11,20 +11,19 @@ import (
|
||||
"io"
|
||||
"log"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/chzyer/readline"
|
||||
"github.com/dustin/go-humanize"
|
||||
"github.com/olekukonko/tablewriter"
|
||||
"github.com/pdevine/readline"
|
||||
"github.com/spf13/cobra"
|
||||
"golang.org/x/crypto/ssh"
|
||||
"golang.org/x/term"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
"github.com/jmorganca/ollama/format"
|
||||
@@ -33,6 +32,26 @@ import (
|
||||
"github.com/jmorganca/ollama/version"
|
||||
)
|
||||
|
||||
type Painter struct {
|
||||
IsMultiLine bool
|
||||
}
|
||||
|
||||
func (p Painter) Paint(line []rune, _ int) []rune {
|
||||
termType := os.Getenv("TERM")
|
||||
if termType == "xterm-256color" && len(line) == 0 {
|
||||
var prompt string
|
||||
if p.IsMultiLine {
|
||||
prompt = "Use \"\"\" to end multi-line input"
|
||||
} else {
|
||||
prompt = "Send a message (/? for help)"
|
||||
}
|
||||
return []rune(fmt.Sprintf("\033[38;5;245m%s\033[%dD\033[0m", prompt, len(prompt)))
|
||||
}
|
||||
// add a space and a backspace to prevent the cursor from walking up the screen
|
||||
line = append(line, []rune(" \b")...)
|
||||
return line
|
||||
}
|
||||
|
||||
func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
filename, _ := cmd.Flags().GetString("file")
|
||||
filename, err := filepath.Abs(filename)
|
||||
@@ -98,39 +117,28 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
}
|
||||
|
||||
func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
insecure, err := cmd.Flags().GetBool("insecure")
|
||||
client, err := api.FromEnv()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
mp := server.ParseModelPath(args[0])
|
||||
models, err := client.List(context.Background())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if mp.ProtocolScheme == "http" && !insecure {
|
||||
return fmt.Errorf("insecure protocol http")
|
||||
modelName, modelTag, ok := strings.Cut(args[0], ":")
|
||||
if !ok {
|
||||
modelTag = "latest"
|
||||
}
|
||||
|
||||
fp, err := mp.GetManifestPath(false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = os.Stat(fp)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
if err := pull(args[0], insecure); err != nil {
|
||||
var apiStatusError api.StatusError
|
||||
if !errors.As(err, &apiStatusError) {
|
||||
return err
|
||||
}
|
||||
|
||||
if apiStatusError.StatusCode != http.StatusBadGateway {
|
||||
return err
|
||||
}
|
||||
for _, model := range models.Models {
|
||||
if model.Name == strings.Join([]string{modelName, modelTag}, ":") {
|
||||
return RunGenerate(cmd, args)
|
||||
}
|
||||
case err != nil:
|
||||
}
|
||||
|
||||
if err := PullHandler(cmd, args); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -387,71 +395,117 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
|
||||
type generateContextKey string
|
||||
|
||||
func generate(cmd *cobra.Command, model, prompt string) error {
|
||||
if len(strings.TrimSpace(prompt)) > 0 {
|
||||
client, err := api.FromEnv()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
spinner := NewSpinner("")
|
||||
go spinner.Spin(60 * time.Millisecond)
|
||||
|
||||
var latest api.GenerateResponse
|
||||
|
||||
generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
|
||||
if !ok {
|
||||
generateContext = []int{}
|
||||
}
|
||||
|
||||
request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
|
||||
fn := func(response api.GenerateResponse) error {
|
||||
if !spinner.IsFinished() {
|
||||
spinner.Finish()
|
||||
}
|
||||
|
||||
latest = response
|
||||
|
||||
fmt.Print(response.Response)
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := client.Generate(context.Background(), &request, fn); err != nil {
|
||||
if strings.Contains(err.Error(), "failed to load model") {
|
||||
// tell the user to check the server log, if it exists locally
|
||||
home, nestedErr := os.UserHomeDir()
|
||||
if nestedErr != nil {
|
||||
// return the original error
|
||||
return err
|
||||
}
|
||||
logPath := filepath.Join(home, ".ollama", "logs", "server.log")
|
||||
if _, nestedErr := os.Stat(logPath); nestedErr == nil {
|
||||
err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
|
||||
if !latest.Done {
|
||||
return errors.New("unexpected end of response")
|
||||
}
|
||||
|
||||
verbose, err := cmd.Flags().GetBool("verbose")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if verbose {
|
||||
latest.Summary()
|
||||
}
|
||||
|
||||
ctx := cmd.Context()
|
||||
ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
|
||||
cmd.SetContext(ctx)
|
||||
client, err := api.FromEnv()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
spinner := NewSpinner("")
|
||||
go spinner.Spin(60 * time.Millisecond)
|
||||
|
||||
var latest api.GenerateResponse
|
||||
|
||||
generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
|
||||
if !ok {
|
||||
generateContext = []int{}
|
||||
}
|
||||
|
||||
var wrapTerm bool
|
||||
termType := os.Getenv("TERM")
|
||||
if termType == "xterm-256color" {
|
||||
wrapTerm = true
|
||||
}
|
||||
|
||||
termWidth, _, err := term.GetSize(int(0))
|
||||
if err != nil {
|
||||
wrapTerm = false
|
||||
}
|
||||
|
||||
// override wrapping if the user turned it off
|
||||
nowrap, err := cmd.Flags().GetBool("nowordwrap")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if nowrap {
|
||||
wrapTerm = false
|
||||
}
|
||||
|
||||
var currentLineLength int
|
||||
var wordBuffer string
|
||||
|
||||
request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
|
||||
fn := func(response api.GenerateResponse) error {
|
||||
if !spinner.IsFinished() {
|
||||
spinner.Finish()
|
||||
}
|
||||
|
||||
latest = response
|
||||
|
||||
if wrapTerm {
|
||||
for _, ch := range response.Response {
|
||||
if currentLineLength+1 > termWidth-5 {
|
||||
// backtrack the length of the last word and clear to the end of the line
|
||||
fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
|
||||
fmt.Printf("%s%c", wordBuffer, ch)
|
||||
currentLineLength = len(wordBuffer) + 1
|
||||
} else {
|
||||
fmt.Print(string(ch))
|
||||
currentLineLength += 1
|
||||
|
||||
switch ch {
|
||||
case ' ':
|
||||
wordBuffer = ""
|
||||
case '\n':
|
||||
currentLineLength = 0
|
||||
default:
|
||||
wordBuffer += string(ch)
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fmt.Print(response.Response)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := client.Generate(context.Background(), &request, fn); err != nil {
|
||||
if strings.Contains(err.Error(), "failed to load model") {
|
||||
// tell the user to check the server log, if it exists locally
|
||||
home, nestedErr := os.UserHomeDir()
|
||||
if nestedErr != nil {
|
||||
// return the original error
|
||||
return err
|
||||
}
|
||||
logPath := filepath.Join(home, ".ollama", "logs", "server.log")
|
||||
if _, nestedErr := os.Stat(logPath); nestedErr == nil {
|
||||
err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
if prompt != "" {
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
if !latest.Done {
|
||||
return errors.New("unexpected end of response")
|
||||
}
|
||||
|
||||
verbose, err := cmd.Flags().GetBool("verbose")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if verbose {
|
||||
latest.Summary()
|
||||
}
|
||||
|
||||
ctx := cmd.Context()
|
||||
ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
|
||||
cmd.SetContext(ctx)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -461,19 +515,21 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// load the model
|
||||
if err := generate(cmd, model, ""); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
completer := readline.NewPrefixCompleter(
|
||||
readline.PcItem("/help"),
|
||||
readline.PcItem("/list"),
|
||||
readline.PcItem("/set",
|
||||
readline.PcItem("history"),
|
||||
readline.PcItem("nohistory"),
|
||||
readline.PcItem("wordwrap"),
|
||||
readline.PcItem("nowordwrap"),
|
||||
readline.PcItem("verbose"),
|
||||
readline.PcItem("quiet"),
|
||||
readline.PcItem("mode",
|
||||
readline.PcItem("vim"),
|
||||
readline.PcItem("emacs"),
|
||||
readline.PcItem("default"),
|
||||
),
|
||||
),
|
||||
readline.PcItem("/show",
|
||||
readline.PcItem("license"),
|
||||
@@ -491,7 +547,10 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
fmt.Fprintln(os.Stderr, completer.Tree(" "))
|
||||
}
|
||||
|
||||
var painter Painter
|
||||
|
||||
config := readline.Config{
|
||||
Painter: &painter,
|
||||
Prompt: ">>> ",
|
||||
HistoryFile: filepath.Join(home, ".ollama", "history"),
|
||||
AutoComplete: completer,
|
||||
@@ -527,6 +586,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
case isMultiLine:
|
||||
if strings.HasSuffix(line, `"""`) {
|
||||
isMultiLine = false
|
||||
painter.IsMultiLine = isMultiLine
|
||||
multiLineBuffer += strings.TrimSuffix(line, `"""`)
|
||||
line = multiLineBuffer
|
||||
multiLineBuffer = ""
|
||||
@@ -537,6 +597,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
}
|
||||
case strings.HasPrefix(line, `"""`):
|
||||
isMultiLine = true
|
||||
painter.IsMultiLine = isMultiLine
|
||||
multiLineBuffer = strings.TrimPrefix(line, `"""`) + " "
|
||||
scanner.SetPrompt("... ")
|
||||
continue
|
||||
@@ -545,45 +606,42 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
if err := ListHandler(cmd, args[1:]); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
continue
|
||||
case strings.HasPrefix(line, "/set"):
|
||||
args := strings.Fields(line)
|
||||
if len(args) > 1 {
|
||||
switch args[1] {
|
||||
case "history":
|
||||
scanner.HistoryEnable()
|
||||
continue
|
||||
case "nohistory":
|
||||
scanner.HistoryDisable()
|
||||
continue
|
||||
case "wordwrap":
|
||||
cmd.Flags().Set("nowordwrap", "false")
|
||||
fmt.Println("Set 'wordwrap' mode.")
|
||||
case "nowordwrap":
|
||||
cmd.Flags().Set("nowordwrap", "true")
|
||||
fmt.Println("Set 'nowordwrap' mode.")
|
||||
case "verbose":
|
||||
cmd.Flags().Set("verbose", "true")
|
||||
continue
|
||||
fmt.Println("Set 'verbose' mode.")
|
||||
case "quiet":
|
||||
cmd.Flags().Set("verbose", "false")
|
||||
continue
|
||||
fmt.Println("Set 'quiet' mode.")
|
||||
case "mode":
|
||||
if len(args) > 2 {
|
||||
switch args[2] {
|
||||
case "vim":
|
||||
scanner.SetVimMode(true)
|
||||
continue
|
||||
case "emacs", "default":
|
||||
scanner.SetVimMode(false)
|
||||
continue
|
||||
default:
|
||||
usage()
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
usage()
|
||||
continue
|
||||
}
|
||||
}
|
||||
} else {
|
||||
usage()
|
||||
continue
|
||||
}
|
||||
case strings.HasPrefix(line, "/show"):
|
||||
args := strings.Fields(line)
|
||||
@@ -591,7 +649,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
resp, err := server.GetModelInfo(model)
|
||||
if err != nil {
|
||||
fmt.Println("error: couldn't get model")
|
||||
continue
|
||||
}
|
||||
|
||||
switch args[1] {
|
||||
@@ -608,21 +665,22 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
default:
|
||||
fmt.Println("error: unknown command")
|
||||
}
|
||||
|
||||
continue
|
||||
} else {
|
||||
usage()
|
||||
continue
|
||||
}
|
||||
case line == "/help", line == "/?":
|
||||
usage()
|
||||
continue
|
||||
case line == "/exit", line == "/bye":
|
||||
return nil
|
||||
case strings.HasPrefix(line, "/"):
|
||||
args := strings.Fields(line)
|
||||
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
|
||||
}
|
||||
|
||||
if err := generate(cmd, model, line); err != nil {
|
||||
return err
|
||||
if len(line) > 0 && line[0] != '/' {
|
||||
if err := generate(cmd, model, line); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -641,28 +699,19 @@ func generateBatch(cmd *cobra.Command, model string) error {
|
||||
}
|
||||
|
||||
func RunServer(cmd *cobra.Command, _ []string) error {
|
||||
host, port := "127.0.0.1", "11434"
|
||||
|
||||
parts := strings.Split(os.Getenv("OLLAMA_HOST"), ":")
|
||||
if ip := net.ParseIP(parts[0]); ip != nil {
|
||||
host = ip.String()
|
||||
}
|
||||
|
||||
if len(parts) > 1 {
|
||||
port = parts[1]
|
||||
}
|
||||
|
||||
// deprecated: include port in OLLAMA_HOST
|
||||
if p := os.Getenv("OLLAMA_PORT"); p != "" {
|
||||
port = p
|
||||
}
|
||||
|
||||
err := initializeKeypair()
|
||||
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
|
||||
if err != nil {
|
||||
host, port = "127.0.0.1", "11434"
|
||||
if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
|
||||
host = ip.String()
|
||||
}
|
||||
}
|
||||
|
||||
if err := initializeKeypair(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ln, err := net.Listen("tcp", fmt.Sprintf("%s:%s", host, port))
|
||||
ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -672,6 +721,12 @@ func RunServer(cmd *cobra.Command, _ []string) error {
|
||||
origins = strings.Split(o, ",")
|
||||
}
|
||||
|
||||
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||
if err := server.PruneLayers(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return server.Serve(ln, origins)
|
||||
}
|
||||
|
||||
@@ -697,7 +752,7 @@ func initializeKeypair() error {
|
||||
return err
|
||||
}
|
||||
|
||||
err = os.MkdirAll(path.Dir(privKeyPath), 0o700)
|
||||
err = os.MkdirAll(filepath.Dir(privKeyPath), 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not create directory %w", err)
|
||||
}
|
||||
@@ -825,6 +880,7 @@ func NewCLI() *cobra.Command {
|
||||
|
||||
runCmd.Flags().Bool("verbose", false, "Show timings for response")
|
||||
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
|
||||
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
|
||||
|
||||
serveCmd := &cobra.Command{
|
||||
Use: "serve",
|
||||
|
138
docs/api.md
138
docs/api.md
@@ -3,18 +3,21 @@
|
||||
## Endpoints
|
||||
|
||||
- [Generate a completion](#generate-a-completion)
|
||||
- [Create a model](#create-a-model)
|
||||
- [List local models](#list-local-models)
|
||||
- [Copy a model](#copy-a-model)
|
||||
- [Delete a model](#delete-a-model)
|
||||
- [Pull a model](#pull-a-model)
|
||||
- [Generate embeddings](#generate-embeddings)
|
||||
- [Create a Model](#create-a-model)
|
||||
- [List Local Models](#list-local-models)
|
||||
- [Show Model Information](#show-model-information)
|
||||
- [Copy a Model](#copy-a-model)
|
||||
- [Delete a Model](#delete-a-model)
|
||||
- [Pull a Model](#pull-a-model)
|
||||
- [Push a Model](#push-a-model)
|
||||
- [Generate Embeddings](#generate-embeddings)
|
||||
|
||||
|
||||
## Conventions
|
||||
|
||||
### Model names
|
||||
|
||||
Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and if not provided will default to `latest`. The tag is used to identify a specific version.
|
||||
Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
|
||||
|
||||
### Durations
|
||||
|
||||
@@ -22,7 +25,7 @@ All durations are returned in nanoseconds.
|
||||
|
||||
## Generate a completion
|
||||
|
||||
```
|
||||
```shell
|
||||
POST /api/generate
|
||||
```
|
||||
|
||||
@@ -42,7 +45,7 @@ Advanced parameters:
|
||||
|
||||
### Request
|
||||
|
||||
```
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2:7b",
|
||||
"prompt": "Why is the sky blue?"
|
||||
@@ -95,7 +98,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
|
||||
|
||||
## Create a Model
|
||||
|
||||
```
|
||||
```shell
|
||||
POST /api/create
|
||||
```
|
||||
|
||||
@@ -108,7 +111,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
|
||||
|
||||
### Request
|
||||
|
||||
```
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/create -d '{
|
||||
"name": "mario",
|
||||
"path": "~/Modelfile"
|
||||
@@ -117,7 +120,7 @@ curl -X POST http://localhost:11434/api/create -d '{
|
||||
|
||||
### Response
|
||||
|
||||
A stream of JSON objects. When finished, `status` is `success`
|
||||
A stream of JSON objects. When finished, `status` is `success`.
|
||||
|
||||
```json
|
||||
{
|
||||
@@ -127,7 +130,7 @@ A stream of JSON objects. When finished, `status` is `success`
|
||||
|
||||
## List Local Models
|
||||
|
||||
```
|
||||
```shell
|
||||
GET /api/tags
|
||||
```
|
||||
|
||||
@@ -135,7 +138,7 @@ List models that are available locally.
|
||||
|
||||
### Request
|
||||
|
||||
```
|
||||
```shell
|
||||
curl http://localhost:11434/api/tags
|
||||
```
|
||||
|
||||
@@ -158,9 +161,40 @@ curl http://localhost:11434/api/tags
|
||||
}
|
||||
```
|
||||
|
||||
## Show Model Information
|
||||
|
||||
```shell
|
||||
POST /api/show
|
||||
```
|
||||
|
||||
Show details about a model including modelfile, template, parameters, license, and system prompt.
|
||||
|
||||
### Parameters
|
||||
|
||||
- `name`: name of the model to show
|
||||
|
||||
### Request
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/show -d '{
|
||||
"name": "llama2:7b"
|
||||
}'
|
||||
```
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"license": "<contents of license block>",
|
||||
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
|
||||
"parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
|
||||
"template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
|
||||
}
|
||||
```
|
||||
|
||||
## Copy a Model
|
||||
|
||||
```
|
||||
```shell
|
||||
POST /api/copy
|
||||
```
|
||||
|
||||
@@ -168,7 +202,7 @@ Copy a model. Creates a model with another name from an existing model.
|
||||
|
||||
### Request
|
||||
|
||||
```
|
||||
```shell
|
||||
curl http://localhost:11434/api/copy -d '{
|
||||
"source": "llama2:7b",
|
||||
"destination": "llama2-backup"
|
||||
@@ -177,7 +211,7 @@ curl http://localhost:11434/api/copy -d '{
|
||||
|
||||
## Delete a Model
|
||||
|
||||
```
|
||||
```shell
|
||||
DELETE /api/delete
|
||||
```
|
||||
|
||||
@@ -189,7 +223,7 @@ Delete a model and its data.
|
||||
|
||||
### Request
|
||||
|
||||
```
|
||||
```shell
|
||||
curl -X DELETE http://localhost:11434/api/delete -d '{
|
||||
"name": "llama2:13b"
|
||||
}'
|
||||
@@ -197,19 +231,20 @@ curl -X DELETE http://localhost:11434/api/delete -d '{
|
||||
|
||||
## Pull a Model
|
||||
|
||||
```
|
||||
```shell
|
||||
POST /api/pull
|
||||
```
|
||||
|
||||
Download a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple calls to will share the same download progress.
|
||||
Download a model from the ollama library. Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress.
|
||||
|
||||
### Parameters
|
||||
|
||||
- `name`: name of the model to pull
|
||||
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
|
||||
|
||||
### Request
|
||||
|
||||
```
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/pull -d '{
|
||||
"name": "llama2:7b"
|
||||
}'
|
||||
@@ -225,9 +260,63 @@ curl -X POST http://localhost:11434/api/pull -d '{
|
||||
}
|
||||
```
|
||||
|
||||
## Push a Model
|
||||
|
||||
```shell
|
||||
POST /api/push
|
||||
```
|
||||
|
||||
Upload a model to a model library. Requires registering for ollama.ai and adding a public key first.
|
||||
|
||||
### Parameters
|
||||
|
||||
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
|
||||
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
|
||||
|
||||
### Request
|
||||
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/push -d '{
|
||||
"name": "mattw/pygmalion:latest"
|
||||
}'
|
||||
```
|
||||
|
||||
### Response
|
||||
|
||||
Streaming response that starts with:
|
||||
|
||||
```json
|
||||
{"status":"retrieving manifest"}
|
||||
```
|
||||
|
||||
and then:
|
||||
|
||||
```json
|
||||
{
|
||||
"status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
||||
"total":1928429856
|
||||
}
|
||||
```
|
||||
|
||||
Then there is a series of uploading responses:
|
||||
|
||||
```json
|
||||
{
|
||||
"status":"starting upload",
|
||||
"digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
||||
"total":1928429856}
|
||||
```
|
||||
|
||||
Finally, when the upload is complete:
|
||||
|
||||
```json
|
||||
{"status":"pushing manifest"}
|
||||
{"status":"success"}
|
||||
```
|
||||
|
||||
## Generate Embeddings
|
||||
|
||||
```
|
||||
```shell
|
||||
POST /api/embeddings
|
||||
```
|
||||
|
||||
@@ -244,7 +333,7 @@ Advanced parameters:
|
||||
|
||||
### Request
|
||||
|
||||
```
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/embeddings -d '{
|
||||
"model": "llama2:7b",
|
||||
"prompt": "Here is an article about llamas..."
|
||||
@@ -259,5 +348,4 @@ curl -X POST http://localhost:11434/api/embeddings -d '{
|
||||
0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
|
||||
0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
|
||||
]
|
||||
}
|
||||
```
|
||||
}```
|
||||
|
@@ -6,6 +6,10 @@
|
||||
|
||||
Install required tools:
|
||||
|
||||
- cmake version 3.24 or higher
|
||||
- go version 1.20 or higher
|
||||
- gcc version 11.4.0 or higher
|
||||
|
||||
```
|
||||
brew install go cmake gcc
|
||||
```
|
||||
@@ -27,3 +31,9 @@ Now you can run `ollama`:
|
||||
```
|
||||
./ollama
|
||||
```
|
||||
|
||||
## Building on Linux with GPU support
|
||||
|
||||
- Install cmake and nvidia-cuda-toolkit
|
||||
- run `go generate ./...`
|
||||
- run `go build .`
|
||||
|
83
docs/linux.md
Normal file
83
docs/linux.md
Normal file
@@ -0,0 +1,83 @@
|
||||
# Installing Ollama on Linux
|
||||
|
||||
> Note: A one line installer for Ollama is available by running:
|
||||
>
|
||||
> ```
|
||||
> curl https://ollama.ai/install.sh | sh
|
||||
> ```
|
||||
|
||||
## Download the `ollama` binary
|
||||
|
||||
Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
|
||||
|
||||
```
|
||||
sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
|
||||
sudo chmod +x /usr/bin/ollama
|
||||
```
|
||||
|
||||
## Start Ollama
|
||||
|
||||
Start Ollama by running `ollama serve`:
|
||||
|
||||
```
|
||||
ollama serve
|
||||
```
|
||||
|
||||
Once Ollama is running, run a model in another terminal session:
|
||||
|
||||
```
|
||||
ollama run llama2
|
||||
```
|
||||
|
||||
## Install CUDA drivers (optional – for Nvidia GPUs)
|
||||
|
||||
[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
|
||||
|
||||
Verify that the drivers are installed by running the following command, which should print details about your GPU:
|
||||
|
||||
```
|
||||
nvidia-smi
|
||||
```
|
||||
|
||||
## Adding Ollama as a startup service (optional)
|
||||
|
||||
Create a user for Ollama:
|
||||
|
||||
```
|
||||
sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
|
||||
```
|
||||
|
||||
Create a service file in `/etc/systemd/system/ollama.service`:
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Ollama Service
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
ExecStart=/usr/bin/ollama serve
|
||||
User=ollama
|
||||
Group=ollama
|
||||
Restart=always
|
||||
RestartSec=3
|
||||
Environment="HOME=/usr/share/ollama"
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
```
|
||||
|
||||
Then start the service:
|
||||
|
||||
```
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable ollama
|
||||
```
|
||||
|
||||
### Viewing logs
|
||||
|
||||
To view logs of Ollama running as a startup service, run:
|
||||
|
||||
```
|
||||
journalctl -u ollama
|
||||
```
|
||||
|
2
go.mod
2
go.mod
@@ -8,6 +8,7 @@ require (
|
||||
github.com/mattn/go-runewidth v0.0.14
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
|
||||
github.com/olekukonko/tablewriter v0.0.5
|
||||
github.com/pdevine/readline v1.5.2
|
||||
github.com/spf13/cobra v1.7.0
|
||||
)
|
||||
|
||||
@@ -16,7 +17,6 @@ require github.com/rivo/uniseg v0.2.0 // indirect
|
||||
require (
|
||||
github.com/bytedance/sonic v1.9.1 // indirect
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
|
||||
github.com/chzyer/readline v1.5.1
|
||||
github.com/gabriel-vasile/mimetype v1.4.2 // indirect
|
||||
github.com/gin-contrib/cors v1.4.0
|
||||
github.com/gin-contrib/sse v0.1.0 // indirect
|
||||
|
5
go.sum
5
go.sum
@@ -6,8 +6,6 @@ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhD
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
|
||||
github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM=
|
||||
github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ=
|
||||
github.com/chzyer/readline v1.5.1 h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=
|
||||
github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk=
|
||||
github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
|
||||
github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
@@ -80,6 +78,8 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N
|
||||
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
|
||||
github.com/pdevine/readline v1.5.2 h1:oz6Y5GdTmhPG+08hhxcAvtHitSANWuA2100Sppb38xI=
|
||||
github.com/pdevine/readline v1.5.2/go.mod h1:na/LbuE5PYwxI7GyopWdIs3U8HVe89lYlNTFTXH3wOw=
|
||||
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
|
||||
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
|
||||
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
|
||||
@@ -120,7 +120,6 @@ golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
|
||||
golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
|
||||
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
|
||||
golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
|
22
llm/falcon.go
Normal file
22
llm/falcon.go
Normal file
@@ -0,0 +1,22 @@
|
||||
package llm
|
||||
|
||||
const ModelFamilyFalcon = "falcon"
|
||||
|
||||
const (
|
||||
falconModelType7B = 32
|
||||
falconModelType40B = 60
|
||||
falconModelType180B = 80
|
||||
)
|
||||
|
||||
func falconModelType(numLayer uint32) string {
|
||||
switch numLayer {
|
||||
case 32:
|
||||
return "7B"
|
||||
case 60:
|
||||
return "40B"
|
||||
case 80:
|
||||
return "180B"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
164
llm/ggml.go
164
llm/ggml.go
@@ -3,72 +3,96 @@ package llm
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
)
|
||||
|
||||
type ModelFamily string
|
||||
|
||||
type ModelType uint32
|
||||
|
||||
const (
|
||||
ModelType3B ModelType = 26
|
||||
ModelType7B ModelType = 32
|
||||
ModelType13B ModelType = 40
|
||||
ModelType34B ModelType = 48
|
||||
ModelType30B ModelType = 60
|
||||
ModelType65B ModelType = 80
|
||||
)
|
||||
|
||||
func (mt ModelType) String() string {
|
||||
switch mt {
|
||||
case ModelType3B:
|
||||
return "3B"
|
||||
case ModelType7B:
|
||||
return "7B"
|
||||
case ModelType13B:
|
||||
return "13B"
|
||||
case ModelType34B:
|
||||
return "34B"
|
||||
case ModelType30B:
|
||||
return "30B"
|
||||
case ModelType65B:
|
||||
return "65B"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
type FileType interface {
|
||||
String() string
|
||||
}
|
||||
|
||||
type GGML struct {
|
||||
magic uint32
|
||||
container
|
||||
model
|
||||
}
|
||||
|
||||
const (
|
||||
fileTypeF32 uint32 = iota
|
||||
fileTypeF16
|
||||
fileTypeQ4_0
|
||||
fileTypeQ4_1
|
||||
fileTypeQ4_1_F16
|
||||
fileTypeQ8_0 uint32 = iota + 2
|
||||
fileTypeQ5_0
|
||||
fileTypeQ5_1
|
||||
fileTypeQ2_K
|
||||
fileTypeQ3_K_S
|
||||
fileTypeQ3_K_M
|
||||
fileTypeQ3_K_L
|
||||
fileTypeQ4_K_S
|
||||
fileTypeQ4_K_M
|
||||
fileTypeQ5_K_S
|
||||
fileTypeQ5_K_M
|
||||
fileTypeQ6_K
|
||||
)
|
||||
|
||||
func fileType(fileType uint32) string {
|
||||
switch fileType {
|
||||
case fileTypeF32:
|
||||
return "F32"
|
||||
case fileTypeF16:
|
||||
return "F16"
|
||||
case fileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case fileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case fileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case fileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case fileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case fileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case fileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case fileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case fileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case fileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case fileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case fileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case fileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case fileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case fileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
type model interface {
|
||||
ModelFamily() ModelFamily
|
||||
ModelType() ModelType
|
||||
FileType() FileType
|
||||
ModelFamily() string
|
||||
ModelType() string
|
||||
FileType() string
|
||||
NumLayers() int64
|
||||
}
|
||||
|
||||
type container interface {
|
||||
Name() string
|
||||
Decode(io.Reader) error
|
||||
Decode(io.Reader) (model, error)
|
||||
}
|
||||
|
||||
type containerGGML struct {
|
||||
}
|
||||
type containerGGML struct{}
|
||||
|
||||
func (c *containerGGML) Name() string {
|
||||
return "ggml"
|
||||
}
|
||||
|
||||
func (c *containerGGML) Decode(r io.Reader) error {
|
||||
return nil
|
||||
func (c *containerGGML) Decode(r io.Reader) (model, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type containerGGMF struct {
|
||||
@@ -79,18 +103,18 @@ func (c *containerGGMF) Name() string {
|
||||
return "ggmf"
|
||||
}
|
||||
|
||||
func (c *containerGGMF) Decode(r io.Reader) error {
|
||||
func (c *containerGGMF) Decode(r io.Reader) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(r, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1:
|
||||
default:
|
||||
return errors.New("invalid version")
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
c.version = version
|
||||
return nil
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type containerGGJT struct {
|
||||
@@ -101,18 +125,22 @@ func (c *containerGGJT) Name() string {
|
||||
return "ggjt"
|
||||
}
|
||||
|
||||
func (c *containerGGJT) Decode(r io.Reader) error {
|
||||
func (c *containerGGJT) Decode(r io.Reader) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(r, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1, 2, 3:
|
||||
default:
|
||||
return errors.New("invalid version")
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
c.version = version
|
||||
return nil
|
||||
|
||||
// different model types may have different layouts for hyperparameters
|
||||
var llama llamaModel
|
||||
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
|
||||
return &llama, nil
|
||||
}
|
||||
|
||||
type containerLORA struct {
|
||||
@@ -123,32 +151,34 @@ func (c *containerLORA) Name() string {
|
||||
return "ggla"
|
||||
}
|
||||
|
||||
func (c *containerLORA) Decode(r io.Reader) error {
|
||||
func (c *containerLORA) Decode(r io.Reader) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(r, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1:
|
||||
default:
|
||||
return errors.New("invalid version")
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
c.version = version
|
||||
return nil
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
const (
|
||||
// / Magic constant for `ggml` files (unversioned).
|
||||
// Magic constant for `ggml` files (unversioned).
|
||||
FILE_MAGIC_GGML = 0x67676d6c
|
||||
// / Magic constant for `ggml` files (versioned, ggmf).
|
||||
// Magic constant for `ggml` files (versioned, ggmf).
|
||||
FILE_MAGIC_GGMF = 0x67676d66
|
||||
// / Magic constant for `ggml` files (versioned, ggjt).
|
||||
// Magic constant for `ggml` files (versioned, ggjt).
|
||||
FILE_MAGIC_GGJT = 0x67676a74
|
||||
// / Magic constant for `ggla` files (LoRA adapter).
|
||||
// Magic constant for `ggla` files (LoRA adapter).
|
||||
FILE_MAGIC_GGLA = 0x67676C61
|
||||
// Magic constant for `gguf` files (versioned, gguf)
|
||||
FILE_MAGIC_GGUF = 0x46554747
|
||||
)
|
||||
|
||||
func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
|
||||
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
|
||||
var ggml GGML
|
||||
binary.Read(r, binary.LittleEndian, &ggml.magic)
|
||||
|
||||
@@ -161,24 +191,18 @@ func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
|
||||
ggml.container = &containerGGJT{}
|
||||
case FILE_MAGIC_GGLA:
|
||||
ggml.container = &containerLORA{}
|
||||
case FILE_MAGIC_GGUF:
|
||||
ggml.container = &containerGGUF{}
|
||||
default:
|
||||
return nil, errors.New("invalid file magic")
|
||||
}
|
||||
|
||||
if err := ggml.Decode(r); err != nil {
|
||||
model, err := ggml.Decode(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// different model types may have different layouts for hyperparameters
|
||||
switch hint {
|
||||
case ModelFamilyLlama:
|
||||
var llama llamaModel
|
||||
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
|
||||
ggml.model = &llama
|
||||
// TODO: sanity check hyperparameters
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported model type: %s", hint)
|
||||
}
|
||||
ggml.model = model
|
||||
|
||||
// final model type
|
||||
return &ggml, nil
|
||||
|
379
llm/gguf.go
Normal file
379
llm/gguf.go
Normal file
@@ -0,0 +1,379 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
)
|
||||
|
||||
type containerGGUF struct {
|
||||
Version uint32
|
||||
|
||||
V1 struct {
|
||||
NumTensor uint32
|
||||
NumKV uint32
|
||||
}
|
||||
|
||||
V2 struct {
|
||||
NumTensor uint64
|
||||
NumKV uint64
|
||||
}
|
||||
}
|
||||
|
||||
func (c *containerGGUF) Name() string {
|
||||
return "gguf"
|
||||
}
|
||||
|
||||
func (c *containerGGUF) Decode(r io.Reader) (model, error) {
|
||||
binary.Read(r, binary.LittleEndian, &c.Version)
|
||||
|
||||
switch c.Version {
|
||||
case 1:
|
||||
binary.Read(r, binary.LittleEndian, &c.V1)
|
||||
case 2:
|
||||
binary.Read(r, binary.LittleEndian, &c.V2)
|
||||
default:
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
model := newGGUFModel(c)
|
||||
if err := model.Decode(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return model, nil
|
||||
}
|
||||
|
||||
const (
|
||||
ggufTypeUint8 uint32 = iota
|
||||
ggufTypeInt8
|
||||
ggufTypeUint16
|
||||
ggufTypeInt16
|
||||
ggufTypeUint32
|
||||
ggufTypeInt32
|
||||
ggufTypeFloat32
|
||||
ggufTypeBool
|
||||
ggufTypeString
|
||||
ggufTypeArray
|
||||
ggufTypeUint64
|
||||
ggufTypeInt64
|
||||
ggufTypeFloat64
|
||||
)
|
||||
|
||||
type kv map[string]any
|
||||
|
||||
type ggufModel struct {
|
||||
*containerGGUF
|
||||
kv
|
||||
}
|
||||
|
||||
func newGGUFModel(container *containerGGUF) *ggufModel {
|
||||
return &ggufModel{
|
||||
containerGGUF: container,
|
||||
kv: make(kv),
|
||||
}
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumKV() uint64 {
|
||||
if llm.Version == 1 {
|
||||
return uint64(llm.V1.NumKV)
|
||||
}
|
||||
|
||||
return llm.V2.NumKV
|
||||
}
|
||||
|
||||
func (llm *ggufModel) ModelFamily() string {
|
||||
t, ok := llm.kv["general.architecture"].(string)
|
||||
if ok {
|
||||
return t
|
||||
}
|
||||
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func (llm *ggufModel) ModelType() string {
|
||||
switch llm.ModelFamily() {
|
||||
case "llama":
|
||||
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
|
||||
heads, headsOK := llm.kv["llama.head_count"].(uint32)
|
||||
headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
|
||||
if headsOK && headsKVsOK && heads/headKVs == 8 {
|
||||
return "70B"
|
||||
}
|
||||
|
||||
return llamaModelType(blocks)
|
||||
}
|
||||
case "falcon":
|
||||
if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
|
||||
return falconModelType(blocks)
|
||||
}
|
||||
}
|
||||
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
func (llm *ggufModel) FileType() string {
|
||||
t, ok := llm.kv["general.file_type"].(uint32)
|
||||
if ok {
|
||||
return fileType(t)
|
||||
}
|
||||
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
func (llm *ggufModel) Decode(r io.Reader) error {
|
||||
read := llm.readString
|
||||
if llm.Version == 1 {
|
||||
read = llm.readStringV1
|
||||
}
|
||||
|
||||
for i := 0; uint64(i) < llm.NumKV(); i++ {
|
||||
k, err := read(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
vtype := llm.readU32(r)
|
||||
|
||||
var v any
|
||||
switch vtype {
|
||||
case ggufTypeUint8:
|
||||
v = llm.readU8(r)
|
||||
case ggufTypeInt8:
|
||||
v = llm.readI8(r)
|
||||
case ggufTypeUint16:
|
||||
v = llm.readU16(r)
|
||||
case ggufTypeInt16:
|
||||
v = llm.readI16(r)
|
||||
case ggufTypeUint32:
|
||||
v = llm.readU32(r)
|
||||
case ggufTypeInt32:
|
||||
v = llm.readI32(r)
|
||||
case ggufTypeUint64:
|
||||
v = llm.readU64(r)
|
||||
case ggufTypeInt64:
|
||||
v = llm.readI64(r)
|
||||
case ggufTypeFloat32:
|
||||
v = llm.readF32(r)
|
||||
case ggufTypeFloat64:
|
||||
v = llm.readF64(r)
|
||||
case ggufTypeBool:
|
||||
v = llm.readBool(r)
|
||||
case ggufTypeString:
|
||||
fn := llm.readString
|
||||
if llm.Version == 1 {
|
||||
fn = llm.readStringV1
|
||||
}
|
||||
|
||||
s, err := fn(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
v = s
|
||||
case ggufTypeArray:
|
||||
fn := llm.readArray
|
||||
if llm.Version == 1 {
|
||||
fn = llm.readArrayV1
|
||||
}
|
||||
|
||||
a, err := fn(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
v = a
|
||||
default:
|
||||
return fmt.Errorf("invalid type: %d", vtype)
|
||||
}
|
||||
|
||||
llm.kv[k] = v
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumLayers() int64 {
|
||||
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
|
||||
if !exists {
|
||||
return 0
|
||||
}
|
||||
|
||||
v := value.(uint32)
|
||||
return int64(v)
|
||||
}
|
||||
|
||||
func (ggufModel) readU8(r io.Reader) uint8 {
|
||||
var u8 uint8
|
||||
binary.Read(r, binary.LittleEndian, &u8)
|
||||
return u8
|
||||
}
|
||||
|
||||
func (ggufModel) readI8(r io.Reader) int8 {
|
||||
var i8 int8
|
||||
binary.Read(r, binary.LittleEndian, &i8)
|
||||
return i8
|
||||
}
|
||||
|
||||
func (ggufModel) readU16(r io.Reader) uint16 {
|
||||
var u16 uint16
|
||||
binary.Read(r, binary.LittleEndian, &u16)
|
||||
return u16
|
||||
}
|
||||
|
||||
func (ggufModel) readI16(r io.Reader) int16 {
|
||||
var i16 int16
|
||||
binary.Read(r, binary.LittleEndian, &i16)
|
||||
return i16
|
||||
}
|
||||
|
||||
func (ggufModel) readU32(r io.Reader) uint32 {
|
||||
var u32 uint32
|
||||
binary.Read(r, binary.LittleEndian, &u32)
|
||||
return u32
|
||||
}
|
||||
|
||||
func (ggufModel) readI32(r io.Reader) int32 {
|
||||
var i32 int32
|
||||
binary.Read(r, binary.LittleEndian, &i32)
|
||||
return i32
|
||||
}
|
||||
|
||||
func (ggufModel) readU64(r io.Reader) uint64 {
|
||||
var u64 uint64
|
||||
binary.Read(r, binary.LittleEndian, &u64)
|
||||
return u64
|
||||
}
|
||||
|
||||
func (ggufModel) readI64(r io.Reader) int64 {
|
||||
var i64 int64
|
||||
binary.Read(r, binary.LittleEndian, &i64)
|
||||
return i64
|
||||
}
|
||||
|
||||
func (ggufModel) readF32(r io.Reader) float32 {
|
||||
var f32 float32
|
||||
binary.Read(r, binary.LittleEndian, &f32)
|
||||
return f32
|
||||
}
|
||||
|
||||
func (ggufModel) readF64(r io.Reader) float64 {
|
||||
var f64 float64
|
||||
binary.Read(r, binary.LittleEndian, &f64)
|
||||
return f64
|
||||
}
|
||||
|
||||
func (ggufModel) readBool(r io.Reader) bool {
|
||||
var b bool
|
||||
binary.Read(r, binary.LittleEndian, &b)
|
||||
return b
|
||||
}
|
||||
|
||||
func (ggufModel) readStringV1(r io.Reader) (string, error) {
|
||||
var nameLength uint32
|
||||
binary.Read(r, binary.LittleEndian, &nameLength)
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// gguf v1 strings are null-terminated
|
||||
b.Truncate(b.Len() - 1)
|
||||
|
||||
return b.String(), nil
|
||||
}
|
||||
|
||||
func (llm ggufModel) readString(r io.Reader) (string, error) {
|
||||
var nameLength uint64
|
||||
binary.Read(r, binary.LittleEndian, &nameLength)
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return b.String(), nil
|
||||
}
|
||||
|
||||
func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
|
||||
atype := llm.readU32(r)
|
||||
n := llm.readU32(r)
|
||||
|
||||
for i := 0; uint32(i) < n; i++ {
|
||||
switch atype {
|
||||
case ggufTypeUint8:
|
||||
arr = append(arr, llm.readU8(r))
|
||||
case ggufTypeInt8:
|
||||
arr = append(arr, llm.readU8(r))
|
||||
case ggufTypeUint16:
|
||||
arr = append(arr, llm.readU16(r))
|
||||
case ggufTypeInt16:
|
||||
arr = append(arr, llm.readI16(r))
|
||||
case ggufTypeUint32:
|
||||
arr = append(arr, llm.readU32(r))
|
||||
case ggufTypeInt32:
|
||||
arr = append(arr, llm.readI32(r))
|
||||
case ggufTypeFloat32:
|
||||
arr = append(arr, llm.readF32(r))
|
||||
case ggufTypeBool:
|
||||
arr = append(arr, llm.readBool(r))
|
||||
case ggufTypeString:
|
||||
s, err := llm.readStringV1(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
arr = append(arr, s)
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid array type: %d", atype)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
|
||||
atype := llm.readU32(r)
|
||||
n := llm.readU64(r)
|
||||
|
||||
for i := 0; uint64(i) < n; i++ {
|
||||
switch atype {
|
||||
case ggufTypeUint8:
|
||||
arr = append(arr, llm.readU8(r))
|
||||
case ggufTypeInt8:
|
||||
arr = append(arr, llm.readU8(r))
|
||||
case ggufTypeUint16:
|
||||
arr = append(arr, llm.readU16(r))
|
||||
case ggufTypeInt16:
|
||||
arr = append(arr, llm.readI16(r))
|
||||
case ggufTypeUint32:
|
||||
arr = append(arr, llm.readU32(r))
|
||||
case ggufTypeInt32:
|
||||
arr = append(arr, llm.readI32(r))
|
||||
case ggufTypeUint64:
|
||||
arr = append(arr, llm.readU64(r))
|
||||
case ggufTypeInt64:
|
||||
arr = append(arr, llm.readI64(r))
|
||||
case ggufTypeFloat32:
|
||||
arr = append(arr, llm.readF32(r))
|
||||
case ggufTypeFloat64:
|
||||
arr = append(arr, llm.readF64(r))
|
||||
case ggufTypeBool:
|
||||
arr = append(arr, llm.readBool(r))
|
||||
case ggufTypeString:
|
||||
s, err := llm.readString(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
arr = append(arr, s)
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid array type: %d", atype)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
@@ -1,13 +0,0 @@
|
||||
//go:build !darwin
|
||||
// +build !darwin
|
||||
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
@@ -1,10 +1,16 @@
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||
|
@@ -1,10 +1,16 @@
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/metal --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build gguf/build/metal --target server --config Release
|
||||
|
22
llm/llama.cpp/generate_linux.go
Normal file
22
llm/llama.cpp/generate_linux.go
Normal file
@@ -0,0 +1,22 @@
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
|
||||
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
||||
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||
|
||||
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cuda --target server --config Release
|
||||
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/cuda --target server --config Release
|
14
llm/llama.cpp/generate_windows.go
Normal file
14
llm/llama.cpp/generate_windows.go
Normal file
@@ -0,0 +1,14 @@
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
@@ -1,32 +0,0 @@
|
||||
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
|
||||
From: Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||
Date: Tue, 5 Sep 2023 16:05:08 -0400
|
||||
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
|
||||
|
||||
---
|
||||
ggml-metal.metal | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
||||
index 3f31252..ce3541f 100644
|
||||
--- a/ggml-metal.metal
|
||||
+++ b/ggml-metal.metal
|
||||
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
//load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
#pragma unroll(16)
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
--
|
||||
2.39.2 (Apple Git-143)
|
||||
|
1
llm/llama.cpp/gguf
Submodule
1
llm/llama.cpp/gguf
Submodule
Submodule llm/llama.cpp/gguf added at bc9d3e3971
27
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
Normal file
27
llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
Normal file
@@ -0,0 +1,27 @@
|
||||
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <mxyng@pm.me>
|
||||
Date: Wed, 20 Sep 2023 14:19:52 -0700
|
||||
Subject: [PATCH] copy cuda runtime libraries
|
||||
|
||||
---
|
||||
CMakeLists.txt | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 824d9f2..dd24137 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
||||
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
|
||||
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
|
||||
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
|
||||
+
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
# 60 == f16 CUDA intrinsics
|
||||
--
|
||||
2.42.0
|
||||
|
25
llm/llama.cpp/patches/0001-remove-warm-up-logging.patch
Normal file
25
llm/llama.cpp/patches/0001-remove-warm-up-logging.patch
Normal file
@@ -0,0 +1,25 @@
|
||||
From 07993bdc35345b67b27aa649a7c099ad42d80c4c Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <mxyng@pm.me>
|
||||
Date: Thu, 21 Sep 2023 14:43:21 -0700
|
||||
Subject: [PATCH] remove warm up logging
|
||||
|
||||
---
|
||||
common/common.cpp | 2 --
|
||||
1 file changed, 2 deletions(-)
|
||||
|
||||
diff --git a/common/common.cpp b/common/common.cpp
|
||||
index 2597ba0..b56549b 100644
|
||||
--- a/common/common.cpp
|
||||
+++ b/common/common.cpp
|
||||
@@ -780,8 +780,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
}
|
||||
|
||||
{
|
||||
- LOG("warming up the model with an empty run\n");
|
||||
-
|
||||
const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
|
||||
llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
|
||||
llama_reset_timings(lctx);
|
||||
--
|
||||
2.42.0
|
||||
|
@@ -0,0 +1,32 @@
|
||||
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
|
||||
From: Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||
Date: Tue, 22 Aug 2023 15:14:23 +0800
|
||||
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
|
||||
|
||||
* ggml: support CUDA's half type for aarch64(#1455)
|
||||
support CUDA's half type for aarch64 in ggml_fp16_t definition
|
||||
|
||||
* ggml: use __CUDACC__ to recognise nvcc compiler
|
||||
---
|
||||
ggml.h | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml.h b/ggml.h
|
||||
index 544ad2d..0ec7ec5 100644
|
||||
--- a/ggml.h
|
||||
+++ b/ggml.h
|
||||
@@ -259,8 +259,9 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
-#ifdef __ARM_NEON
|
||||
- // we use the built-in 16-bit float type
|
||||
+#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||
+ typedef half ggml_fp16_t;
|
||||
+#elif defined(__ARM_NEON)
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
#else
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
--
|
||||
2.39.2 (Apple Git-143)
|
||||
|
@@ -20,127 +20,140 @@ import (
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
const ModelFamilyLlama ModelFamily = "llama"
|
||||
|
||||
//go:embed llama.cpp/ggml/build/*/bin/*
|
||||
//go:embed llama.cpp/*/build/*/bin/*
|
||||
var llamaCppEmbed embed.FS
|
||||
|
||||
var (
|
||||
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
|
||||
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
|
||||
)
|
||||
|
||||
var (
|
||||
ggmlInit sync.Once
|
||||
ggmlRunnerPath string
|
||||
)
|
||||
|
||||
func osPath(llamaPath string) string {
|
||||
if runtime.GOOS == "windows" {
|
||||
return path.Join(llamaPath, "Release")
|
||||
}
|
||||
|
||||
return llamaPath
|
||||
}
|
||||
|
||||
func initGGML() {
|
||||
ggmlInit.Do(func() {
|
||||
tmpDir, err := os.MkdirTemp("", "llama-*")
|
||||
if err != nil {
|
||||
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
|
||||
}
|
||||
|
||||
llamaPath := osPath(ggmlGPU)
|
||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||
llamaPath = osPath(ggmlCPU)
|
||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||
log.Fatalf("llama.cpp executable not found")
|
||||
}
|
||||
}
|
||||
|
||||
files := []string{"server"}
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
files = []string{"server.exe"}
|
||||
case "darwin":
|
||||
if llamaPath == osPath(ggmlGPU) {
|
||||
files = append(files, "ggml-metal.metal")
|
||||
}
|
||||
}
|
||||
|
||||
for _, f := range files {
|
||||
srcPath := path.Join(llamaPath, f)
|
||||
destPath := filepath.Join(tmpDir, f)
|
||||
|
||||
srcFile, err := llamaCppEmbed.Open(srcPath)
|
||||
if err != nil {
|
||||
log.Fatalf("read llama.cpp %s: %v", f, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
|
||||
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
log.Fatalf("write llama.cpp %s: %v", f, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
log.Fatalf("copy llama.cpp %s: %v", f, err)
|
||||
}
|
||||
}
|
||||
|
||||
ggmlRunnerPath = filepath.Join(tmpDir, "server")
|
||||
if runtime.GOOS == "windows" {
|
||||
ggmlRunnerPath = filepath.Join(tmpDir, "server.exe")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
type ModelRunner struct {
|
||||
Path string // path to the model runner executable
|
||||
}
|
||||
|
||||
func ggmlRunner() ModelRunner {
|
||||
initGGML()
|
||||
return ModelRunner{Path: ggmlRunnerPath}
|
||||
func chooseRunners(workDir, runnerType string) []ModelRunner {
|
||||
buildPath := path.Join("llama.cpp", runnerType, "build")
|
||||
var runners []string
|
||||
|
||||
// set the runners based on the OS
|
||||
// IMPORTANT: the order of the runners in the array is the priority order
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
runners = []string{
|
||||
path.Join(buildPath, "metal", "bin", "server"),
|
||||
path.Join(buildPath, "cpu", "bin", "server"),
|
||||
}
|
||||
case "linux":
|
||||
runners = []string{
|
||||
path.Join(buildPath, "cuda", "bin", "server"),
|
||||
path.Join(buildPath, "cpu", "bin", "server"),
|
||||
}
|
||||
case "windows":
|
||||
// TODO: select windows GPU runner here when available
|
||||
runners = []string{
|
||||
path.Join(buildPath, "cpu", "bin", "Release", "server.exe"),
|
||||
}
|
||||
default:
|
||||
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
|
||||
runners = []string{
|
||||
path.Join(buildPath, "cpu", "bin", "server"),
|
||||
}
|
||||
}
|
||||
|
||||
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
|
||||
for _, r := range runners {
|
||||
// find all the files in the runner's bin directory
|
||||
files, err := fs.Glob(llamaCppEmbed, filepath.Join(filepath.Dir(r), "*"))
|
||||
if err != nil {
|
||||
// this is expected, ollama may be compiled without all runners packed in
|
||||
log.Printf("%s runner not found: %v", r, err)
|
||||
continue
|
||||
}
|
||||
runnerAvailable = true
|
||||
|
||||
for _, f := range files {
|
||||
srcFile, err := llamaCppEmbed.Open(f)
|
||||
if err != nil {
|
||||
log.Fatalf("read llama runner %s: %v", f, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
|
||||
// create the directory in case it does not exist
|
||||
destPath := filepath.Join(workDir, filepath.Dir(f))
|
||||
if err := os.MkdirAll(destPath, 0o755); err != nil {
|
||||
log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
|
||||
}
|
||||
|
||||
destFile := filepath.Join(destPath, filepath.Base(f))
|
||||
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
log.Fatalf("write llama runner %s: %v", f, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
log.Fatalf("copy llama runner %s: %v", f, err)
|
||||
}
|
||||
case err != nil:
|
||||
log.Fatalf("stat llama runner %s: %v", f, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !runnerAvailable {
|
||||
log.Fatalf("%s runner not found", runnerType)
|
||||
}
|
||||
|
||||
// return the runners to try in priority order
|
||||
localRunnersByPriority := []ModelRunner{}
|
||||
for _, r := range runners {
|
||||
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: path.Join(workDir, r)})
|
||||
}
|
||||
|
||||
return localRunnersByPriority
|
||||
}
|
||||
|
||||
type llamaModel struct {
|
||||
hyperparameters llamaHyperparameters
|
||||
}
|
||||
|
||||
func (llm *llamaModel) ModelFamily() ModelFamily {
|
||||
return ModelFamilyLlama
|
||||
func (llm *llamaModel) ModelFamily() string {
|
||||
return "llama"
|
||||
}
|
||||
|
||||
func (llm *llamaModel) ModelType() ModelType {
|
||||
switch llm.hyperparameters.NumLayer {
|
||||
func llamaModelType(numLayer uint32) string {
|
||||
switch numLayer {
|
||||
case 26:
|
||||
return ModelType3B
|
||||
return "3B"
|
||||
case 32:
|
||||
return ModelType7B
|
||||
return "7B"
|
||||
case 40:
|
||||
return ModelType13B
|
||||
return "13B"
|
||||
case 48:
|
||||
return ModelType34B
|
||||
return "34B"
|
||||
case 60:
|
||||
return ModelType30B
|
||||
return "30B"
|
||||
case 80:
|
||||
return ModelType65B
|
||||
return "65B"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
// TODO: find a better default
|
||||
return ModelType7B
|
||||
}
|
||||
|
||||
func (llm *llamaModel) FileType() FileType {
|
||||
return llm.hyperparameters.FileType
|
||||
func (llm *llamaModel) ModelType() string {
|
||||
return llamaModelType(llm.hyperparameters.NumLayer)
|
||||
}
|
||||
|
||||
func (llm *llamaModel) FileType() string {
|
||||
return fileType(llm.hyperparameters.FileType)
|
||||
}
|
||||
|
||||
func (llm *llamaModel) NumLayers() int64 {
|
||||
return int64(llm.hyperparameters.NumLayer)
|
||||
}
|
||||
|
||||
type llamaHyperparameters struct {
|
||||
@@ -157,70 +170,7 @@ type llamaHyperparameters struct {
|
||||
NumRot uint32
|
||||
|
||||
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
|
||||
FileType llamaFileType
|
||||
}
|
||||
|
||||
type llamaFileType uint32
|
||||
|
||||
const (
|
||||
llamaFileTypeF32 llamaFileType = iota
|
||||
llamaFileTypeF16
|
||||
llamaFileTypeQ4_0
|
||||
llamaFileTypeQ4_1
|
||||
llamaFileTypeQ4_1_F16
|
||||
llamaFileTypeQ8_0 llamaFileType = iota + 2
|
||||
llamaFileTypeQ5_0
|
||||
llamaFileTypeQ5_1
|
||||
llamaFileTypeQ2_K
|
||||
llamaFileTypeQ3_K_S
|
||||
llamaFileTypeQ3_K_M
|
||||
llamaFileTypeQ3_K_L
|
||||
llamaFileTypeQ4_K_S
|
||||
llamaFileTypeQ4_K_M
|
||||
llamaFileTypeQ5_K_S
|
||||
llamaFileTypeQ5_K_M
|
||||
llamaFileTypeQ6_K
|
||||
)
|
||||
|
||||
func (ft llamaFileType) String() string {
|
||||
switch ft {
|
||||
case llamaFileTypeF32:
|
||||
return "F32"
|
||||
case llamaFileTypeF16:
|
||||
return "F16"
|
||||
case llamaFileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case llamaFileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case llamaFileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case llamaFileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case llamaFileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case llamaFileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case llamaFileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case llamaFileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case llamaFileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case llamaFileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case llamaFileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case llamaFileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case llamaFileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case llamaFileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case llamaFileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
FileType uint32
|
||||
}
|
||||
|
||||
type Running struct {
|
||||
@@ -234,12 +184,66 @@ type llama struct {
|
||||
Running
|
||||
}
|
||||
|
||||
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
var errNoGPU = errors.New("nvidia-smi command failed")
|
||||
|
||||
// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
|
||||
func CheckVRAM() (int, error) {
|
||||
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
|
||||
var stdout bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
return 0, errNoGPU
|
||||
}
|
||||
|
||||
if _, err := os.Stat(runner.Path); err != nil {
|
||||
var total int
|
||||
scanner := bufio.NewScanner(&stdout)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
vram, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
|
||||
}
|
||||
|
||||
total += vram
|
||||
}
|
||||
|
||||
return total, nil
|
||||
}
|
||||
|
||||
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
|
||||
if opts.NumGPU != -1 {
|
||||
return opts.NumGPU
|
||||
}
|
||||
n := 1 // default to enable metal on macOS
|
||||
if runtime.GOOS == "linux" {
|
||||
vramMib, err := CheckVRAM()
|
||||
if err != nil {
|
||||
if err.Error() != "nvidia-smi command failed" {
|
||||
log.Print(err.Error())
|
||||
}
|
||||
// nvidia driver not installed or no nvidia GPU found
|
||||
return 0
|
||||
}
|
||||
|
||||
totalVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
|
||||
|
||||
// Calculate bytes per layer
|
||||
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
|
||||
bytesPerLayer := fileSizeBytes / numLayer
|
||||
|
||||
// set n to the max number of layers we can fit in VRAM
|
||||
return int(totalVramBytes / bytesPerLayer)
|
||||
|
||||
log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)
|
||||
}
|
||||
// default to enable metal on macOS
|
||||
return 1
|
||||
}
|
||||
|
||||
func newLlama(model string, adapters []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
|
||||
fileInfo, err := os.Stat(model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -250,14 +254,17 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
params := []string{
|
||||
"--model", model,
|
||||
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
|
||||
"--gqa", fmt.Sprintf("%d", opts.NumGQA),
|
||||
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
|
||||
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
|
||||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||
"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
|
||||
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
|
||||
"--embedding",
|
||||
}
|
||||
|
||||
if opts.NumGQA > 0 {
|
||||
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
|
||||
}
|
||||
|
||||
if len(adapters) > 0 {
|
||||
// TODO: applying multiple adapters is not supported by the llama.cpp server yet
|
||||
params = append(params, "--lora", adapters[0])
|
||||
@@ -281,7 +288,12 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
}
|
||||
|
||||
// start the llama.cpp server with a retry in case the port is already in use
|
||||
for try := 0; try < 3; try++ {
|
||||
for _, runner := range runners {
|
||||
if _, err := os.Stat(runner.Path); err != nil {
|
||||
log.Printf("llama runner not found: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cmd := exec.CommandContext(
|
||||
@@ -289,67 +301,70 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
runner.Path,
|
||||
append(params, "--port", strconv.Itoa(port))...,
|
||||
)
|
||||
cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
|
||||
cmd.Stdout = os.Stderr
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
|
||||
|
||||
log.Print("starting llama runner")
|
||||
if err := llm.Cmd.Start(); err != nil {
|
||||
log.Printf("error starting the external llama runner: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// monitor the command, it is blocking, so if it exits we need to capture that
|
||||
go func() {
|
||||
err := llm.Cmd.Wait() // this will block until the command exits
|
||||
if err != nil {
|
||||
log.Printf("llama runner exited with error: %v", err)
|
||||
} else {
|
||||
log.Printf("llama runner exited")
|
||||
}
|
||||
}()
|
||||
|
||||
if err := waitForServer(llm); err != nil {
|
||||
log.Printf("error starting llama.cpp server: %v", err)
|
||||
log.Printf("error starting llama runner: %v", err)
|
||||
llm.Close()
|
||||
// try again
|
||||
continue
|
||||
}
|
||||
|
||||
// server started successfully
|
||||
return llm, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("max retry exceeded starting llama.cpp")
|
||||
return nil, fmt.Errorf("failed to start a llama runner")
|
||||
}
|
||||
|
||||
func waitForServer(llm *llama) error {
|
||||
log.Print("starting llama.cpp server")
|
||||
var stderr bytes.Buffer
|
||||
llm.Cmd.Stderr = &stderr
|
||||
err := llm.Cmd.Start()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error starting the external llama.cpp server: %w", err)
|
||||
}
|
||||
|
||||
exitChan := make(chan error, 1)
|
||||
|
||||
// the server is a long running process, watch for it exiting to keep track of something going wrong
|
||||
go func() {
|
||||
err := llm.Cmd.Wait()
|
||||
log.Print(stderr.String())
|
||||
exitChan <- err
|
||||
}()
|
||||
|
||||
// wait for the server to start responding
|
||||
start := time.Now()
|
||||
expiresAt := time.Now().Add(30 * time.Second)
|
||||
ticker := time.NewTicker(100 * time.Millisecond)
|
||||
expiresAt := time.Now().Add(2 * time.Minute) // be generous with timeout, large models can take a while to load
|
||||
ticker := time.NewTicker(200 * time.Millisecond)
|
||||
|
||||
log.Print("waiting for llama.cpp server to start responding")
|
||||
log.Print("waiting for llama runner to start responding")
|
||||
for range ticker.C {
|
||||
if time.Now().After(expiresAt) {
|
||||
return fmt.Errorf("llama runner did not start within alloted time, retrying")
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
if time.Now().After(expiresAt) {
|
||||
return fmt.Errorf("llama.cpp server did not start responding within 30 seconds, retrying")
|
||||
}
|
||||
if err := llm.Ping(context.Background()); err == nil {
|
||||
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
|
||||
return nil
|
||||
}
|
||||
case err := <-exitChan:
|
||||
return fmt.Errorf("llama.cpp server exited unexpectedly: %w", err)
|
||||
// check if the server process has terminated
|
||||
if llm.Cmd.ProcessState != nil && llm.Cmd.ProcessState.Exited() {
|
||||
return fmt.Errorf("llama runner process has terminated")
|
||||
}
|
||||
|
||||
if err := llm.Ping(context.Background()); err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("llama runner started in %f seconds", time.Since(start).Seconds())
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *llama) Close() {
|
||||
llm.Running.Cmd.Cancel()
|
||||
llm.Cancel()
|
||||
}
|
||||
|
||||
func (llm *llama) SetOptions(opts api.Options) {
|
||||
@@ -676,7 +691,7 @@ func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error
|
||||
|
||||
// Ping checks that the server subprocess is still running and responding to requests
|
||||
func (llm *llama) Ping(ctx context.Context) error {
|
||||
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Running.Port))
|
||||
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port))
|
||||
if err != nil {
|
||||
return fmt.Errorf("ping resp: %w", err)
|
||||
}
|
48
llm/llm.go
48
llm/llm.go
@@ -21,7 +21,7 @@ type LLM interface {
|
||||
Ping(context.Context) error
|
||||
}
|
||||
|
||||
func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -32,15 +32,22 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
ggml, err := DecodeGGML(f, ModelFamilyLlama)
|
||||
ggml, err := DecodeGGML(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch ggml.FileType().String() {
|
||||
case "F32", "Q5_0", "Q5_1", "Q8_0":
|
||||
switch ggml.FileType() {
|
||||
case "Q8_0":
|
||||
if ggml.Name() != "gguf" && opts.NumGPU != 0 {
|
||||
// GGML Q8_0 do not support Metal API and will
|
||||
// cause the runner to segmentation fault so disable GPU
|
||||
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
|
||||
opts.NumGPU = 0
|
||||
}
|
||||
case "F32", "Q5_0", "Q5_1":
|
||||
if opts.NumGPU != 0 {
|
||||
// F32, F16, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
|
||||
// F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
|
||||
// cause the runner to segmentation fault so disable GPU
|
||||
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
|
||||
opts.NumGPU = 0
|
||||
@@ -49,35 +56,44 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
|
||||
totalResidentMemory := memory.TotalMemory()
|
||||
switch ggml.ModelType() {
|
||||
case ModelType3B, ModelType7B:
|
||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 {
|
||||
case "3B", "7B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
|
||||
} else if totalResidentMemory < 8*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 8GB of memory")
|
||||
}
|
||||
case ModelType13B:
|
||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 {
|
||||
case "13B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
|
||||
} else if totalResidentMemory < 16*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 16GB of memory")
|
||||
}
|
||||
case ModelType30B, ModelType34B:
|
||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 {
|
||||
case "30B", "34B", "40B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
|
||||
} else if totalResidentMemory < 32*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 32GB of memory")
|
||||
}
|
||||
case ModelType65B:
|
||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 {
|
||||
case "65B", "70B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
|
||||
} else if totalResidentMemory < 64*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 64GB of memory")
|
||||
}
|
||||
case "180B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
|
||||
} else if totalResidentMemory < 128*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 128GB of memory")
|
||||
}
|
||||
}
|
||||
|
||||
switch ggml.ModelFamily() {
|
||||
case ModelFamilyLlama:
|
||||
return newLlama(model, adapters, ggmlRunner(), opts)
|
||||
switch ggml.Name() {
|
||||
case "gguf":
|
||||
opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
|
||||
return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
|
||||
case "ggml", "ggmf", "ggjt", "ggla":
|
||||
return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
|
||||
}
|
||||
|
@@ -8,7 +8,7 @@ GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
|
||||
# build universal binary
|
||||
GOARCH=arm64 go generate ./...
|
||||
GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
|
||||
rm -rf llm/llama.cpp/ggml/build/*/bin
|
||||
rm -rf llm/llama.cpp/*/build/*/bin
|
||||
GOARCH=amd64 go generate ./...
|
||||
GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
|
||||
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
|
||||
|
12
scripts/build_linux.sh
Executable file
12
scripts/build_linux.sh
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
mkdir -p dist
|
||||
|
||||
for ARCH in arm64 amd64; do
|
||||
docker buildx build --platform=linux/$ARCH -f Dockerfile.build . -t builder:$ARCH --load
|
||||
docker create --platform linux/$ARCH --name builder builder:$ARCH
|
||||
docker cp builder:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$ARCH
|
||||
docker rm builder
|
||||
done
|
227
scripts/install.sh
Normal file
227
scripts/install.sh
Normal file
@@ -0,0 +1,227 @@
|
||||
#!/bin/sh
|
||||
# This script installs Ollama on Linux.
|
||||
# It detects the current operating system architecture and installs the appropriate version of Ollama.
|
||||
|
||||
set -eu
|
||||
|
||||
status() { echo ">>> $*" >&2; }
|
||||
error() { echo "ERROR $*"; exit 1; }
|
||||
warning() { echo "WARNING: $*"; }
|
||||
|
||||
TEMP_DIR=$(mktemp -d)
|
||||
cleanup() { rm -rf $TEMP_DIR; }
|
||||
trap cleanup EXIT
|
||||
|
||||
available() { command -v $1 >/dev/null; }
|
||||
require() {
|
||||
local MISSING=''
|
||||
for TOOL in $*; do
|
||||
if ! available $TOOL; then
|
||||
MISSING="$MISSING $TOOL"
|
||||
fi
|
||||
done
|
||||
|
||||
echo $MISSING
|
||||
}
|
||||
|
||||
[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
|
||||
|
||||
case "$(uname -m)" in
|
||||
x86_64) ARCH="amd64" ;;
|
||||
aarch64|arm64) ARCH="arm64" ;;
|
||||
*) error "Unsupported architecture: $ARCH" ;;
|
||||
esac
|
||||
|
||||
SUDO=
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
# Running as root, no need for sudo
|
||||
if ! available sudo; then
|
||||
error "This script requires superuser permissions. Please re-run as root."
|
||||
fi
|
||||
|
||||
SUDO="sudo"
|
||||
fi
|
||||
|
||||
NEEDS=$(require curl awk grep sed tee xargs)
|
||||
if [ -n "$NEEDS" ]; then
|
||||
status "ERROR: The following tools are required but missing:"
|
||||
for NEED in $NEEDS; do
|
||||
echo " - $NEED"
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
|
||||
status "Downloading ollama..."
|
||||
$SUDO curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.ai/download/ollama-linux-$ARCH"
|
||||
|
||||
status "Installing ollama to /usr/bin..."
|
||||
$SUDO install -o0 -g0 -m755 -d /usr/bin
|
||||
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama /usr/bin/ollama
|
||||
|
||||
install_success() { status 'Install complete. Run "ollama" from the command line.'; }
|
||||
trap install_success EXIT
|
||||
|
||||
# Everything from this point onwards is optional.
|
||||
|
||||
configure_systemd() {
|
||||
if ! id ollama >/dev/null 2>&1; then
|
||||
status "Creating ollama user..."
|
||||
$SUDO useradd -r -s /bin/false -m -d /usr/share/ollama ollama
|
||||
fi
|
||||
|
||||
status "Creating ollama systemd service..."
|
||||
cat <<EOF | $SUDO tee /etc/systemd/system/ollama.service >/dev/null
|
||||
[Unit]
|
||||
Description=Ollama Service
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
ExecStart=/usr/bin/ollama serve
|
||||
User=ollama
|
||||
Group=ollama
|
||||
Restart=always
|
||||
RestartSec=3
|
||||
Environment="HOME=/usr/share/ollama"
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
EOF
|
||||
SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
|
||||
case $SYSTEMCTL_RUNNING in
|
||||
running|degraded)
|
||||
status "Enabling and starting ollama service..."
|
||||
$SUDO systemctl daemon-reload
|
||||
$SUDO systemctl enable ollama
|
||||
$SUDO systemctl restart ollama
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
if available systemctl; then
|
||||
configure_systemd
|
||||
fi
|
||||
|
||||
if ! available lspci && ! available lshw; then
|
||||
warning "Unable to detect NVIDIA GPU. Install lspci or lshw to automatically detect and install NVIDIA CUDA drivers."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
check_gpu() {
|
||||
case $1 in
|
||||
lspci) available lspci && lspci -d '10de:' | grep -q 'NVIDIA' || return 1 ;;
|
||||
lshw) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
|
||||
nvidia-smi) available nvidia-smi || return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
if ! check_gpu lspci && ! check_gpu lshw; then
|
||||
warning "No NVIDIA GPU detected. Ollama will run in CPU-only mode."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
|
||||
install_cuda_driver_yum() {
|
||||
status 'Installing NVIDIA repository...'
|
||||
case $PACKAGE_MANAGER in
|
||||
yum)
|
||||
$SUDO $PACKAGE_MANAGER -y install yum-utils
|
||||
$SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
|
||||
;;
|
||||
dnf)
|
||||
$SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
|
||||
;;
|
||||
esac
|
||||
|
||||
case $1 in
|
||||
rhel)
|
||||
status 'Installing EPEL repository...'
|
||||
# EPEL is required for third-party dependencies such as dkms and libvdpau
|
||||
$SUDO $PACKAGE_MANAGER -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-$2.noarch.rpm || true
|
||||
;;
|
||||
esac
|
||||
|
||||
status 'Installing CUDA driver...'
|
||||
|
||||
if [ "$1" = 'centos' ] || [ "$1$2" = 'rhel7' ]; then
|
||||
$SUDO $PACKAGE_MANAGER -y install nvidia-driver-latest-dkms
|
||||
fi
|
||||
|
||||
$SUDO $PACKAGE_MANAGER -y install cuda-drivers
|
||||
}
|
||||
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#ubuntu
|
||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
|
||||
install_cuda_driver_apt() {
|
||||
status 'Installing NVIDIA repository...'
|
||||
curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
|
||||
|
||||
case $1 in
|
||||
debian)
|
||||
status 'Enabling contrib sources...'
|
||||
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | sudo tee /etc/apt/sources.list.d/contrib.list > /dev/null
|
||||
;;
|
||||
esac
|
||||
|
||||
status 'Installing CUDA driver...'
|
||||
$SUDO dpkg -i $TEMP_DIR/cuda-keyring.deb
|
||||
$SUDO apt-get update
|
||||
|
||||
[ -n "$SUDO" ] && SUDO_E="$SUDO -E" || SUDO_E=
|
||||
DEBIAN_FRONTEND=noninteractive $SUDO_E apt-get -y install cuda-drivers -q
|
||||
}
|
||||
|
||||
if [ ! -f "/etc/os-release" ]; then
|
||||
error "Unknown distribution. Skipping CUDA installation."
|
||||
fi
|
||||
|
||||
. /etc/os-release
|
||||
|
||||
OS_NAME=$ID
|
||||
OS_VERSION=$VERSION_ID
|
||||
|
||||
PACKAGE_MANAGER=
|
||||
for PACKAGE_MANAGER in dnf yum apt-get; do
|
||||
if available $PACKAGE_MANAGER; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -z "$PACKAGE_MANAGER" ]; then
|
||||
error "Unknown package manager. Skipping CUDA installation."
|
||||
fi
|
||||
|
||||
if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
|
||||
case $OS_NAME in
|
||||
centos|rhel) install_cuda_driver_yum 'rhel' $OS_VERSION ;;
|
||||
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
|
||||
fedora) install_cuda_driver_yum $OS_NAME $OS_VERSION ;;
|
||||
amzn) install_cuda_driver_yum 'fedora' '35' ;;
|
||||
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
|
||||
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
|
||||
*) exit ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
if ! lsmod | grep -q nvidia; then
|
||||
KERNEL_RELEASE="$(uname -r)"
|
||||
case $OS_NAME in
|
||||
centos|rhel|rocky|fedora|amzn) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE kernel-headers-$KERNEL_RELEASE ;;
|
||||
debian|ubuntu) $SUDO apt-get -y install linux-headers-$KERNEL_RELEASE ;;
|
||||
*) exit ;;
|
||||
esac
|
||||
|
||||
NVIDIA_CUDA_VERSION=$($SUDO dkms status | awk -F: '/added/ { print $1 }')
|
||||
if [ -n "$NVIDIA_CUDA_VERSION" ]; then
|
||||
$SUDO dkms install $NVIDIA_CUDA_VERSION
|
||||
fi
|
||||
|
||||
if lsmod | grep -q nouveau; then
|
||||
status "Removing nouveau..."
|
||||
$SUDO rmmod nouveau
|
||||
fi
|
||||
|
||||
$SUDO modprobe nvidia
|
||||
fi
|
@@ -14,7 +14,7 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -71,7 +71,7 @@ func (r AuthRedirect) URL() (*url.URL, error) {
|
||||
return redirectURL, nil
|
||||
}
|
||||
|
||||
func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *RegistryOptions) (string, error) {
|
||||
func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
|
||||
redirectURL, err := redirData.URL()
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -82,7 +82,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
|
||||
return "", err
|
||||
}
|
||||
|
||||
keyPath := path.Join(home, ".ollama", "id_ed25519")
|
||||
keyPath := filepath.Join(home, ".ollama", "id_ed25519")
|
||||
|
||||
rawKey, err := os.ReadFile(keyPath)
|
||||
if err != nil {
|
||||
@@ -103,7 +103,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
|
||||
|
||||
headers := make(http.Header)
|
||||
headers.Set("Authorization", sig)
|
||||
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, regOpts)
|
||||
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, nil)
|
||||
if err != nil {
|
||||
log.Printf("couldn't get token: %q", err)
|
||||
}
|
||||
|
@@ -8,7 +8,7 @@ import (
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -173,7 +173,7 @@ func doDownload(ctx context.Context, opts downloadOpts, f *FileDownload) error {
|
||||
return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
err = os.MkdirAll(path.Dir(f.FilePath), 0o700)
|
||||
err = os.MkdirAll(filepath.Dir(f.FilePath), 0o700)
|
||||
if err != nil {
|
||||
return fmt.Errorf("make blobs directory: %w", err)
|
||||
}
|
||||
|
231
server/images.go
231
server/images.go
@@ -14,7 +14,6 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"runtime"
|
||||
@@ -114,10 +113,11 @@ type LayerReader struct {
|
||||
}
|
||||
|
||||
type ConfigV2 struct {
|
||||
ModelFamily llm.ModelFamily `json:"model_family"`
|
||||
ModelType string `json:"model_type"`
|
||||
FileType string `json:"file_type"`
|
||||
RootFS RootFS `json:"rootfs"`
|
||||
ModelFormat string `json:"model_format"`
|
||||
ModelFamily string `json:"model_family"`
|
||||
ModelType string `json:"model_type"`
|
||||
FileType string `json:"file_type"`
|
||||
RootFS RootFS `json:"rootfs"`
|
||||
|
||||
// required by spec
|
||||
Architecture string `json:"architecture"`
|
||||
@@ -267,7 +267,30 @@ func filenameWithPath(path, f string) (string, error) {
|
||||
return f, nil
|
||||
}
|
||||
|
||||
func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error {
|
||||
func CreateModel(ctx context.Context, workDir, name string, path string, fn func(resp api.ProgressResponse)) error {
|
||||
mp := ParseModelPath(name)
|
||||
|
||||
var manifest *ManifestV2
|
||||
var err error
|
||||
var noprune string
|
||||
|
||||
// build deleteMap to prune unused layers
|
||||
deleteMap := make(map[string]bool)
|
||||
|
||||
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||
manifest, _, err = GetManifest(mp)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
|
||||
if manifest != nil {
|
||||
for _, l := range manifest.Layers {
|
||||
deleteMap[l.Digest] = true
|
||||
}
|
||||
deleteMap[manifest.Config.Digest] = true
|
||||
}
|
||||
}
|
||||
|
||||
mf, err := os.Open(path)
|
||||
if err != nil {
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)})
|
||||
@@ -328,14 +351,15 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ggml, err := llm.DecodeGGML(file, llm.ModelFamilyLlama)
|
||||
ggml, err := llm.DecodeGGML(file)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
config.ModelFormat = ggml.Name()
|
||||
config.ModelFamily = ggml.ModelFamily()
|
||||
config.ModelType = ggml.ModelType().String()
|
||||
config.FileType = ggml.FileType().String()
|
||||
config.ModelType = ggml.ModelType()
|
||||
config.FileType = ggml.FileType()
|
||||
|
||||
// reset the file
|
||||
file.Seek(0, io.SeekStart)
|
||||
@@ -366,9 +390,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
return err
|
||||
}
|
||||
|
||||
// copie the model metadata
|
||||
// copy the model metadata
|
||||
config.ModelFamily = source.ModelFamily
|
||||
config.ModelType = source.ModelType
|
||||
config.ModelFormat = source.ModelFormat
|
||||
config.FileType = source.FileType
|
||||
|
||||
for _, l := range mf.Layers {
|
||||
@@ -435,8 +460,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
return err
|
||||
}
|
||||
|
||||
layer.MediaType = mediaType
|
||||
layers = append(layers, layer)
|
||||
if layer.Size > 0 {
|
||||
layer.MediaType = mediaType
|
||||
layers = append(layers, layer)
|
||||
}
|
||||
case "template", "system", "prompt":
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)})
|
||||
// remove the layer if one exists
|
||||
@@ -448,8 +475,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
return err
|
||||
}
|
||||
|
||||
layer.MediaType = mediaType
|
||||
layers = append(layers, layer)
|
||||
if layer.Size > 0 {
|
||||
layer.MediaType = mediaType
|
||||
layers = append(layers, layer)
|
||||
}
|
||||
default:
|
||||
// runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences)
|
||||
params[c.Name] = append(params[c.Name], c.Args)
|
||||
@@ -472,6 +501,12 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
}
|
||||
}
|
||||
|
||||
if config.ModelType == "65B" {
|
||||
if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
|
||||
config.ModelType = "70B"
|
||||
}
|
||||
}
|
||||
|
||||
bts, err := json.Marshal(formattedParams)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -489,7 +524,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
}
|
||||
|
||||
// generate the embedding layers
|
||||
embeddingLayers, err := embeddingLayers(embed)
|
||||
embeddingLayers, err := embeddingLayers(workDir, embed)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -503,6 +538,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
var manifestLayers []*Layer
|
||||
for _, l := range layers {
|
||||
manifestLayers = append(manifestLayers, &l.Layer)
|
||||
delete(deleteMap, l.Layer.Digest)
|
||||
}
|
||||
|
||||
// Create a layer for the config object
|
||||
@@ -512,6 +548,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
return err
|
||||
}
|
||||
layers = append(layers, cfg)
|
||||
delete(deleteMap, cfg.Layer.Digest)
|
||||
|
||||
if err := SaveLayers(layers, fn, false); err != nil {
|
||||
return err
|
||||
@@ -524,6 +561,14 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
return err
|
||||
}
|
||||
|
||||
if noprune == "" {
|
||||
fn(api.ProgressResponse{Status: "removing any unused layers"})
|
||||
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "success"})
|
||||
return nil
|
||||
}
|
||||
@@ -536,7 +581,7 @@ type EmbeddingParams struct {
|
||||
}
|
||||
|
||||
// embeddingLayers loads the associated LLM and generates the embeddings to be stored from an input file
|
||||
func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
|
||||
func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error) {
|
||||
layers := []*LayerReader{}
|
||||
if len(e.files) > 0 {
|
||||
// check if the model is a file path or a model name
|
||||
@@ -549,7 +594,7 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
|
||||
model = &Model{ModelPath: e.model}
|
||||
}
|
||||
|
||||
if err := load(context.Background(), model, e.opts, defaultSessionDuration); err != nil {
|
||||
if err := load(context.Background(), workDir, model, e.opts, defaultSessionDuration); err != nil {
|
||||
return nil, fmt.Errorf("load model to generate embeddings: %v", err)
|
||||
}
|
||||
|
||||
@@ -779,14 +824,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
|
||||
return nil, fmt.Errorf("invalid float value %s", vals)
|
||||
}
|
||||
|
||||
out[key] = floatVal
|
||||
out[key] = float32(floatVal)
|
||||
case reflect.Int:
|
||||
intVal, err := strconv.ParseInt(vals[0], 10, 0)
|
||||
intVal, err := strconv.ParseInt(vals[0], 10, 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid int value %s", vals)
|
||||
}
|
||||
|
||||
out[key] = intVal
|
||||
out[key] = int(intVal)
|
||||
case reflect.Bool:
|
||||
boolVal, err := strconv.ParseBool(vals[0])
|
||||
if err != nil {
|
||||
@@ -866,18 +911,7 @@ func CopyModel(src, dest string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func DeleteModel(name string) error {
|
||||
mp := ParseModelPath(name)
|
||||
manifest, _, err := GetManifest(mp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
deleteMap := make(map[string]bool)
|
||||
for _, layer := range manifest.Layers {
|
||||
deleteMap[layer.Digest] = true
|
||||
}
|
||||
deleteMap[manifest.Config.Digest] = true
|
||||
|
||||
func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dryRun bool) error {
|
||||
fp, err := GetManifestPath()
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -894,14 +928,13 @@ func DeleteModel(name string) error {
|
||||
fmp := ParseModelPath(tag)
|
||||
|
||||
// skip the manifest we're trying to delete
|
||||
if mp.GetFullTagname() == fmp.GetFullTagname() {
|
||||
if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// save (i.e. delete from the deleteMap) any files used in other manifests
|
||||
manifest, _, err := GetManifest(fmp)
|
||||
if err != nil {
|
||||
log.Printf("skipping file: %s", fp)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -925,14 +958,72 @@ func DeleteModel(name string) error {
|
||||
log.Printf("couldn't get file path for '%s': %v", k, err)
|
||||
continue
|
||||
}
|
||||
if err := os.Remove(fp); err != nil {
|
||||
log.Printf("couldn't remove file '%s': %v", fp, err)
|
||||
continue
|
||||
if !dryRun {
|
||||
if err := os.Remove(fp); err != nil {
|
||||
log.Printf("couldn't remove file '%s': %v", fp, err)
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
log.Printf("wanted to remove: %s", fp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fp, err = mp.GetManifestPath(false)
|
||||
return nil
|
||||
}
|
||||
|
||||
func PruneLayers() error {
|
||||
deleteMap := make(map[string]bool)
|
||||
p, err := GetBlobsPath("")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
blobs, err := os.ReadDir(p)
|
||||
if err != nil {
|
||||
log.Printf("couldn't read dir '%s': %v", p, err)
|
||||
return err
|
||||
}
|
||||
|
||||
for _, blob := range blobs {
|
||||
name := blob.Name()
|
||||
if runtime.GOOS == "windows" {
|
||||
name = strings.ReplaceAll(name, "-", ":")
|
||||
}
|
||||
deleteMap[name] = true
|
||||
}
|
||||
|
||||
log.Printf("total blobs: %d", len(deleteMap))
|
||||
|
||||
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Printf("total unused blobs removed: %d", len(deleteMap))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func DeleteModel(name string) error {
|
||||
mp := ParseModelPath(name)
|
||||
manifest, _, err := GetManifest(mp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
deleteMap := make(map[string]bool)
|
||||
for _, layer := range manifest.Layers {
|
||||
deleteMap[layer.Digest] = true
|
||||
}
|
||||
deleteMap[manifest.Config.Digest] = true
|
||||
|
||||
err = deleteUnusedLayers(&mp, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fp, err := mp.GetManifestPath(false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -1063,14 +1154,14 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
Total: layer.Size,
|
||||
})
|
||||
|
||||
location, err := startUpload(ctx, mp, layer, regOpts)
|
||||
location, chunkSize, err := startUpload(ctx, mp, layer, regOpts)
|
||||
if err != nil {
|
||||
log.Printf("couldn't start upload: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
if strings.HasPrefix(path.Base(location.Path), "sha256:") {
|
||||
layer.Digest = path.Base(location.Path)
|
||||
if strings.HasPrefix(filepath.Base(location.Path), "sha256:") {
|
||||
layer.Digest = filepath.Base(location.Path)
|
||||
fn(api.ProgressResponse{
|
||||
Status: "using existing layer",
|
||||
Digest: layer.Digest,
|
||||
@@ -1080,7 +1171,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
continue
|
||||
}
|
||||
|
||||
if err := uploadBlobChunked(ctx, location, layer, regOpts, fn); err != nil {
|
||||
if err := uploadBlob(ctx, location, layer, chunkSize, regOpts, fn); err != nil {
|
||||
log.Printf("error uploading blob: %v", err)
|
||||
return err
|
||||
}
|
||||
@@ -1111,13 +1202,34 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
|
||||
mp := ParseModelPath(name)
|
||||
|
||||
var manifest *ManifestV2
|
||||
var err error
|
||||
var noprune string
|
||||
|
||||
// build deleteMap to prune unused layers
|
||||
deleteMap := make(map[string]bool)
|
||||
|
||||
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||
manifest, _, err = GetManifest(mp)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
|
||||
if manifest != nil {
|
||||
for _, l := range manifest.Layers {
|
||||
deleteMap[l.Digest] = true
|
||||
}
|
||||
deleteMap[manifest.Config.Digest] = true
|
||||
}
|
||||
}
|
||||
|
||||
if mp.ProtocolScheme == "http" && !regOpts.Insecure {
|
||||
return fmt.Errorf("insecure protocol http")
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "pulling manifest"})
|
||||
|
||||
manifest, err := pullModelManifest(ctx, mp, regOpts)
|
||||
manifest, err = pullModelManifest(ctx, mp, regOpts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("pull model manifest: %s", err)
|
||||
}
|
||||
@@ -1137,7 +1249,9 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
delete(deleteMap, layer.Digest)
|
||||
}
|
||||
delete(deleteMap, manifest.Config.Digest)
|
||||
|
||||
fn(api.ProgressResponse{Status: "verifying sha256 digest"})
|
||||
for _, layer := range layers {
|
||||
@@ -1175,6 +1289,14 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
return err
|
||||
}
|
||||
|
||||
if noprune == "" {
|
||||
fn(api.ProgressResponse{Status: "removing any unused layers"})
|
||||
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "success"})
|
||||
|
||||
return nil
|
||||
@@ -1275,7 +1397,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
|
||||
case resp.StatusCode == http.StatusUnauthorized:
|
||||
auth := resp.Header.Get("www-authenticate")
|
||||
authRedir := ParseAuthRedirectString(auth)
|
||||
token, err := getAuthToken(ctx, authRedir, regOpts)
|
||||
token, err := getAuthToken(ctx, authRedir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -1300,7 +1422,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
|
||||
}
|
||||
|
||||
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
|
||||
if requestURL.Scheme != "http" && regOpts.Insecure {
|
||||
if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
|
||||
requestURL.Scheme = "http"
|
||||
}
|
||||
|
||||
@@ -1313,14 +1435,25 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
|
||||
req.Header = headers
|
||||
}
|
||||
|
||||
if regOpts.Token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+regOpts.Token)
|
||||
} else if regOpts.Username != "" && regOpts.Password != "" {
|
||||
req.SetBasicAuth(regOpts.Username, regOpts.Password)
|
||||
if regOpts != nil {
|
||||
if regOpts.Token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+regOpts.Token)
|
||||
} else if regOpts.Username != "" && regOpts.Password != "" {
|
||||
req.SetBasicAuth(regOpts.Username, regOpts.Password)
|
||||
}
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
||||
|
||||
if s := req.Header.Get("Content-Length"); s != "" {
|
||||
contentLength, err := strconv.ParseInt(s, 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req.ContentLength = contentLength
|
||||
}
|
||||
|
||||
client := &http.Client{
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
|
@@ -133,7 +133,12 @@ func GetBlobsPath(digest string) (string, error) {
|
||||
}
|
||||
|
||||
path := filepath.Join(home, ".ollama", "models", "blobs", digest)
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
||||
dirPath := filepath.Dir(path)
|
||||
if digest == "" {
|
||||
dirPath = path
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(dirPath, 0o755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
|
104
server/routes.go
104
server/routes.go
@@ -12,6 +12,7 @@ import (
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -57,7 +58,7 @@ var loaded struct {
|
||||
var defaultSessionDuration = 5 * time.Minute
|
||||
|
||||
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
|
||||
func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
|
||||
func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
|
||||
opts := api.DefaultOptions()
|
||||
if err := opts.FromMap(model.Options); err != nil {
|
||||
log.Printf("could not load model options: %v", err)
|
||||
@@ -93,7 +94,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
|
||||
loaded.Embeddings = model.Embeddings
|
||||
}
|
||||
|
||||
llmModel, err := llm.New(model.ModelPath, model.AdapterPaths, opts)
|
||||
llmModel, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, opts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -129,6 +130,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
|
||||
llmModel.SetOptions(opts)
|
||||
}
|
||||
}
|
||||
|
||||
loaded.expireAt = time.Now().Add(sessionDuration)
|
||||
|
||||
if loaded.expireTimer == nil {
|
||||
@@ -149,6 +151,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
|
||||
loaded.digest = ""
|
||||
})
|
||||
}
|
||||
|
||||
loaded.expireTimer.Reset(sessionDuration)
|
||||
return nil
|
||||
}
|
||||
@@ -171,8 +174,11 @@ func GenerateHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
sessionDuration := defaultSessionDuration // TODO: set this duration from the request if specified
|
||||
if err := load(c.Request.Context(), model, req.Options, sessionDuration); err != nil {
|
||||
workDir := c.GetString("workDir")
|
||||
|
||||
// TODO: set this duration from the request if specified
|
||||
sessionDuration := defaultSessionDuration
|
||||
if err := load(c.Request.Context(), workDir, model, req.Options, sessionDuration); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
@@ -217,8 +223,13 @@ func GenerateHandler(c *gin.Context) {
|
||||
ch <- r
|
||||
}
|
||||
|
||||
if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
// an empty request loads the model
|
||||
if req.Prompt == "" && req.Template == "" && req.System == "" {
|
||||
ch <- api.GenerateResponse{Model: req.Model, Done: true}
|
||||
} else {
|
||||
if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -240,7 +251,9 @@ func EmbeddingHandler(c *gin.Context) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
if err := load(c.Request.Context(), model, req.Options, 5*time.Minute); err != nil {
|
||||
|
||||
workDir := c.GetString("workDir")
|
||||
if err := load(c.Request.Context(), workDir, model, req.Options, 5*time.Minute); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
@@ -330,6 +343,8 @@ func CreateModelHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
workDir := c.GetString("workDir")
|
||||
|
||||
ch := make(chan any)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
@@ -340,7 +355,7 @@ func CreateModelHandler(c *gin.Context) {
|
||||
ctx, cancel := context.WithCancel(c.Request.Context())
|
||||
defer cancel()
|
||||
|
||||
if err := CreateModel(ctx, req.Name, req.Path, fn); err != nil {
|
||||
if err := CreateModel(ctx, workDir, req.Name, req.Path, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}()
|
||||
@@ -363,6 +378,7 @@ func DeleteModelHandler(c *gin.Context) {
|
||||
}
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusOK, nil)
|
||||
}
|
||||
|
||||
func ShowModelHandler(c *gin.Context) {
|
||||
@@ -493,33 +509,40 @@ func CopyModelHandler(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
func Serve(ln net.Listener, origins []string) error {
|
||||
var defaultAllowOrigins = []string{
|
||||
"localhost",
|
||||
"127.0.0.1",
|
||||
"0.0.0.0",
|
||||
}
|
||||
|
||||
func Serve(ln net.Listener, allowOrigins []string) error {
|
||||
config := cors.DefaultConfig()
|
||||
config.AllowWildcard = true
|
||||
config.AllowOrigins = append(origins, []string{
|
||||
"http://localhost",
|
||||
"http://localhost:*",
|
||||
"https://localhost",
|
||||
"https://localhost:*",
|
||||
"http://127.0.0.1",
|
||||
"http://127.0.0.1:*",
|
||||
"https://127.0.0.1",
|
||||
"https://127.0.0.1:*",
|
||||
"http://0.0.0.0",
|
||||
"http://0.0.0.0:*",
|
||||
"https://0.0.0.0",
|
||||
"https://0.0.0.0:*",
|
||||
}...)
|
||||
|
||||
config.AllowOrigins = allowOrigins
|
||||
for _, allowOrigin := range defaultAllowOrigins {
|
||||
config.AllowOrigins = append(config.AllowOrigins,
|
||||
fmt.Sprintf("http://%s", allowOrigin),
|
||||
fmt.Sprintf("https://%s", allowOrigin),
|
||||
fmt.Sprintf("http://%s:*", allowOrigin),
|
||||
fmt.Sprintf("https://%s:*", allowOrigin),
|
||||
)
|
||||
}
|
||||
|
||||
workDir, err := os.MkdirTemp("", "ollama")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer os.RemoveAll(workDir)
|
||||
|
||||
r := gin.Default()
|
||||
r.Use(cors.New(config))
|
||||
|
||||
r.GET("/", func(c *gin.Context) {
|
||||
c.String(http.StatusOK, "Ollama is running")
|
||||
})
|
||||
r.HEAD("/", func(c *gin.Context) {
|
||||
c.Status(http.StatusOK)
|
||||
})
|
||||
r.Use(
|
||||
cors.New(config),
|
||||
func(c *gin.Context) {
|
||||
c.Set("workDir", workDir)
|
||||
c.Next()
|
||||
},
|
||||
)
|
||||
|
||||
r.POST("/api/pull", PullModelHandler)
|
||||
r.POST("/api/generate", GenerateHandler)
|
||||
@@ -527,10 +550,17 @@ func Serve(ln net.Listener, origins []string) error {
|
||||
r.POST("/api/create", CreateModelHandler)
|
||||
r.POST("/api/push", PushModelHandler)
|
||||
r.POST("/api/copy", CopyModelHandler)
|
||||
r.GET("/api/tags", ListModelsHandler)
|
||||
r.DELETE("/api/delete", DeleteModelHandler)
|
||||
r.POST("/api/show", ShowModelHandler)
|
||||
|
||||
for _, method := range []string{http.MethodGet, http.MethodHead} {
|
||||
r.Handle(method, "/", func(c *gin.Context) {
|
||||
c.String(http.StatusOK, "Ollama is running")
|
||||
})
|
||||
|
||||
r.Handle(method, "/api/tags", ListModelsHandler)
|
||||
}
|
||||
|
||||
log.Printf("Listening on %s", ln.Addr())
|
||||
s := &http.Server{
|
||||
Handler: r,
|
||||
@@ -538,15 +568,23 @@ func Serve(ln net.Listener, origins []string) error {
|
||||
|
||||
// listen for a ctrl+c and stop any loaded llm
|
||||
signals := make(chan os.Signal, 1)
|
||||
signal.Notify(signals, syscall.SIGINT)
|
||||
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
|
||||
go func() {
|
||||
<-signals
|
||||
if loaded.llm != nil {
|
||||
loaded.llm.Close()
|
||||
}
|
||||
os.RemoveAll(workDir)
|
||||
os.Exit(0)
|
||||
}()
|
||||
|
||||
if runtime.GOOS == "linux" {
|
||||
// check compatibility to log warnings
|
||||
if _, err := llm.CheckVRAM(); err != nil {
|
||||
log.Printf("Warning: GPU support may not enabled, check you have installed install GPU drivers: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return s.Serve(ln)
|
||||
}
|
||||
|
||||
|
209
server/upload.go
209
server/upload.go
@@ -14,7 +14,12 @@ import (
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, error) {
|
||||
const (
|
||||
redirectChunkSize = 1024 * 1024 * 1024
|
||||
regularChunkSize = 95 * 1024 * 1024
|
||||
)
|
||||
|
||||
func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, int64, error) {
|
||||
requestURL := mp.BaseURL()
|
||||
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/")
|
||||
if layer.From != "" {
|
||||
@@ -27,20 +32,26 @@ func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *Regis
|
||||
resp, err := makeRequestWithRetry(ctx, "POST", requestURL, nil, nil, regOpts)
|
||||
if err != nil {
|
||||
log.Printf("couldn't start upload: %v", err)
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Extract UUID location from header
|
||||
location := resp.Header.Get("Location")
|
||||
location := resp.Header.Get("Docker-Upload-Location")
|
||||
chunkSize := redirectChunkSize
|
||||
if location == "" {
|
||||
return nil, fmt.Errorf("location header is missing in response")
|
||||
location = resp.Header.Get("Location")
|
||||
chunkSize = regularChunkSize
|
||||
}
|
||||
|
||||
return url.Parse(location)
|
||||
locationURL, err := url.Parse(location)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
return locationURL, int64(chunkSize), nil
|
||||
}
|
||||
|
||||
func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
|
||||
func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSize int64, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
|
||||
// TODO allow resumability
|
||||
// TODO allow canceling uploads via DELETE
|
||||
|
||||
@@ -55,8 +66,12 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// 95MB chunk size
|
||||
chunkSize := 95 * 1024 * 1024
|
||||
pw := ProgressWriter{
|
||||
status: fmt.Sprintf("uploading %s", layer.Digest),
|
||||
digest: layer.Digest,
|
||||
total: layer.Size,
|
||||
fn: fn,
|
||||
}
|
||||
|
||||
for offset := int64(0); offset < int64(layer.Size); {
|
||||
chunk := int64(layer.Size) - offset
|
||||
@@ -64,80 +79,27 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
||||
chunk = int64(chunkSize)
|
||||
}
|
||||
|
||||
sectionReader := io.NewSectionReader(f, int64(offset), chunk)
|
||||
for try := 0; try < MaxRetries; try++ {
|
||||
r, w := io.Pipe()
|
||||
defer r.Close()
|
||||
go func() {
|
||||
defer w.Close()
|
||||
resp, err := uploadBlobChunk(ctx, http.MethodPatch, requestURL, f, offset, chunk, regOpts, &pw)
|
||||
if err != nil {
|
||||
fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("error uploading chunk: %v", err),
|
||||
Digest: layer.Digest,
|
||||
Total: layer.Size,
|
||||
Completed: int(offset),
|
||||
})
|
||||
|
||||
for chunked := int64(0); chunked < chunk; {
|
||||
n, err := io.CopyN(w, sectionReader, 1024*1024)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("error reading chunk: %v", err),
|
||||
Digest: layer.Digest,
|
||||
Total: layer.Size,
|
||||
Completed: int(offset),
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
offset += chunk
|
||||
location := resp.Header.Get("Docker-Upload-Location")
|
||||
if location == "" {
|
||||
location = resp.Header.Get("Location")
|
||||
}
|
||||
|
||||
chunked += n
|
||||
fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("uploading %s", layer.Digest),
|
||||
Digest: layer.Digest,
|
||||
Total: layer.Size,
|
||||
Completed: int(offset) + int(chunked),
|
||||
})
|
||||
}
|
||||
}()
|
||||
|
||||
headers := make(http.Header)
|
||||
headers.Set("Content-Type", "application/octet-stream")
|
||||
headers.Set("Content-Length", strconv.Itoa(int(chunk)))
|
||||
headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
|
||||
resp, err := makeRequest(ctx, "PATCH", requestURL, headers, r, regOpts)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("error uploading chunk: %v", err),
|
||||
Digest: layer.Digest,
|
||||
Total: layer.Size,
|
||||
Completed: int(offset),
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
switch {
|
||||
case resp.StatusCode == http.StatusUnauthorized:
|
||||
auth := resp.Header.Get("www-authenticate")
|
||||
authRedir := ParseAuthRedirectString(auth)
|
||||
token, err := getAuthToken(ctx, authRedir, regOpts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
regOpts.Token = token
|
||||
if _, err := sectionReader.Seek(0, io.SeekStart); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
continue
|
||||
case resp.StatusCode >= http.StatusBadRequest:
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
offset += sectionReader.Size()
|
||||
requestURL, err = url.Parse(resp.Header.Get("Location"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
break
|
||||
requestURL, err = url.Parse(location)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -163,3 +125,90 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func uploadBlobChunk(ctx context.Context, method string, requestURL *url.URL, r io.ReaderAt, offset, limit int64, opts *RegistryOptions, pw *ProgressWriter) (*http.Response, error) {
|
||||
sectionReader := io.NewSectionReader(r, int64(offset), limit)
|
||||
|
||||
headers := make(http.Header)
|
||||
headers.Set("Content-Type", "application/octet-stream")
|
||||
headers.Set("Content-Length", strconv.Itoa(int(limit)))
|
||||
headers.Set("X-Redirect-Uploads", "1")
|
||||
|
||||
if method == http.MethodPatch {
|
||||
headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
|
||||
}
|
||||
|
||||
for try := 0; try < MaxRetries; try++ {
|
||||
resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sectionReader, pw), opts)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
switch {
|
||||
case resp.StatusCode == http.StatusTemporaryRedirect:
|
||||
location, err := resp.Location()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pw.completed = int(offset)
|
||||
if _, err := uploadBlobChunk(ctx, http.MethodPut, location, r, offset, limit, nil, pw); err != nil {
|
||||
// retry
|
||||
log.Printf("retrying redirected upload: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
return resp, nil
|
||||
case resp.StatusCode == http.StatusUnauthorized:
|
||||
auth := resp.Header.Get("www-authenticate")
|
||||
authRedir := ParseAuthRedirectString(auth)
|
||||
token, err := getAuthToken(ctx, authRedir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
opts.Token = token
|
||||
|
||||
pw.completed = int(offset)
|
||||
sectionReader = io.NewSectionReader(r, offset, limit)
|
||||
continue
|
||||
case resp.StatusCode >= http.StatusBadRequest:
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("max retries exceeded")
|
||||
}
|
||||
|
||||
type ProgressWriter struct {
|
||||
status string
|
||||
digest string
|
||||
bucket int
|
||||
completed int
|
||||
total int
|
||||
fn func(api.ProgressResponse)
|
||||
}
|
||||
|
||||
func (pw *ProgressWriter) Write(b []byte) (int, error) {
|
||||
n := len(b)
|
||||
pw.bucket += n
|
||||
pw.completed += n
|
||||
|
||||
// throttle status updates to not spam the client
|
||||
if pw.bucket >= 1024*1024 || pw.completed >= pw.total {
|
||||
pw.fn(api.ProgressResponse{
|
||||
Status: pw.status,
|
||||
Digest: pw.digest,
|
||||
Total: pw.total,
|
||||
Completed: pw.completed,
|
||||
})
|
||||
|
||||
pw.bucket = 0
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
Reference in New Issue
Block a user