Compare commits

...

69 Commits

Author SHA1 Message Date
Patrick Devine
9a483dc7b7 refactor to use client.List instead of walking the filesystem 2024-01-26 18:34:42 -08:00
Patrick Devine
366b38460f fix linter 2024-01-26 18:34:42 -08:00
Patrick Devine
021b1bdc4a add --upgrade-all flag to refresh any stale models 2024-01-26 18:34:40 -08:00
Patrick Devine
b5cf31b460 add keep_alive to generate/chat/embedding api endpoints (#2146) 2024-01-26 14:28:02 -08:00
Daniel Hiltgen
cc4915e262 Merge pull request #2214 from dhiltgen/reject_cuda_without_avx
Detect lack of AVX and fallback to CPU mode
2024-01-26 12:06:44 -08:00
Daniel Hiltgen
667a2ba18a Detect lack of AVX and fallback to CPU mode
We build the GPU libraries with AVX enabled so that, when not all
layers fit on the GPU, mixed GPU/CPU mode still performs well.
If the user is on a virtualization or emulation system that lacks AVX,
this previously resulted in an illegal instruction error and a crash.
Now we report a warning in the server log and fall back to
CPU mode to ensure we don't crash.
2024-01-26 11:36:03 -08:00
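A minimal sketch of the idea behind this change, not Ollama's actual code: check for AVX support at startup and, when it is missing, skip the AVX-built GPU libraries and fall back to CPU mode with a warning. The `golang.org/x/sys/cpu` usage below is an assumption for illustration.

```go
// Illustrative sketch of gating GPU support on AVX availability.
package main

import (
	"log/slog"

	"golang.org/x/sys/cpu"
)

// cpuVariant mirrors the notion of a "CPU variant" string: empty means the
// CPU lacks even AVX, so AVX-built GPU libraries cannot be loaded safely.
// Note: cpu.X86 flags are only populated on x86/amd64 hosts.
func cpuVariant() string {
	switch {
	case cpu.X86.HasAVX2:
		return "avx2"
	case cpu.X86.HasAVX:
		return "avx"
	default:
		return ""
	}
}

func main() {
	variant := cpuVariant()
	if variant == "" {
		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
		// continue in CPU-only mode instead of dlopen-ing AVX-built GPU libraries
		return
	}
	slog.Info("AVX detected, GPU acceleration may be used", "variant", variant)
}
```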
Michael Yang
e054ebe059 Merge pull request #2212 from ollama/mxyng/fix-build
fix build
2024-01-26 11:19:08 -08:00
Michael Yang
9d3dcfd0ec fix logging 2024-01-26 11:04:27 -08:00
Michael Yang
6e0ea5ecc8 Merge pull request #1916 from ollama/mxyng/inactivity-monitor
download: add inactivity monitor
2024-01-26 10:56:00 -08:00
Daniel Hiltgen
a47d8b2557 Merge pull request #2197 from dhiltgen/remove_rocm_image
Add back ROCm container support
2024-01-26 09:34:23 -08:00
Daniel Hiltgen
30c43c285c Merge pull request #2195 from dhiltgen/rocm_real_gpus
Ignore AMD integrated GPUs
2024-01-26 09:30:24 -08:00
Daniel Hiltgen
23a7ea593b Merge pull request #2209 from dhiltgen/harden_mgmt
Fix crash on cuda ml init failure
2024-01-26 09:30:13 -08:00
Daniel Hiltgen
75c44aa319 Add back ROCm container support
This adds ROCm support back as a discrete image.
2024-01-26 09:24:29 -08:00
Daniel Hiltgen
9d7b5d6c91 Ignore AMD integrated GPUs
Detect and ignore integrated GPUs reported by rocm.
2024-01-26 09:21:35 -08:00
Daniel Hiltgen
5d9c4a5f5a Fix crash on cuda ml init failure
The new driver lookup code was triggering after init failure due to a missing return
2024-01-26 09:18:33 -08:00
Daniel Hiltgen
197e420a97 Merge pull request #2196 from dhiltgen/remove_rocm_image
Switch back to ubuntu base
2024-01-25 16:50:32 -08:00
Daniel Hiltgen
a34e1ad3cf Switch back to ubuntu base
The size increase for rocm support in the standard image is problematic.
We'll revisit multiple tags for rocm support in a follow-up PR.
2024-01-25 16:46:01 -08:00
Michael Yang
2ae0556292 Merge pull request #1679 from ollama/mxyng/build-gpus
build cuda and rocm
2024-01-25 16:38:14 -08:00
Jeffrey Morgan
5be9bdd444 Update modelfile.md 2024-01-25 16:29:48 -08:00
Jeffrey Morgan
b706794905 Update modelfile.md to include MESSAGE 2024-01-25 16:29:32 -08:00
Michael Yang
a8c5413d06 only generate gpu libs 2024-01-25 15:41:56 -08:00
Michael Yang
5580de4571 archive ollama binaries 2024-01-25 15:40:16 -08:00
Michael Yang
946431d5b0 build cuda and rocm 2024-01-25 15:40:15 -08:00
Michael Yang
0610126049 remove env setting 2024-01-25 15:39:43 -08:00
Jeffrey Morgan
3ebd6a83fc update submodule to cd4fddb29f81d6a1f6d51a0c016bc6b486d68def 2024-01-25 13:54:11 -08:00
Jeffrey Morgan
a64570dcae Fix clearing kv cache between requests with the same prompt (#2186)
* Fix clearing kv cache between requests with the same prompt

* fix powershell script
2024-01-25 13:46:20 -08:00
Patrick Devine
7c40a67841 Save and load sessions (#2063) 2024-01-25 12:12:36 -08:00
Michael Yang
e64b5b07a2 Merge pull request #2181 from ollama/mxyng/stub-lint
stub generate outputs for lint
2024-01-25 11:55:15 -08:00
Michael Yang
9e1e295cdc Merge pull request #2175 from ollama/mxyng/refactor-tensor-read
refactor tensor read
2024-01-25 09:22:42 -08:00
Jeffrey Morgan
a643823f86 Update README.md 2024-01-24 21:36:56 -08:00
Michael Yang
8e5d359a03 stub generate outputs for lint 2024-01-24 17:36:10 -08:00
Daniel Hiltgen
a170888dd4 Merge pull request #2174 from dhiltgen/rocm_real_gpus
More logging for gpu management
2024-01-24 11:09:17 -08:00
Michael Yang
cd22855ef8 refactor tensor read 2024-01-24 10:48:31 -08:00
Daniel Hiltgen
013fd07139 More logging for gpu management
Fix an ordering glitch of dlerr/dlclose and add more logging to help
root cause some crashes users are hitting. This also refines the
function pointer names to use the underlying function names instead
of simplified names for readability.
2024-01-24 10:32:36 -08:00
Daniel Hiltgen
f63dc2db5c Merge pull request #2162 from dhiltgen/rocm_real_gpus
Report more information about GPUs in verbose mode
2024-01-23 17:45:40 -08:00
Jeffrey Morgan
eaa5a396d9 Update README.md 2024-01-23 16:08:15 -08:00
Jeffrey Morgan
8ed22f5d72 Update README.md 2024-01-23 14:38:01 -08:00
Daniel Hiltgen
987c16b2f7 Report more information about GPUs in verbose mode
This adds additional calls to both CUDA and ROCm management libraries to
discover additional attributes about the GPU(s) detected in the system, and
wires up runtime verbosity selection.  When users hit problems with GPUs we can
ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results.
2024-01-23 11:37:02 -08:00
Jeffrey Morgan
950f636d64 Update README.md 2024-01-23 10:29:10 -08:00
Jeffrey Morgan
4458efb73a Load all layers on arm64 macOS if model is small enough (#2149) 2024-01-22 17:40:06 -08:00
Daniel Hiltgen
ceea599494 Merge pull request #2150 from dhiltgen/default_version
Set a default version using git describe
2024-01-22 17:38:27 -08:00
Daniel Hiltgen
3005ec74b3 Set a default version using git describe
If a VERSION is not specified, this will generate a version string that
represents the state of the repo. For example, `0.1.21-12-gffaf52e-dirty`
means 12 commits past the 0.1.21 tag, at commit ffaf52e (the `g` prefix
marks it as a git hash), with uncommitted changes in the tree.
2024-01-22 17:12:20 -08:00
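Roughly, the default version comes from `git describe` against the current checkout; the exact flags used by the build scripts are not shown here, so the following Go sketch is an assumption for illustration.

```go
// Illustrative only: derive a default version string from `git describe`
// when no VERSION is provided, yielding values like "0.1.21-12-gffaf52e-dirty".
package main

import (
	"fmt"
	"os"
	"os/exec"
	"strings"
)

func defaultVersion() string {
	if v := os.Getenv("VERSION"); v != "" {
		return v // explicit VERSION always wins
	}
	out, err := exec.Command("git", "describe", "--tags", "--dirty").Output()
	if err != nil {
		return "0.0.0" // fallback when not in a git checkout or no tags exist
	}
	return strings.TrimSpace(string(out))
}

func main() {
	fmt.Println(defaultVersion())
}
```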
Daniel Hiltgen
0759d8996e Merge pull request #2148 from dhiltgen/intel_mac
Refine Accelerate usage on mac
2024-01-22 16:56:58 -08:00
Daniel Hiltgen
0f5b843319 Refine Accelerate usage on mac
For old Macs, Accelerate seems to cause crashes, but for
AVX2-capable Macs it does not.
2024-01-22 16:25:56 -08:00
Jeffrey Morgan
ffaf52e1e9 update submodule to 011e8ec577fd135cbc02993d3ea9840c516d6a1c 2024-01-22 15:16:54 -08:00
Michael Yang
940b10b036 Merge pull request #2144 from jmorganca/mxyng/update-faq
faq: update to use launchctl setenv
2024-01-22 13:46:57 -08:00
Daniel Hiltgen
3bc28736cd Merge pull request #2143 from dhiltgen/llm_verbosity
Refine debug logging for llm
2024-01-22 13:19:16 -08:00
Michael Yang
93a756266c faq: update to use launchctl setenv 2024-01-22 13:10:13 -08:00
Daniel Hiltgen
a0a829bf7a Merge pull request #2142 from dhiltgen/debug_on_fail
Debug logging on init failure
2024-01-22 12:29:22 -08:00
Daniel Hiltgen
730dcfcc7a Refine debug logging for llm
This wires up logging in llama.cpp to always go to stderr, and also
turns up logging if OLLAMA_DEBUG is set.
2024-01-22 12:26:49 -08:00
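The mechanism is an environment-variable check at startup: logs go to stderr, and verbosity is raised when `OLLAMA_DEBUG` is set. The `slog` wiring below is an illustrative sketch, not the repo's exact code.

```go
// Illustrative sketch: log to stderr and raise verbosity when OLLAMA_DEBUG is set.
package main

import (
	"log/slog"
	"os"
)

func newLogger() *slog.Logger {
	level := slog.LevelInfo
	if os.Getenv("OLLAMA_DEBUG") != "" {
		level = slog.LevelDebug
	}
	// Always log to stderr so output is not interleaved with model responses.
	return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: level}))
}

func main() {
	slog.SetDefault(newLogger())
	slog.Debug("debug logging enabled") // only printed when OLLAMA_DEBUG is set
	slog.Info("server starting")
}
```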
Daniel Hiltgen
27a2d5af54 Debug logging on init failure 2024-01-22 12:08:22 -08:00
Jeffrey Morgan
5f81a33f43 update submodule to 6f9939d (#2115) 2024-01-22 11:56:40 -08:00
Michael Yang
6225fde046 Merge pull request #2102 from jmorganca/mxyng/fix-create-override
fix: remove overwritten model layers
2024-01-22 09:37:48 -08:00
Meng Zhuo
069184562b readline: drop not use min function (#2134) 2024-01-22 08:15:08 -08:00
Daniel Hiltgen
5576bb2348 Merge pull request #2130 from dhiltgen/more_faster
Make CPU builds parallel and customizable AMD GPUs
2024-01-21 16:14:12 -08:00
Daniel Hiltgen
2738837786 Merge pull request #2131 from dhiltgen/probe_cards_at_init
Probe GPUs before backend init
2024-01-21 16:13:47 -08:00
Daniel Hiltgen
ec3764538d Probe GPUs before backend init
Detect potential error scenarios so we can fallback to CPU mode without
hitting asserts.
2024-01-21 15:59:38 -08:00
Daniel Hiltgen
df54c723ae Make CPU builds parallel and customizable AMD GPUs
The Linux build now supports parallel CPU builds to speed things up.
This also exposes AMD GPU targets as an optional setting for advanced
users who want to alter our default set.
2024-01-21 15:12:21 -08:00
Daniel Hiltgen
fa8c990e58 Merge pull request #2127 from dhiltgen/rocm_container
Combine the 2 Dockerfiles and add ROCm
2024-01-21 11:49:01 -08:00
Daniel Hiltgen
da72235ebf Combine the 2 Dockerfiles and add ROCm
This renames Dockerfile.build to Dockerfile, and adds some new stages
to support 2 modes of building - the build_linux.sh script uses
intermediate stages to extract the artifacts for ./dist, and the default
build generates a container image usable by both cuda and rocm cards.
This required transitioning the x86 base to the rocm image to avoid
layer bloat.
2024-01-21 11:37:11 -08:00
Jeffrey Morgan
89c4aee29e Unlock mutex when failing to load model (#2117) 2024-01-20 20:54:46 -05:00
Jeffrey Morgan
f32ea81b21 increase minimum overhead to 1024MiB (#2114) 2024-01-20 17:11:38 -05:00
Jeffrey Morgan
4c54f0ddeb sign dylibs on macOS (#2101) 2024-01-19 19:24:11 -05:00
Michael Yang
c08dfaa23d fix: remove overwritten model layers
if create overrides a manifest, first add the older manifest's layers to
the delete map so they can be cleaned up
2024-01-19 14:58:37 -08:00
Daniel Hiltgen
3b76e736ae Merge pull request #2100 from dhiltgen/more_wsl_globs
More WSL paths
2024-01-19 13:41:08 -08:00
Daniel Hiltgen
552db98bf1 More WSL paths 2024-01-19 13:23:29 -08:00
Daniel Hiltgen
fdcdfef620 Merge pull request #2099 from dhiltgen/fix_cuda_model_swap
Switch to local dlopen symbols
2024-01-19 12:22:04 -08:00
Daniel Hiltgen
6a042438af Switch to local dlopen symbols 2024-01-19 11:37:02 -08:00
Michael Yang
27331ae3a8 download: add inactivity monitor
if a download part is inactive for some time, restart it
2024-01-12 15:23:15 -08:00
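A hedged sketch of such a monitor, assuming a download part exposes a byte counter; the timeout value and restart hook are placeholders, not the ones used by the actual download code.

```go
// Illustrative sketch: restart a download part if it makes no progress for a while.
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

type part struct {
	completed atomic.Int64 // bytes downloaded so far; updated by the download loop
}

// monitor calls restart when the completed byte count has not advanced
// within maxInactivity.
func monitor(p *part, maxInactivity time.Duration, restart func()) {
	last := p.completed.Load()
	ticker := time.NewTicker(maxInactivity)
	defer ticker.Stop()
	for range ticker.C {
		cur := p.completed.Load()
		if cur == last {
			restart()
			return
		}
		last = cur
	}
}

func main() {
	p := &part{}
	go monitor(p, 5*time.Second, func() { fmt.Println("part stalled, restarting") })
	p.completed.Add(1024) // simulated progress, then the part goes quiet
	time.Sleep(12 * time.Second)
}
```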
39 changed files with 1311 additions and 478 deletions

View File

@@ -23,29 +23,72 @@ jobs:
with: with:
go-version: '1.21' go-version: '1.21'
cache: true cache: true
- if: ${{ startsWith(matrix.os, 'windows-') }}
shell: pwsh
run: |
$path = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
if ($path) {
$path = join-path $path 'Common7\Tools\vsdevcmd.bat'
if (test-path $path) {
cmd /s /c """$path"" $args && set" | where { $_ -match '(\w+)=(.*)' } | foreach {
echo "$($Matches[1])=$($Matches[2])" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
}
}
}
echo "C:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
- run: go get ./... - run: go get ./...
- run: go generate -x ./... - run: go generate -x ./...
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: | path: llm/llama.cpp/build/**/lib/*
llm/llama.cpp/build/**/lib/* generate-cuda:
strategy:
matrix:
cuda-version:
- '11.8.0'
runs-on: ubuntu-latest
container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04
steps:
- run: |
apt-get update && apt-get install -y git build-essential curl
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
| tar -zx -C /usr --strip-components 1
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
git config --global --add safe.directory /__w/ollama/ollama
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
- uses: actions/upload-artifact@v4
with:
name: cuda-${{ matrix.cuda-version }}-libraries
path: llm/llama.cpp/build/**/lib/*
generate-rocm:
strategy:
matrix:
rocm-version:
- '5.7.1'
- '6.0'
runs-on: ubuntu-latest
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
- run: |
apt-get update && apt-get install -y git build-essential curl rocm-libs
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
| tar -zx -C /usr --strip-components 1
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
git config --global --add safe.directory /__w/ollama/ollama
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
- uses: actions/upload-artifact@v4
with:
name: rocm-${{ matrix.rocm-version }}-libraries
path: llm/llama.cpp/build/**/lib/*
lint: lint:
needs: generate
strategy: strategy:
matrix: matrix:
os: [ubuntu-latest, macos-latest, windows-latest] os: [ubuntu-latest, macos-latest, windows-latest]
@@ -69,10 +112,19 @@ jobs:
with: with:
go-version: '1.21' go-version: '1.21'
cache: false cache: false
- uses: actions/download-artifact@v4 - run: |
with: mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries touch llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/stub.so
path: llm/llama.cpp/build if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/stub.dylib
touch llm/llama.cpp/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }}
- run: |
mkdir -p llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }}
- uses: golangci/golangci-lint-action@v3 - uses: golangci/golangci-lint-action@v3
test: test:
needs: generate needs: generate
@@ -104,3 +156,7 @@ jobs:
path: llm/llama.cpp/build path: llm/llama.cpp/build
- run: go build - run: go build
- run: go test -v ./... - run: go test -v ./...
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-binaries
path: ollama

View File

@@ -1,27 +1,135 @@
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 ARG GOLANG_VERSION=1.21.3
ARG CMAKE_VERSION=3.22.1
ARG CUDA_VERSION=11.3.1
ARG TARGETARCH # Copy the minimal context we need to run the generate scripts
ARG GOFLAGS="'-ldflags=-w -s'" FROM scratch AS llm-code
COPY .git .git
COPY .gitmodules .gitmodules
COPY llm llm
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/jmorganca/ollama WORKDIR /go/src/github.com/jmorganca/ollama
RUN apt-get update && apt-get install -y git build-essential cmake
ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
COPY . . COPY . .
ENV GOARCH=$TARGETARCH COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ENV GOFLAGS=$GOFLAGS COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
RUN /usr/local/go/bin/go generate ./... \ COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
&& /usr/local/go/bin/go build . COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build .
FROM ubuntu:22.04 # Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build .
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
RUN apt-get update && apt-get install -y ca-certificates RUN apt-get update && apt-get install -y ca-certificates
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
EXPOSE 11434 EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0 ENV OLLAMA_HOST 0.0.0.0
# set some environment variable for better NVIDIA compatibility ENTRYPOINT ["/bin/ollama"]
ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin CMD ["serve"]
FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

View File

@@ -1,99 +0,0 @@
ARG GOLANG_VERSION=1.21.3
ARG CMAKE_VERSION=3.22.1
ARG CUDA_VERSION=11.3.1
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
COPY .git .git
COPY .gitmodules .gitmodules
COPY llm llm
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
ARG CMAKE_VERSION
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
ARG CMAKE_VERSION
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
ARG CMAKE_VERSION
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
ARG CMAKE_VERSION
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
RUN sh gen_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1
ARG GOFLAGS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
RUN go build .
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
ARG GOFLAGS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
RUN go build .
FROM build-$TARGETARCH

View File

@@ -1,8 +1,5 @@
<div align="center"> <div align="center">
<picture> <img alt="ollama" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
<source media="(prefers-color-scheme: dark)" height="200px" srcset="https://github.com/jmorganca/ollama/assets/3325447/56ea1849-1284-4645-8970-956de6e51c3c">
<img alt="logo" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</picture>
</div> </div>
# Ollama # Ollama
@@ -31,6 +28,11 @@ curl https://ollama.ai/install.sh | sh
The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub. The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub.
### Libraries
- [ollama-python](https://github.com/ollama/ollama-python)
- [ollama-js](https://github.com/ollama/ollama-js)
## Quickstart ## Quickstart
To run and chat with [Llama 2](https://ollama.ai/library/llama2): To run and chat with [Llama 2](https://ollama.ai/library/llama2):
@@ -248,10 +250,6 @@ curl http://localhost:11434/api/chat -d '{
See the [API documentation](./docs/api.md) for all endpoints. See the [API documentation](./docs/api.md) for all endpoints.
## Integrations
- [ollama-python](https://github.com/jmorganca/ollama-python)
## Community Integrations ## Community Integrations
### Web & Desktop ### Web & Desktop

View File

@@ -34,24 +34,26 @@ func (e StatusError) Error() string {
type ImageData []byte type ImageData []byte
type GenerateRequest struct { type GenerateRequest struct {
Model string `json:"model"` Model string `json:"model"`
Prompt string `json:"prompt"` Prompt string `json:"prompt"`
System string `json:"system"` System string `json:"system"`
Template string `json:"template"` Template string `json:"template"`
Context []int `json:"context,omitempty"` Context []int `json:"context,omitempty"`
Stream *bool `json:"stream,omitempty"` Stream *bool `json:"stream,omitempty"`
Raw bool `json:"raw,omitempty"` Raw bool `json:"raw,omitempty"`
Format string `json:"format"` Format string `json:"format"`
Images []ImageData `json:"images,omitempty"` KeepAlive *Duration `json:"keep_alive,omitempty"`
Images []ImageData `json:"images,omitempty"`
Options map[string]interface{} `json:"options"` Options map[string]interface{} `json:"options"`
} }
type ChatRequest struct { type ChatRequest struct {
Model string `json:"model"` Model string `json:"model"`
Messages []Message `json:"messages"` Messages []Message `json:"messages"`
Stream *bool `json:"stream,omitempty"` Stream *bool `json:"stream,omitempty"`
Format string `json:"format"` Format string `json:"format"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Options map[string]interface{} `json:"options"` Options map[string]interface{} `json:"options"`
} }
@@ -126,8 +128,9 @@ type Runner struct {
} }
type EmbeddingRequest struct { type EmbeddingRequest struct {
Model string `json:"model"` Model string `json:"model"`
Prompt string `json:"prompt"` Prompt string `json:"prompt"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Options map[string]interface{} `json:"options"` Options map[string]interface{} `json:"options"`
} }
@@ -171,6 +174,7 @@ type ShowResponse struct {
Template string `json:"template,omitempty"` Template string `json:"template,omitempty"`
System string `json:"system,omitempty"` System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"` Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
} }
type CopyRequest struct { type CopyRequest struct {
@@ -179,11 +183,12 @@ type CopyRequest struct {
} }
type PullRequest struct { type PullRequest struct {
Model string `json:"model"` Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"` Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"` Username string `json:"username"`
Password string `json:"password"` Password string `json:"password"`
Stream *bool `json:"stream,omitempty"` Stream *bool `json:"stream,omitempty"`
CurrentDigest string `json:"current_digest,omitempty"`
// Name is deprecated, see Model // Name is deprecated, see Model
Name string `json:"name"` Name string `json:"name"`
@@ -236,6 +241,8 @@ type GenerateResponse struct {
} }
type ModelDetails struct { type ModelDetails struct {
ParentModel string `json:"parent_model"`
Digest string `json:"digest"`
Format string `json:"format"` Format string `json:"format"`
Family string `json:"family"` Family string `json:"family"`
Families []string `json:"families"` Families []string `json:"families"`
@@ -411,14 +418,19 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
case float64: case float64:
if t < 0 { if t < 0 {
t = math.MaxFloat64 t = math.MaxFloat64
d.Duration = time.Duration(t)
} else {
d.Duration = time.Duration(t * float64(time.Second))
} }
d.Duration = time.Duration(t)
case string: case string:
d.Duration, err = time.ParseDuration(t) d.Duration, err = time.ParseDuration(t)
if err != nil { if err != nil {
return err return err
} }
if d.Duration < 0 {
mf := math.MaxFloat64
d.Duration = time.Duration(mf)
}
} }
return nil return nil
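With the `KeepAlive` field added above, a client can control how long the model stays loaded after a request. The sketch below uses this repo's Go client; field names follow the diff, and per the `Duration` change, a negative value maps to the maximum duration, i.e. keep the model loaded indefinitely.

```go
// Illustrative sketch: set keep_alive on a generate request via the Go client.
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/jmorganca/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama2",
		Prompt: "Why is the sky blue?",
		// Keep the model loaded for 10 minutes after this request; a negative
		// duration would keep it loaded indefinitely.
		KeepAlive: &api.Duration{Duration: 10 * time.Minute},
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```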

View File

@@ -357,6 +357,42 @@ func CopyHandler(cmd *cobra.Command, args []string) error {
} }
func PullHandler(cmd *cobra.Command, args []string) error { func PullHandler(cmd *cobra.Command, args []string) error {
upgradeAll, err := cmd.Flags().GetBool("upgrade-all")
if err != nil {
return err
}
if !upgradeAll {
if len(args) == 0 {
return fmt.Errorf("no model specified to pull")
}
return pull(cmd, args[0], "")
}
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
models, err := client.List(cmd.Context())
if err != nil {
return err
}
for _, m := range (*models).Models {
err = pull(cmd, m.Name, "sha256:"+m.Digest)
if err != nil {
if strings.Contains(err.Error(), "file does not exist") {
fmt.Printf("model '%s' is no longer available\n", m.Name)
continue
}
return err
}
}
return nil
}
func pull(cmd *cobra.Command, name string, currentDigest string) error {
insecure, err := cmd.Flags().GetBool("insecure") insecure, err := cmd.Flags().GetBool("insecure")
if err != nil { if err != nil {
return err return err
@@ -368,7 +404,7 @@ func PullHandler(cmd *cobra.Command, args []string) error {
} }
p := progress.NewProgress(os.Stderr) p := progress.NewProgress(os.Stderr)
defer p.Stop() defer p.StopWithoutClear()
bars := make(map[string]*progress.Bar) bars := make(map[string]*progress.Bar)
@@ -402,7 +438,7 @@ func PullHandler(cmd *cobra.Command, args []string) error {
return nil return nil
} }
request := api.PullRequest{Name: args[0], Insecure: insecure} request := api.PullRequest{Name: name, Insecure: insecure, CurrentDigest: currentDigest}
if err := client.Pull(cmd.Context(), &request, fn); err != nil { if err := client.Pull(cmd.Context(), &request, fn); err != nil {
return err return err
} }
@@ -458,15 +494,17 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
type generateContextKey string type generateContextKey string
type runOptions struct { type runOptions struct {
Model string Model string
Prompt string ParentModel string
Messages []api.Message Prompt string
WordWrap bool Messages []api.Message
Format string WordWrap bool
System string Format string
Template string System string
Images []api.ImageData Template string
Options map[string]interface{} Images []api.ImageData
Options map[string]interface{}
MultiModal bool
} }
type displayResponseState struct { type displayResponseState struct {
@@ -882,12 +920,13 @@ func NewCLI() *cobra.Command {
pullCmd := &cobra.Command{ pullCmd := &cobra.Command{
Use: "pull MODEL", Use: "pull MODEL",
Short: "Pull a model from a registry", Short: "Pull a model from a registry",
Args: cobra.ExactArgs(1), Args: cobra.RangeArgs(0, 1),
PreRunE: checkServerHeartbeat, PreRunE: checkServerHeartbeat,
RunE: PullHandler, RunE: PullHandler,
} }
pullCmd.Flags().Bool("insecure", false, "Use an insecure registry") pullCmd.Flags().Bool("insecure", false, "Use an insecure registry")
pullCmd.Flags().Bool("upgrade-all", false, "Upgrade all models if they're out of date")
pushCmd := &cobra.Command{ pushCmd := &cobra.Command{
Use: "push MODEL", Use: "push MODEL",

View File

@@ -7,12 +7,14 @@ import (
"net/http" "net/http"
"os" "os"
"regexp" "regexp"
"sort"
"strings" "strings"
"github.com/spf13/cobra" "github.com/spf13/cobra"
"golang.org/x/exp/slices" "golang.org/x/exp/slices"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/readline" "github.com/jmorganca/ollama/readline"
) )
@@ -25,43 +27,75 @@ const (
MultilineTemplate MultilineTemplate
) )
func modelIsMultiModal(cmd *cobra.Command, name string) bool { func loadModel(cmd *cobra.Command, opts *runOptions) error {
// get model details
client, err := api.ClientFromEnvironment() client, err := api.ClientFromEnvironment()
if err != nil { if err != nil {
fmt.Println("error: couldn't connect to ollama server") return err
return false
} }
req := api.ShowRequest{Name: name} p := progress.NewProgress(os.Stderr)
resp, err := client.Show(cmd.Context(), &req) defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
showReq := api.ShowRequest{Name: opts.Model}
showResp, err := client.Show(cmd.Context(), &showReq)
if err != nil { if err != nil {
return false return err
}
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
} }
return slices.Contains(resp.Details.Families, "clip") chatReq := &api.ChatRequest{
Model: opts.Model,
Messages: []api.Message{},
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
if len(opts.Messages) > 0 {
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
}
return nil
})
if err != nil {
return err
}
return nil
} }
func generateInteractive(cmd *cobra.Command, opts runOptions) error { func generateInteractive(cmd *cobra.Command, opts runOptions) error {
multiModal := modelIsMultiModal(cmd, opts.Model) opts.Messages = make([]api.Message, 0)
// load the model err := loadModel(cmd, &opts)
loadOpts := runOptions{ if err != nil {
Model: opts.Model,
Prompt: "",
Messages: []api.Message{},
}
if _, err := chat(cmd, loadOpts); err != nil {
return err return err
} }
usage := func() { usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:") fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables") fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information") fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /bye Exit") fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command") fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts") fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "") fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.") fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
fmt.Fprintln(os.Stderr, "") fmt.Fprintln(os.Stderr, "")
@@ -140,7 +174,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
var sb strings.Builder var sb strings.Builder
var multiline MultilineState var multiline MultilineState
opts.Messages = make([]api.Message, 0)
for { for {
line, err := scanner.Readline() line, err := scanner.Readline()
@@ -203,6 +236,44 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
if err := ListHandler(cmd, args[1:]); err != nil { if err := ListHandler(cmd, args[1:]); err != nil {
return err return err
} }
case strings.HasPrefix(line, "/load"):
args := strings.Fields(line)
if len(args) != 2 {
fmt.Println("Usage:\n /load <modelname>")
continue
}
opts.Model = args[1]
opts.Messages = []api.Message{}
fmt.Printf("Loading model '%s'\n", opts.Model)
if err := loadModel(cmd, &opts); err != nil {
return err
}
continue
case strings.HasPrefix(line, "/save"):
args := strings.Fields(line)
if len(args) != 2 {
fmt.Println("Usage:\n /save <modelname>")
continue
}
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
req := &api.CreateRequest{
Name: args[1],
Modelfile: buildModelfile(opts),
}
fn := func(resp api.ProgressResponse) error { return nil }
err = client.Create(cmd.Context(), req, fn)
if err != nil {
fmt.Println("error: couldn't save model")
return err
}
fmt.Printf("Created new model '%s'\n", args[1])
continue
case strings.HasPrefix(line, "/set"): case strings.HasPrefix(line, "/set"):
args := strings.Fields(line) args := strings.Fields(line)
if len(args) > 1 { if len(args) > 1 {
@@ -389,7 +460,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
args := strings.Fields(line) args := strings.Fields(line)
isFile := false isFile := false
if multiModal { if opts.MultiModal {
for _, f := range extractFileNames(line) { for _, f := range extractFileNames(line) {
if strings.HasPrefix(f, args[0]) { if strings.HasPrefix(f, args[0]) {
isFile = true isFile = true
@@ -411,7 +482,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
if sb.Len() > 0 && multiline == MultilineNone { if sb.Len() > 0 && multiline == MultilineNone {
newMessage := api.Message{Role: "user", Content: sb.String()} newMessage := api.Message{Role: "user", Content: sb.String()}
if multiModal { if opts.MultiModal {
msg, images, err := extractFileData(sb.String()) msg, images, err := extractFileData(sb.String())
if err != nil { if err != nil {
return err return err
@@ -454,6 +525,38 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
} }
} }
func buildModelfile(opts runOptions) string {
var mf strings.Builder
model := opts.ParentModel
if model == "" {
model = opts.Model
}
fmt.Fprintf(&mf, "FROM %s\n", model)
if opts.System != "" {
fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
}
if opts.Template != "" {
fmt.Fprintf(&mf, "TEMPLATE \"\"\"%s\"\"\"\n", opts.Template)
}
keys := make([]string, 0)
for k := range opts.Options {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k])
}
fmt.Fprintln(&mf)
for _, msg := range opts.Messages {
fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content)
}
return mf.String()
}
func normalizeFilePath(fp string) string { func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements // Define a map of escaped characters and their replacements
replacements := map[string]string{ replacements := map[string]string{

View File

@@ -1,9 +1,13 @@
package cmd package cmd
import ( import (
"bytes"
"testing" "testing"
"text/template"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/jmorganca/ollama/api"
) )
func TestExtractFilenames(t *testing.T) { func TestExtractFilenames(t *testing.T) {
@@ -49,3 +53,64 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
assert.Contains(t, res[9], "ten.svg") assert.Contains(t, res[9], "ten.svg")
assert.Contains(t, res[9], "E:") assert.Contains(t, res[9], "E:")
} }
func TestModelfileBuilder(t *testing.T) {
opts := runOptions{
Model: "hork",
System: "You are part horse and part shark, but all hork. Do horklike things",
Template: "This is a template.",
Messages: []api.Message{
{Role: "user", Content: "Hey there hork!"},
{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
},
Options: map[string]interface{}{},
}
opts.Options["temperature"] = 0.9
opts.Options["seed"] = 42
opts.Options["penalize_newline"] = false
opts.Options["stop"] = []string{"hi", "there"}
mf := buildModelfile(opts)
expectedModelfile := `FROM {{.Model}}
SYSTEM """{{.System}}"""
TEMPLATE """{{.Template}}"""
PARAMETER penalize_newline false
PARAMETER seed 42
PARAMETER stop [hi there]
PARAMETER temperature 0.9
MESSAGE user """Hey there hork!"""
MESSAGE assistant """Yes it is true, I am half horse, half shark."""
`
tmpl, err := template.New("").Parse(expectedModelfile)
assert.Nil(t, err)
var buf bytes.Buffer
err = tmpl.Execute(&buf, opts)
assert.Nil(t, err)
assert.Equal(t, buf.String(), mf)
opts.ParentModel = "horseshark"
mf = buildModelfile(opts)
expectedModelfile = `FROM {{.ParentModel}}
SYSTEM """{{.System}}"""
TEMPLATE """{{.Template}}"""
PARAMETER penalize_newline false
PARAMETER seed 42
PARAMETER stop [hi there]
PARAMETER temperature 0.9
MESSAGE user """Hey there hork!"""
MESSAGE assistant """Yes it is true, I am half horse, half shark."""
`
tmpl, err = template.New("").Parse(expectedModelfile)
assert.Nil(t, err)
var parentBuf bytes.Buffer
err = tmpl.Execute(&parentBuf, opts)
assert.Nil(t, err)
assert.Equal(t, parentBuf.String(), mf)
}

View File

@@ -74,7 +74,8 @@ Typically the build scripts will auto-detect ROCm, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `ROCM_PATH` to the location of the ROCm specifying an environment variable `ROCM_PATH` to the location of the ROCm
install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
CLBlast install (typically `/usr/lib/cmake/CLBlast`). CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
``` ```
go generate ./... go generate ./...

View File

@@ -8,35 +8,38 @@ To upgrade Ollama, run the installation process again. On the Mac, click the Oll
Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs. Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
## How do I use Ollama server environment variables on Mac ## How do I configure Ollama server?
On macOS, Ollama runs in the background and is managed by the menubar app. If adding environment variables, Ollama will need to be run manually. Ollama server can be configured with environment variables.
1. Click the menubar icon for Ollama and choose **Quit Ollama**. ### Setting environment variables on Mac
2. Open a new terminal window and run the following command (this example uses `OLLAMA_HOST` with an IP address of `123.1.1.1`):
```bash If Ollama is run as a macOS application, environment variables should be set using `launchctl`:
OLLAMA_HOST=123.1.1.1 ollama serve
```
## How do I use Ollama server environment variables on Linux? 1. For each environment variable, call `launchctl setenv`.
If Ollama is installed with the install script, a systemd service was created, running as the Ollama user. To add an environment variable, such as OLLAMA_HOST, follow these steps: ```bash
launchctl setenv OLLAMA_HOST "0.0.0.0"
```
1. Create a `systemd` drop-in directory and add a config file. This is only needed once. 2. Restart Ollama application.
```bash ### Setting environment variables on Linux
mkdir -p /etc/systemd/system/ollama.service.d
echo '[Service]' >>/etc/systemd/system/ollama.service.d/environment.conf
```
2. For each environment variable, add it to the config file: If Ollama is run as a systemd service, environment variables should be set using `systemctl`:
```bash 1. Edit the systemd service by calling `systemctl edit ollama.service`. This will open an editor.
echo 'Environment="OLLAMA_HOST=0.0.0.0:11434"' >>/etc/systemd/system/ollama.service.d/environment.conf
```
3. Reload `systemd` and restart Ollama: 2. For each environment variable, add a line `Environment` under section `[Service]`:
```ini
[Service]
Environment="OLLAMA_HOST=0.0.0.0"
```
3. Save and exit.
4. Reload `systemd` and restart Ollama:
```bash ```bash
systemctl daemon-reload systemctl daemon-reload
@@ -45,26 +48,26 @@ If Ollama is installed with the install script, a systemd service was created, r
## How can I expose Ollama on my network? ## How can I expose Ollama on my network?
Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable. Refer to the section above for how to use environment variables on your platform. Ollama binds 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable.
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## How can I allow additional web origins to access Ollama? ## How can I allow additional web origins to access Ollama?
Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Add additional origins with the `OLLAMA_ORIGINS` environment variable. For example, to add all ports on 192.168.1.1 and https://example.com, use: Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.
```shell Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com
```
Refer to the section above for how to use environment variables on your platform.
## Where are models stored? ## Where are models stored?
- macOS: `~/.ollama/models`. - macOS: `~/.ollama/models`.
- Linux: `/usr/share/ollama/.ollama/models` - Linux: `/usr/share/ollama/.ollama/models`
## How do I set them to a different location? ### How do I set them to a different location?
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory. Refer to the section above for how to use environment variables on your platform. If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way? ## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?

View File

@@ -19,6 +19,7 @@ A model file is the blueprint to create and share models with Ollama.
- [SYSTEM](#system) - [SYSTEM](#system)
- [ADAPTER](#adapter) - [ADAPTER](#adapter)
- [LICENSE](#license) - [LICENSE](#license)
- [MESSAGE](#message)
- [Notes](#notes) - [Notes](#notes)
## Format ## Format
@@ -38,6 +39,7 @@ INSTRUCTION arguments
| [`SYSTEM`](#system) | Specifies the system message that will be set in the template. | | [`SYSTEM`](#system) | Specifies the system message that will be set in the template. |
| [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. | | [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. |
| [`LICENSE`](#license) | Specifies the legal license. | | [`LICENSE`](#license) | Specifies the legal license. |
| [`MESSAGE`](#message) | Specify message history. |
## Examples ## Examples
@@ -205,6 +207,19 @@ LICENSE """
""" """
``` ```
### MESSAGE
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
```modelfile
MESSAGE user Is Toronto in Canada?
MESSAGE assistant yes
MESSAGE user Is Sacramento in Canada?
MESSAGE assistant no
MESSAGE user Is Ontario in Canada?
MESSAGE assistant yes
```
## Notes ## Notes
- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments. - the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.

View File

@@ -16,6 +16,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strconv"
"strings" "strings"
"sync" "sync"
"unsafe" "unsafe"
@@ -38,12 +39,15 @@ var CudaLinuxGlobs = []string{
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*", "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*", "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
"/usr/lib/wsl/lib/libnvidia-ml.so*", "/usr/lib/wsl/lib/libnvidia-ml.so*",
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*", "/opt/cuda/lib64/libnvidia-ml.so*",
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*", "/usr/lib*/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*", "/usr/local/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*", "/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*", "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
// TODO: are these stubs ever valid?
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
} }
var CudaWindowsGlobs = []string{ var CudaWindowsGlobs = []string{
@@ -118,9 +122,15 @@ func GetGPUInfo() GpuInfo {
initGPUHandles() initGPUHandles()
} }
// All our GPU builds have AVX enabled, so fallback to CPU if we don't detect at least AVX
cpuVariant := GetCPUVariant()
if cpuVariant == "" {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
var memInfo C.mem_info_t var memInfo C.mem_info_t
resp := GpuInfo{} resp := GpuInfo{}
if gpuHandles.cuda != nil { if gpuHandles.cuda != nil && cpuVariant != "" {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo) C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil { if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))) slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
@@ -139,12 +149,33 @@ func GetGPUInfo() GpuInfo {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)) slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
} }
} }
} else if gpuHandles.rocm != nil { } else if gpuHandles.rocm != nil && cpuVariant != "" {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo) C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil { if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err)) C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else { } else {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" {
devices := []string{}
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) {
continue
}
devices = append(devices, strconv.Itoa(i))
}
val = strings.Join(devices, ",")
os.Setenv("ROCR_VISIBLE_DEVICES", val)
}
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
}
resp.Library = "rocm" resp.Library = "rocm"
var version C.rocm_version_resp_t var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version) C.rocm_get_version(*gpuHandles.rocm, &version)
@@ -160,7 +191,7 @@ func GetGPUInfo() GpuInfo {
if resp.Library == "" { if resp.Library == "" {
C.cpu_check_ram(&memInfo) C.cpu_check_ram(&memInfo)
resp.Library = "cpu" resp.Library = "cpu"
resp.Variant = GetCPUVariant() resp.Variant = cpuVariant
} }
if memInfo.err != nil { if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err))) slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))
@@ -190,13 +221,15 @@ func getCPUMem() (memInfo, error) {
func CheckVRAM() (int64, error) { func CheckVRAM() (int64, error) {
gpuInfo := GetGPUInfo() gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") { if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead // leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
overhead := gpuInfo.FreeMemory / 10 overhead := gpuInfo.FreeMemory / 10
gpus := uint64(gpuInfo.DeviceCount) gpus := uint64(gpuInfo.DeviceCount)
if overhead < gpus*512*1024*1024 { if overhead < gpus*1024*1024*1024 {
overhead = gpus * 512 * 1024 * 1024 overhead = gpus * 1024 * 1024 * 1024
} }
return int64(gpuInfo.FreeMemory - overhead), nil avail := int64(gpuInfo.FreeMemory - overhead)
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
return avail, nil
} }
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
@@ -258,6 +291,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t { func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
var resp C.cuda_init_resp_t var resp C.cuda_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range cudaLibPaths { for _, libPath := range cudaLibPaths {
lib := C.CString(libPath) lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib)) defer C.free(unsafe.Pointer(lib))
@@ -274,6 +308,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t { func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
var resp C.rocm_init_resp_t var resp C.rocm_init_resp_t
resp.rh.verbose = getVerboseState()
for _, libPath := range rocmLibPaths { for _, libPath := range rocmLibPaths {
lib := C.CString(libPath) lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib)) defer C.free(unsafe.Pointer(lib))
@@ -287,3 +322,10 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
} }
return nil return nil
} }
func getVerboseState() C.uint16_t {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
return C.uint16_t(1)
}
return C.uint16_t(0)
}
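For the `CheckVRAM` change above: the reserved overhead is now the larger of 10% of free VRAM or 1 GiB per detected GPU, and the remainder is reported as available. A small illustrative sketch of that arithmetic (the numbers are made up, and it assumes free memory exceeds the overhead):

```go
// Illustrative arithmetic for the VRAM overhead change.
package main

import "fmt"

func availableVRAM(freeBytes uint64, gpuCount uint64) int64 {
	overhead := freeBytes / 10
	minOverhead := gpuCount * 1024 * 1024 * 1024 // 1 GiB per GPU
	if overhead < minOverhead {
		overhead = minOverhead
	}
	return int64(freeBytes - overhead)
}

func main() {
	// Example: 2 GPUs with 20 GiB free in total -> 10% (2 GiB) >= 2 x 1 GiB,
	// so 18 GiB is reported as available.
	free := uint64(20) * 1024 * 1024 * 1024
	fmt.Printf("%d MiB available\n", availableVRAM(free, 2)/1024/1024)
}
```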

View File

@@ -27,6 +27,13 @@
#endif #endif
#define LOG(verbose, ...) \
do { \
if (verbose) { \
fprintf(stderr, __VA_ARGS__); \
} \
} while (0)
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@@ -35,6 +42,7 @@ typedef struct mem_info {
uint64_t total; uint64_t total;
uint64_t free; uint64_t free;
unsigned int count; unsigned int count;
int igpu_index; // If >= 0, we detected an integrated GPU to ignore
char *err; // If non-nill, caller responsible for freeing char *err; // If non-nill, caller responsible for freeing
} mem_info_t; } mem_info_t;

View File

@@ -4,8 +4,6 @@
#include <string.h> #include <string.h>
#define CUDA_LOOKUP_SIZE 6
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
nvmlReturn_t ret; nvmlReturn_t ret;
resp->err = NULL; resp->err = NULL;
@@ -16,18 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
} l[CUDA_LOOKUP_SIZE] = { } l[] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn}, {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn}, {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle}, {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo}, {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount}, {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability}, {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
{NULL, NULL},
}; };
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY); resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) { if (!resp->ch.handle) {
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
snprintf(buf, buflen, snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s", "Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_path, msg); cuda_lib_path, msg);
@@ -36,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
return; return;
} }
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list // TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL; resp->ch.handle = NULL;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg); msg);
free(msg); free(msg);
@@ -50,15 +63,23 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
} }
} }
ret = (*resp->ch.initFn)(); ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle); UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL; resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret); snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return;
} }
return; // Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
} else {
LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
}
} }
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
@@ -75,7 +96,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return; return;
} }
ret = (*h.getCount)(&resp->count); ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@@ -85,19 +106,57 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->total = 0; resp->total = 0;
resp->free = 0; resp->free = 0;
for (i = 0; i < resp->count; i++) { for (i = 0; i < resp->count; i++) {
ret = (*h.getHandle)(i, &device); ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.getMemInfo)(device, &memInfo); ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret); snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
if (h.verbose) {
nvmlBrandType_t brand = 0;
// When in verbose mode, report more information about
// the card we discover, but don't fail on error
ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetBrand)(device, &brand);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
}
}
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
resp->total += memInfo.total; resp->total += memInfo.total;
resp->free += memInfo.free; resp->free += memInfo.free;
@@ -122,7 +181,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
} }
unsigned int devices; unsigned int devices;
ret = (*h.getCount)(&devices); ret = (*h.nvmlDeviceGetCount_v2)(&devices);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@@ -130,14 +189,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
} }
for (i = 0; i < devices; i++) { for (i = 0; i < devices; i++) {
ret = (*h.getHandle)(i, &device); ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.getComputeCapability)(device, &major, &minor); ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret); snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);

View File

@@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
unsigned long long used; unsigned long long used;
} nvmlMemory_t; } nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct cuda_handle { typedef struct cuda_handle {
void *handle; void *handle;
nvmlReturn_t (*initFn)(void); uint16_t verbose;
nvmlReturn_t (*shutdownFn)(void); nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *); nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *); nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getCount)(unsigned int *); nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor); nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
} cuda_handle_t; } cuda_handle_t;
typedef struct cuda_init_resp { typedef struct cuda_init_resp {

View File

@@ -4,8 +4,6 @@
#include <string.h> #include <string.h>
#define ROCM_LOOKUP_SIZE 5
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
rsmi_status_t ret; rsmi_status_t ret;
resp->err = NULL; resp->err = NULL;
@@ -15,13 +13,22 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
} l[ROCM_LOOKUP_SIZE] = { } l[] = {
{"rsmi_init", (void *)&resp->rh.initFn}, {"rsmi_init", (void *)&resp->rh.rsmi_init},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
{"rsmi_version_get", (void *)&resp->rh.versionGetFn}, {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle }, {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
{"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
{"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
{"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
{"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
{"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
{"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
{"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
{"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
{NULL, NULL},
}; };
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY); resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@@ -35,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
return; return;
} }
for (i = 0; i < ROCM_LOOKUP_SIZE; i++) { // TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL; resp->rh.handle = NULL;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->rh.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg); msg);
free(msg); free(msg);
@@ -49,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
} }
} }
ret = (*resp->rh.initFn)(0); ret = (*resp->rh.rsmi_init)(0);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
UNLOAD_LIBRARY(resp->rh.handle); UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL; resp->rh.handle = NULL;
snprintf(buf, buflen, "rocm vram init failure: %d", ret); snprintf(buf, buflen, "rocm vram init failure: %d", ret);
@@ -62,8 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL; resp->err = NULL;
// uint32_t num_devices; resp->igpu_index = -1;
// uint16_t device;
uint64_t totalMem = 0; uint64_t totalMem = 0;
uint64_t usedMem = 0; uint64_t usedMem = 0;
rsmi_status_t ret; rsmi_status_t ret;
@@ -76,34 +90,88 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
return; return;
} }
// TODO - iterate through devices... ret = (*h.rsmi_num_monitor_devices)(&resp->count);
// rsmi_num_monitor_devices(&num_devices);
// ret = (*h.getHandle)(0, &device);
// if (ret != RSMI_STATUS_SUCCESS) {
// printf("rocm vram device lookup failure: %d\n", ret);
// return -1;
// }
// Get total memory - used memory for available memory
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
// TODO: set this to the actual number of devices resp->total = 0;
resp->count = 1; resp->free = 0;
resp->total = totalMem; for (i = 0; i < resp->count; i++) {
resp->free = totalMem - usedMem; if (h.verbose) {
return; // When in verbose mode, report more information about
// the card we discover, but don't fail on error
ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
}
ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
}
ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
}
ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
}
}
// Get total memory - used memory for available memory
ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
if (totalMem < 1024 * 1024 * 1024) {
// Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
resp->igpu_index = i;
} else {
resp->total += totalMem;
resp->free += totalMem - usedMem;
}
}
} }
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) { void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
@@ -116,7 +184,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
} }
rsmi_version_t ver; rsmi_version_t ver;
rsmi_status_t ret; rsmi_status_t ret;
ret = h.versionGetFn(&ver); ret = h.rsmi_version_get(&ver);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret); snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1; resp->status = 1;
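The rocm_check_vram change above skips any device reporting under 1 GiB of VRAM, treating it as an integrated GPU whose "VRAM" is really carved out of system RAM, and records its index in igpu_index instead of adding it to the totals. A hedged Go sketch of that heuristic, with illustrative type and field names only:

```go
// Illustrative rendering of the integrated-GPU filter; not the actual gpu package types.
package main

import "fmt"

type device struct {
	TotalMem uint64 // bytes
	UsedMem  uint64 // bytes
}

func sumDiscreteVRAM(devices []device) (total, free uint64, igpuIndex int) {
	igpuIndex = -1
	const oneGiB = 1 << 30
	for i, d := range devices {
		if d.TotalMem < oneGiB {
			// Integrated GPU: remember it, but don't count its memory.
			igpuIndex = i
			continue
		}
		total += d.TotalMem
		free += d.TotalMem - d.UsedMem
	}
	return total, free, igpuIndex
}

func main() {
	devs := []device{
		{TotalMem: 512 << 20, UsedMem: 100 << 20}, // iGPU, ignored
		{TotalMem: 16 << 30, UsedMem: 2 << 30},    // discrete card
	}
	fmt.Println(sumDiscreteVRAM(devs))
}
```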

View File

@@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
typedef struct rocm_handle { typedef struct rocm_handle {
void *handle; void *handle;
rsmi_status_t (*initFn)(uint64_t); uint16_t verbose;
rsmi_status_t (*shutdownFn)(void); rsmi_status_t (*rsmi_init)(uint64_t);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*rsmi_shut_down)(void);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*versionGetFn) (rsmi_version_t *version); rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *); rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
} rocm_handle_t; } rocm_handle_t;
typedef struct rocm_init_resp { typedef struct rocm_init_resp {

View File

@@ -59,7 +59,7 @@ void dyn_init(const char *libPath, struct dynamic_llama_server *s,
}; };
printf("loading library %s\n", libPath); printf("loading library %s\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_GLOBAL|RTLD_NOW); s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
if (!s->handle) { if (!s->handle) {
err->id = -1; err->id = -1;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();

View File

@@ -136,12 +136,21 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
sparams.n_threads = C.uint(opts.NumThread) sparams.n_threads = C.uint(opts.NumThread)
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
sparams.verbose_logging = C.bool(true)
} else {
sparams.verbose_logging = C.bool(false)
}
slog.Info("Initializing llama server") slog.Info("Initializing llama server")
initResp := newExtServerResp(128) initResp := newExtServerResp(128)
defer freeExtServerResp(initResp) defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp) C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 { if initResp.id < 0 {
return nil, extServerResponseToErr(initResp) mutex.Unlock()
err := extServerResponseToErr(initResp)
slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
return nil, err
} }
slog.Info("Starting llama main loop") slog.Info("Starting llama main loop")
@@ -181,6 +190,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
"seed": predict.Options.Seed, "seed": predict.Options.Seed,
"stop": predict.Options.Stop, "stop": predict.Options.Stop,
"image_data": imageData, "image_data": imageData,
"cache_prompt": true,
} }
if predict.Format == "json" { if predict.Format == "json" {

View File

@@ -3,22 +3,44 @@
// Necessary evil since the server types are not defined in a header // Necessary evil since the server types are not defined in a header
#include "server.cpp" #include "server.cpp"
// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API // Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL; llama_server_context *llama = NULL;
std::atomic<bool> ext_server_running(false); std::atomic<bool> ext_server_running(false);
std::thread ext_server_thread; std::thread ext_server_thread;
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) { void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
#if SERVER_VERBOSE != 1
log_disable();
#endif
LOG_TEE("system info: %s", llama_print_system_info());
assert(err != NULL && sparams != NULL); assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
log_disable();
}
LOG_TEE("system info: %s\n", llama_print_system_info());
err->id = 0; err->id = 0;
err->msg[0] = '\0'; err->msg[0] = '\0';
try { try {
llama = new llama_server_context; llama = new llama_server_context;
log_set_target(stdout);
gpt_params params; gpt_params params;
params.n_ctx = sparams->n_ctx; params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch; params.n_batch = sparams->n_batch;
@@ -60,6 +82,18 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
params.mmproj = std::string(sparams->mmproj); params.mmproj = std::string(sparams->mmproj);
} }
#if defined(GGML_USE_CUBLAS)
// Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
LOG_TEE("Performing pre-initialization of GPU\n");
int id;
cudaError_t cudaErr = cudaGetDevice(&id);
if (cudaErr != cudaSuccess) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
return;
}
#endif
llama_backend_init(params.numa); llama_backend_init(params.numa);
// load the model // load the model

View File

@@ -45,6 +45,7 @@ typedef struct ext_server_params {
bool embedding; // get only sentence embedding bool embedding; // get only sentence embedding
ext_server_lora_adapter_t *lora_adapters; ext_server_lora_adapter_t *lora_adapters;
char *mmproj; char *mmproj;
bool verbose_logging; // Enable verbose logging of the server
} ext_server_params_t; } ext_server_params_t;
typedef struct ext_server_task_result { typedef struct ext_server_task_result {

View File

@@ -61,6 +61,17 @@ apply_patches() {
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
fi fi
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
# Avoid duplicate main symbols when we link into the cgo binary # Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp && sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
@@ -83,8 +94,9 @@ build() {
compress_libs() { compress_libs() {
echo "Compressing payloads to reduce overall binary size..." echo "Compressing payloads to reduce overall binary size..."
pids="" pids=""
rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
gzip --best ${lib} & gzip --best -f ${lib} &
pids+=" $!" pids+=" $!"
done done
echo echo

View File

@@ -12,7 +12,13 @@ init_vars
git_module_setup git_module_setup
apply_patches apply_patches
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=off" sign() {
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
case "${GOARCH}" in case "${GOARCH}" in
"amd64") "amd64")
@@ -21,10 +27,11 @@ case "${GOARCH}" in
# #
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
# #
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu" BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
compress_libs compress_libs
# #
@@ -32,10 +39,11 @@ case "${GOARCH}" in
# Approximately 400% faster than LCD on same CPU # Approximately 400% faster than LCD on same CPU
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx" BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
compress_libs compress_libs
# #
@@ -43,17 +51,20 @@ case "${GOARCH}" in
# Approximately 10% faster than AVX on same CPU # Approximately 10% faster than AVX on same CPU
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2" BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
compress_libs compress_libs
;; ;;
"arm64") "arm64")
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal" BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
compress_libs compress_libs
;; ;;
*) *)

View File

@@ -16,6 +16,10 @@ set -o pipefail
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() { amdGPUs() {
if [ -n "${AMDGPU_TARGETS}" ]; then
echo "${AMDGPU_TARGETS}"
return
fi
GPU_LIST=( GPU_LIST=(
"gfx803" "gfx803"
"gfx900" "gfx900"
@@ -73,36 +77,42 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off" COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
# if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) #
# # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" #
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
echo "Building LCD CPU" BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
build echo "Building LCD CPU"
compress_libs build
compress_libs
fi
# if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance #
# Approximately 400% faster than LCD on same CPU # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# # Approximately 400% faster than LCD on same CPU
init_vars #
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" init_vars
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
echo "Building AVX CPU" BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
build echo "Building AVX CPU"
compress_libs build
compress_libs
fi
# if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
# ~2013 CPU Dynamic library #
# Approximately 10% faster than AVX on same CPU # ~2013 CPU Dynamic library
# # Approximately 10% faster than AVX on same CPU
init_vars #
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" init_vars
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
echo "Building AVX2 CPU" BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
build echo "Building AVX2 CPU"
compress_libs build
compress_libs
fi
fi fi
else else
echo "Skipping CPU generation step as requested" echo "Skipping CPU generation step as requested"

View File

@@ -40,6 +40,29 @@ function apply_patches {
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) { if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama' Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
} }
# Apply temporary patches until fix is upstream
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
foreach ($file in $filePaths) {
Set-Location -Path ${script:llamacppDir}
git checkout $file
}
}
# Apply each patch
foreach ($patch in $patches) {
Set-Location -Path ${script:llamacppDir}
git apply $patch.FullName
}
# Avoid duplicate main symbols when we link into the cgo binary # Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp" $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main(' $content = $content -replace 'int main\(', 'int __main('
@@ -76,7 +99,7 @@ function compress_libs {
write-host "Compressing dlls..." write-host "Compressing dlls..."
$libs = dir "${script:buildDir}/lib/*.dll" $libs = dir "${script:buildDir}/lib/*.dll"
foreach ($file in $libs) { foreach ($file in $libs) {
& "$script:GZIP" --best $file & "$script:GZIP" --best -f $file
} }
} }

View File

@@ -69,12 +69,65 @@ type tensor struct {
name string name string
kind uint32 kind uint32
offset uint64 offset uint64
size uint64
// shape is the number of elements in each dimension // shape is the number of elements in each dimension
shape [4]uint64 shape [4]uint64
} }
func (t tensor) blockSize() uint64 {
switch {
case t.kind < 2:
return 1
case t.kind < 10:
return 32
default:
return 256
}
}
func (t tensor) typeSize() uint64 {
blockSize := t.blockSize()
switch t.kind {
case 0: // FP32
return 4
case 1: // FP16
return 2
case 2: // Q4_0
return 2 + blockSize/2
case 3: // Q4_1
return 2 + 2 + blockSize/2
case 6: // Q5_0
return 2 + 4 + blockSize/2
case 7: // Q5_1
return 2 + 2 + 4 + blockSize/2
case 8: // Q8_0
return 2 + blockSize
case 9: // Q8_1
return 4 + 4 + blockSize
case 10: // Q2_K
return blockSize/16 + blockSize/4 + 2 + 2
case 11: // Q3_K
return blockSize/8 + blockSize/4 + 12 + 2
case 12: // Q4_K
return 2 + 2 + 12 + blockSize/2
case 13: // Q5_K
return 2 + 2 + 12 + blockSize/8 + blockSize/2
case 14: // Q6_K
return blockSize/2 + blockSize/4 + blockSize/16 + 2
default:
return 0
}
}
func (t tensor) parameters() uint64 {
return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3]
}
func (t tensor) size() uint64 {
return t.parameters() * t.typeSize() / t.blockSize()
}
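As a quick sanity check of the size formula above (size = parameters × typeSize / blockSize): a Q4_0 tensor (kind 2) of shape 4096×4096 has 16,777,216 parameters, a block size of 32, and a type size of 2 + 32/2 = 18 bytes, so it occupies 16,777,216 × 18 / 32 = 9,437,184 bytes. A standalone sketch mirroring the same arithmetic for a few of the kinds:

```go
// Sketch of the tensor-size arithmetic above for a subset of GGUF kinds.
package main

import "fmt"

func blockSize(kind uint32) uint64 {
	switch {
	case kind < 2:
		return 1
	case kind < 10:
		return 32
	default:
		return 256
	}
}

func typeSize(kind uint32) uint64 {
	bs := blockSize(kind)
	switch kind {
	case 0: // FP32
		return 4
	case 1: // FP16
		return 2
	case 2: // Q4_0
		return 2 + bs/2
	case 8: // Q8_0
		return 2 + bs
	default:
		return 0
	}
}

func main() {
	var kind uint32 = 2 // Q4_0
	params := uint64(4096 * 4096)
	size := params * typeSize(kind) / blockSize(kind)
	fmt.Println(size) // 9437184 bytes
}
```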
type ggufModel struct { type ggufModel struct {
*containerGGUF *containerGGUF
@@ -201,61 +254,15 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
shape[i] = llm.readU64(rso) shape[i] = llm.readU64(rso)
} }
kind := llm.readU32(rso) tensor := tensor{
offset := llm.readU64(rso)
var blockSize uint64
switch {
case kind < 2:
blockSize = 1
case kind < 10:
blockSize = 32
default:
blockSize = 256
}
var typeSize uint64
switch kind {
case 0: // FP32
typeSize = 4
case 1: // FP16
typeSize = 2
case 2: // Q4_0
typeSize = 2 + blockSize/2
case 3: // Q4_1
typeSize = 2 + 2 + blockSize/2
case 6: // Q5_0
typeSize = 2 + 4 + blockSize/2
case 7: // Q5_1
typeSize = 2 + 2 + 4 + blockSize/2
case 8: // Q8_0
typeSize = 2 + blockSize
case 9: // Q8_1
typeSize = 4 + 4 + blockSize
case 10: // Q2_K
typeSize = blockSize/16 + blockSize/4 + 2 + 2
case 11: // Q3_K
typeSize = blockSize/8 + blockSize/4 + 12 + 2
case 12: // Q4_K
typeSize = 2 + 2 + 12 + blockSize/2
case 13: // Q5_K
typeSize = 2 + 2 + 12 + blockSize/8 + blockSize/2
case 14: // Q6_K
typeSize = blockSize/2 + blockSize/4 + blockSize/16 + 2
}
parameters := shape[0] * shape[1] * shape[2] * shape[3]
size := parameters * typeSize / blockSize
llm.tensors = append(llm.tensors, tensor{
name: name, name: name,
kind: kind, kind: llm.readU32(rso),
offset: offset, offset: llm.readU64(rso),
size: size,
shape: shape, shape: shape,
}) }
llm.parameters += parameters llm.tensors = append(llm.tensors, tensor)
llm.parameters += tensor.parameters()
} }
alignment, ok := llm.kv["general.alignment"].(uint32) alignment, ok := llm.kv["general.alignment"].(uint32)
@@ -265,7 +272,7 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent) rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent)
for _, tensor := range llm.tensors { for _, tensor := range llm.tensors {
padded := (int64(tensor.size) + int64(alignment) - 1) & ^(int64(alignment) - 1) padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
rso.Seek(padded, io.SeekCurrent) rso.Seek(padded, io.SeekCurrent)
} }

View File

@@ -70,7 +70,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
break break
} }
opts.NumGPU = 1 // TODO: implement layer splitting on macOS
opts.NumGPU = 999
default: default:
if info.Library == "cpu" { if info.Library == "cpu" {
slog.Info("GPU not available, falling back to CPU") slog.Info("GPU not available, falling back to CPU")

llm/patches/01-cache.diff Normal file
View File

@@ -0,0 +1,30 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0462fbd2..4fa7b57f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1857,12 +1857,6 @@ struct llama_server_context
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
}
- LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
- llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
- slot.cache_tokens = prompt_tokens;
-
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
{
// we have to evaluate at least 1 token to generate logits.
@@ -1870,6 +1864,12 @@ struct llama_server_context
slot.n_past--;
}
+ LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+ llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
+ slot.cache_tokens = prompt_tokens;
+
LOG_VERBOSE("prompt ingested", {
{"n_past", slot.n_past},
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},

View File

@@ -7,6 +7,7 @@ import (
"fmt" "fmt"
"io" "io"
"log/slog" "log/slog"
"slices"
) )
type Command struct { type Command struct {
@@ -56,6 +57,16 @@ func Parse(reader io.Reader) ([]Command, error) {
command.Args = string(bytes.TrimSpace(fields[1])) command.Args = string(bytes.TrimSpace(fields[1]))
case "EMBED": case "EMBED":
return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead") return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
case "MESSAGE":
command.Name = string(bytes.ToLower(fields[0]))
fields = bytes.SplitN(fields[1], []byte(" "), 2)
if len(fields) < 2 {
return nil, fmt.Errorf("should be in the format <role> <message>")
}
if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) {
return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
}
command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1]))
default: default:
if !bytes.HasPrefix(fields[0], []byte("#")) { if !bytes.HasPrefix(fields[0], []byte("#")) {
// log a warning for unknown commands // log a warning for unknown commands

View File

@@ -61,3 +61,38 @@ PARAMETER param1
assert.ErrorContains(t, err, "missing value for [param1]") assert.ErrorContains(t, err, "missing value for [param1]")
} }
func Test_Parser_Messages(t *testing.T) {
input := `
FROM foo
MESSAGE system You are a Parser. Always Parse things.
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`
reader := strings.NewReader(input)
commands, err := Parse(reader)
assert.Nil(t, err)
expectedCommands := []Command{
{Name: "model", Args: "foo"},
{Name: "message", Args: "system: You are a Parser. Always Parse things."},
{Name: "message", Args: "user: Hey there!"},
{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
}
assert.Equal(t, expectedCommands, commands)
}
func Test_Parser_Messages_BadRole(t *testing.T) {
input := `
FROM foo
MESSAGE badguy I'm a bad guy!
`
reader := strings.NewReader(input)
_, err := Parse(reader)
assert.ErrorContains(t, err, "role must be one of \"system\", \"user\", or \"assistant\"")
}

View File

@@ -52,6 +52,10 @@ func (p *Progress) Stop() bool {
return stopped return stopped
} }
func (p *Progress) StopWithoutClear() bool {
return p.stop()
}
func (p *Progress) StopAndClear() bool { func (p *Progress) StopAndClear() bool {
fmt.Fprint(p.w, "\033[?25l") fmt.Fprint(p.w, "\033[?25l")
defer fmt.Fprint(p.w, "\033[?25h") defer fmt.Fprint(p.w, "\033[?25h")

View File

@@ -133,13 +133,6 @@ func (b *Buffer) Size() int {
return b.Buf.Size() return b.Buf.Size()
} }
func min(n, m int) int {
if n > m {
return m
}
return n
}
func (b *Buffer) Add(r rune) { func (b *Buffer) Add(r rune) {
if b.Pos == b.Buf.Size() { if b.Pos == b.Buf.Size() {
fmt.Printf("%c", r) fmt.Printf("%c", r)

View File

@@ -2,7 +2,7 @@
set -e set -e
export VERSION=${VERSION:-0.0.0} export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'" export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
mkdir -p dist mkdir -p dist

View File

@@ -2,7 +2,7 @@
set -eu set -eu
export VERSION=${VERSION:-0.0.0} export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'" export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
docker build \ docker build \
@@ -13,3 +13,13 @@ docker build \
-f Dockerfile \ -f Dockerfile \
-t ollama/ollama:$VERSION \ -t ollama/ollama:$VERSION \
. .
docker build \
--load \
--platform=linux/amd64 \
--build-arg=VERSION \
--build-arg=GOFLAGS \
--target runtime-rocm \
-f Dockerfile \
-t ollama/ollama:$VERSION-rocm \
.

View File

@@ -2,14 +2,24 @@
set -eu set -eu
export VERSION=${VERSION:-0.0.0} export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'" export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
mkdir -p dist mkdir -p dist
for TARGETARCH in ${BUILD_ARCH}; do for TARGETARCH in ${BUILD_ARCH}; do
docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS -f Dockerfile.build -t builder:$TARGETARCH . docker build \
--platform=linux/$TARGETARCH \
--build-arg=GOFLAGS \
--build-arg=CGO_CFLAGS \
--build-arg=OLLAMA_CUSTOM_CPU_DEFS \
--build-arg=AMDGPU_TARGETS \
--target build-$TARGETARCH \
-f Dockerfile \
-t builder:$TARGETARCH \
.
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
docker rm builder-$TARGETARCH docker rm builder-$TARGETARCH

View File

@@ -25,6 +25,11 @@ import (
"github.com/jmorganca/ollama/format" "github.com/jmorganca/ollama/format"
) )
const maxRetries = 6
var errMaxRetriesExceeded = errors.New("max retries exceeded")
var errPartStalled = errors.New("part stalled")
var blobDownloadManager sync.Map var blobDownloadManager sync.Map
type blobDownload struct { type blobDownload struct {
@@ -44,10 +49,11 @@ type blobDownload struct {
} }
type blobDownloadPart struct { type blobDownloadPart struct {
N int N int
Offset int64 Offset int64
Size int64 Size int64
Completed int64 Completed int64
lastUpdated time.Time
*blobDownload `json:"-"` *blobDownload `json:"-"`
} }
@@ -72,6 +78,13 @@ func (p *blobDownloadPart) StopsAt() int64 {
return p.Offset + p.Size return p.Offset + p.Size
} }
func (p *blobDownloadPart) Write(b []byte) (n int, err error) {
n = len(b)
p.blobDownload.Completed.Add(int64(n))
p.lastUpdated = time.Now()
return n, nil
}
func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *RegistryOptions) error { func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *RegistryOptions) error {
partFilePaths, err := filepath.Glob(b.Name + "-partial-*") partFilePaths, err := filepath.Glob(b.Name + "-partial-*")
if err != nil { if err != nil {
@@ -157,6 +170,9 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC): case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
// return immediately if the context is canceled or the device is out of space // return immediately if the context is canceled or the device is out of space
return err return err
case errors.Is(err, errPartStalled):
try--
continue
case err != nil: case err != nil:
sleep := time.Second * time.Duration(math.Pow(2, float64(try))) sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
slog.Info(fmt.Sprintf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)) slog.Info(fmt.Sprintf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep))
@@ -195,28 +211,54 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
} }
func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *RegistryOptions) error { func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *RegistryOptions) error {
headers := make(http.Header) g, ctx := errgroup.WithContext(ctx)
headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1)) g.Go(func() error {
resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts) headers := make(http.Header)
if err != nil { headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
return err resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts)
} if err != nil {
defer resp.Body.Close() return err
}
defer resp.Body.Close()
n, err := io.Copy(w, io.TeeReader(resp.Body, b)) n, err := io.Copy(w, io.TeeReader(resp.Body, part))
if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) { if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
// rollback progress // rollback progress
b.Completed.Add(-n) b.Completed.Add(-n)
return err return err
} }
part.Completed += n part.Completed += n
if err := b.writePart(part.Name(), part); err != nil { if err := b.writePart(part.Name(), part); err != nil {
return err return err
} }
// return nil or context.Canceled or UnexpectedEOF (resumable) // return nil or context.Canceled or UnexpectedEOF (resumable)
return err return err
})
g.Go(func() error {
ticker := time.NewTicker(time.Second)
for {
select {
case <-ticker.C:
if part.Completed >= part.Size {
return nil
}
if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second {
slog.Info(fmt.Sprintf("%s part %d stalled; retrying", b.Digest[7:19], part.N))
// reset last updated
part.lastUpdated = time.Time{}
return errPartStalled
}
case <-ctx.Done():
return ctx.Err()
}
}
})
return g.Wait()
} }
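The downloadChunk rewrite above pairs the copy loop with a watchdog goroutine in an errgroup: progress is recorded on every write, and if no bytes arrive for five seconds the watchdog returns errPartStalled so the outer loop can retry the part without counting it as a failed attempt. A simplified, self-contained sketch of that pattern follows; the names are illustrative, not the actual download types, and in practice the source reader should be tied to ctx (for example an HTTP response body from a request created with that context) so cancellation unblocks the copy.

```go
// Sketch of the copy-plus-watchdog stall detection pattern.
package main

import (
	"context"
	"errors"
	"io"
	"sync/atomic"
	"time"

	"golang.org/x/sync/errgroup"
)

var errStalled = errors.New("transfer stalled")

// writerFunc adapts a function to io.Writer so TeeReader can report progress.
type writerFunc func([]byte) (int, error)

func (f writerFunc) Write(p []byte) (int, error) { return f(p) }

func copyWithWatchdog(ctx context.Context, dst io.Writer, src io.Reader) error {
	var lastProgress atomic.Int64
	lastProgress.Store(time.Now().UnixNano())

	g, ctx := errgroup.WithContext(ctx)
	done := make(chan struct{})

	g.Go(func() error {
		defer close(done)
		// Record the time of the most recent write as bytes flow through.
		progress := writerFunc(func(p []byte) (int, error) {
			lastProgress.Store(time.Now().UnixNano())
			return len(p), nil
		})
		_, err := io.Copy(dst, io.TeeReader(src, progress))
		return err
	})

	g.Go(func() error {
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-done:
				return nil
			case <-ticker.C:
				last := time.Unix(0, lastProgress.Load())
				if time.Since(last) > 5*time.Second {
					return errStalled // caller retries this part
				}
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	})

	return g.Wait()
}
```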
func (b *blobDownload) newPart(offset, size int64) error { func (b *blobDownload) newPart(offset, size int64) error {
@@ -255,12 +297,6 @@ func (b *blobDownload) writePart(partName string, part *blobDownloadPart) error
return json.NewEncoder(partFile).Encode(part) return json.NewEncoder(partFile).Encode(part)
} }
func (b *blobDownload) Write(p []byte) (n int, err error) {
n = len(p)
b.Completed.Add(int64(n))
return n, nil
}
func (b *blobDownload) acquire() { func (b *blobDownload) acquire() {
b.references.Add(1) b.references.Add(1)
} }
@@ -279,20 +315,19 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
for { for {
select { select {
case <-ticker.C: case <-ticker.C:
fn(api.ProgressResponse{
Status: fmt.Sprintf("pulling %s", b.Digest[7:19]),
Digest: b.Digest,
Total: b.Total,
Completed: b.Completed.Load(),
})
if b.done || b.err != nil {
return b.err
}
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()
} }
fn(api.ProgressResponse{
Status: fmt.Sprintf("pulling %s", b.Digest[7:19]),
Digest: b.Digest,
Total: b.Total,
Completed: b.Completed.Load(),
})
if b.done || b.err != nil {
return b.err
}
} }
} }
@@ -303,10 +338,6 @@ type downloadOpts struct {
fn func(api.ProgressResponse) fn func(api.ProgressResponse)
} }
const maxRetries = 6
var errMaxRetriesExceeded = errors.New("max retries exceeded")
// downloadBlob downloads a blob from the registry and stores it in the blobs directory // downloadBlob downloads a blob from the registry and stores it in the blobs directory
func downloadBlob(ctx context.Context, opts downloadOpts) error { func downloadBlob(ctx context.Context, opts downloadOpts) error {
fp, err := GetBlobsPath(opts.digest) fp, err := GetBlobsPath(opts.digest)

View File

@@ -41,7 +41,7 @@ type Model struct {
Config ConfigV2 Config ConfigV2
ShortName string ShortName string
ModelPath string ModelPath string
OriginalModel string ParentModel string
AdapterPaths []string AdapterPaths []string
ProjectorPaths []string ProjectorPaths []string
Template string Template string
@@ -50,6 +50,12 @@ type Model struct {
Digest string Digest string
Size int64 Size int64
Options map[string]interface{} Options map[string]interface{}
Messages []Message
}
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
} }
type PromptVars struct { type PromptVars struct {
@@ -333,7 +339,7 @@ func GetModel(name string) (*Model, error) {
switch layer.MediaType { switch layer.MediaType {
case "application/vnd.ollama.image.model": case "application/vnd.ollama.image.model":
model.ModelPath = filename model.ModelPath = filename
model.OriginalModel = layer.From model.ParentModel = layer.From
case "application/vnd.ollama.image.embed": case "application/vnd.ollama.image.embed":
// Deprecated in versions > 0.1.2 // Deprecated in versions > 0.1.2
// TODO: remove this warning in a future version // TODO: remove this warning in a future version
@@ -374,6 +380,16 @@ func GetModel(name string) (*Model, error) {
if err = json.NewDecoder(params).Decode(&model.Options); err != nil { if err = json.NewDecoder(params).Decode(&model.Options); err != nil {
return nil, err return nil, err
} }
case "application/vnd.ollama.image.messages":
msgs, err := os.Open(filename)
if err != nil {
return nil, err
}
defer msgs.Close()
if err = json.NewDecoder(msgs).Decode(&model.Messages); err != nil {
return nil, err
}
case "application/vnd.ollama.image.license": case "application/vnd.ollama.image.license":
bts, err := os.ReadFile(filename) bts, err := os.ReadFile(filename)
if err != nil { if err != nil {
@@ -412,6 +428,13 @@ func realpath(mfDir, from string) string {
} }
func CreateModel(ctx context.Context, name, modelFileDir string, commands []parser.Command, fn func(resp api.ProgressResponse)) error { func CreateModel(ctx context.Context, name, modelFileDir string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
deleteMap := make(map[string]struct{})
if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
for _, layer := range append(manifest.Layers, manifest.Config) {
deleteMap[layer.Digest] = struct{}{}
}
}
config := ConfigV2{ config := ConfigV2{
OS: "linux", OS: "linux",
Architecture: "amd64", Architecture: "amd64",
@@ -420,15 +443,13 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
}, },
} }
deleteMap := make(map[string]struct{})
var layers Layers var layers Layers
messages := []string{}
params := make(map[string][]string) params := make(map[string][]string)
fromParams := make(map[string]any) fromParams := make(map[string]any)
for _, c := range commands { for _, c := range commands {
slog.Info(fmt.Sprintf("[%s] - %s", c.Name, c.Args))
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
switch c.Name { switch c.Name {
@@ -450,7 +471,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
switch { switch {
case errors.Is(err, os.ErrNotExist): case errors.Is(err, os.ErrNotExist):
fn(api.ProgressResponse{Status: "pulling model"}) fn(api.ProgressResponse{Status: "pulling model"})
if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil { if err := PullModel(ctx, c.Args, "", &RegistryOptions{}, fn); err != nil {
return err return err
} }
@@ -602,11 +623,37 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
} }
layers.Replace(layer) layers.Replace(layer)
case "message":
messages = append(messages, c.Args)
default: default:
params[c.Name] = append(params[c.Name], c.Args) params[c.Name] = append(params[c.Name], c.Args)
} }
} }
if len(messages) > 0 {
fn(api.ProgressResponse{Status: "creating parameters layer"})
msgs := make([]api.Message, 0)
for _, m := range messages {
// todo: handle images
msg := strings.SplitN(m, ": ", 2)
msgs = append(msgs, api.Message{Role: msg[0], Content: msg[1]})
}
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(msgs); err != nil {
return err
}
layer, err := NewLayer(&b, "application/vnd.ollama.image.messages")
if err != nil {
return err
}
layers.Replace(layer)
}
if len(params) > 0 { if len(params) > 0 {
fn(api.ProgressResponse{Status: "creating parameters layer"}) fn(api.ProgressResponse{Status: "creating parameters layer"})
@@ -903,8 +950,8 @@ func ShowModelfile(model *Model) (string, error) {
mt.Model = model mt.Model = model
mt.From = model.ModelPath mt.From = model.ModelPath
if model.OriginalModel != "" { if model.ParentModel != "" {
mt.From = model.OriginalModel mt.From = model.ParentModel
} }
modelFile := `# Modelfile generated by "ollama show" modelFile := `# Modelfile generated by "ollama show"
@@ -994,7 +1041,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return nil return nil
} }
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error { func PullModel(ctx context.Context, name, currentDigest string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name) mp := ParseModelPath(name)
var manifest *ManifestV2 var manifest *ManifestV2
@@ -1022,13 +1069,23 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return fmt.Errorf("insecure protocol http") return fmt.Errorf("insecure protocol http")
} }
fn(api.ProgressResponse{Status: "pulling manifest"}) if currentDigest == "" {
fn(api.ProgressResponse{Status: "pulling manifest"})
}
manifest, err = pullModelManifest(ctx, mp, regOpts) manifest, err = pullModelManifest(ctx, mp, currentDigest, regOpts)
if err != nil { if err != nil {
return fmt.Errorf("pull model manifest: %s", err) return fmt.Errorf("pull model manifest: %s", err)
} }
if currentDigest != "" {
if manifest == nil {
// we already have the model
return nil
}
fn(api.ProgressResponse{Status: "upgrading " + mp.GetShortTagname()})
}
var layers []*Layer var layers []*Layer
layers = append(layers, manifest.Layers...) layers = append(layers, manifest.Layers...)
layers = append(layers, manifest.Config) layers = append(layers, manifest.Config)
@@ -1100,17 +1157,27 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
 	return nil
 }
-func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *RegistryOptions) (*ManifestV2, error) {
+func pullModelManifest(ctx context.Context, mp ModelPath, currentDigest string, regOpts *RegistryOptions) (*ManifestV2, error) {
 	requestURL := mp.BaseURL().JoinPath("v2", mp.GetNamespaceRepository(), "manifests", mp.Tag)
 	headers := make(http.Header)
 	headers.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
+	if currentDigest != "" {
+		headers.Set("If-None-Match", currentDigest)
+	}
 	resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, regOpts)
 	if err != nil {
 		return nil, err
 	}
 	defer resp.Body.Close()
+	// todo we can potentially read the manifest locally and return it here
+	if resp.StatusCode == http.StatusNotModified {
+		return nil, nil
+	}
 	var m *ManifestV2
 	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
 		return nil, err
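The pullModelManifest change above is a standard conditional GET: sending the locally known digest in If-None-Match lets the registry answer 304 Not Modified instead of re-sending the manifest. A minimal sketch of the pattern with net/http, using a placeholder URL and digest (the real code goes through makeRequestWithRetry and the registry auth flow):

package main

import (
	"fmt"
	"net/http"
)

func fetchManifestIfChanged(url, currentDigest string) (changed bool, err error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return false, err
	}
	req.Header.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
	if currentDigest != "" {
		// Ask the server to skip the body if the manifest still matches this digest.
		req.Header.Set("If-None-Match", currentDigest)
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusNotModified {
		// Local copy is already current; nothing to download.
		return false, nil
	}
	// Otherwise the body holds the (new) manifest and would be decoded here.
	return true, nil
}

func main() {
	// Placeholder URL and digest, purely for illustration.
	changed, err := fetchManifestIfChanged("https://registry.example.com/v2/library/llama2/manifests/latest", "sha256:abc123")
	fmt.Println(changed, err)
}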


@@ -186,7 +186,13 @@ func GenerateHandler(c *gin.Context) {
 		return
 	}
-	sessionDuration := defaultSessionDuration
+	var sessionDuration time.Duration
+	if req.KeepAlive == nil {
+		sessionDuration = defaultSessionDuration
+	} else {
+		sessionDuration = req.KeepAlive.Duration
+	}
 	if err := load(c, model, opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
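The keep_alive handling repeated in the generate, embedding, and chat handlers reduces to one rule: no keep_alive in the request means the default session duration, otherwise use the requested value. A tiny sketch of that selection, with a stand-in Duration wrapper and an assumed default rather than the real api types:

package main

import (
	"fmt"
	"time"
)

// Duration stands in for the request's keep_alive wrapper type.
type Duration struct {
	time.Duration
}

const defaultSessionDuration = 5 * time.Minute // assumed default, for illustration only

func sessionDuration(keepAlive *Duration) time.Duration {
	if keepAlive == nil {
		return defaultSessionDuration
	}
	return keepAlive.Duration
}

func main() {
	fmt.Println(sessionDuration(nil))                                   // default
	fmt.Println(sessionDuration(&Duration{Duration: 10 * time.Minute})) // explicit keep_alive
}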
@@ -378,7 +384,14 @@ func EmbeddingHandler(c *gin.Context) {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
-	sessionDuration := defaultSessionDuration
+	var sessionDuration time.Duration
+	if req.KeepAlive == nil {
+		sessionDuration = defaultSessionDuration
+	} else {
+		sessionDuration = req.KeepAlive.Duration
+	}
 	if err := load(c, model, opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@@ -438,7 +451,7 @@ func PullModelHandler(c *gin.Context) {
 		ctx, cancel := context.WithCancel(c.Request.Context())
 		defer cancel()
-		if err := PullModel(ctx, model, regOpts, fn); err != nil {
+		if err := PullModel(ctx, model, req.CurrentDigest, regOpts, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
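With CurrentDigest on the pull request, a client can implement something like the --upgrade-all flow: list local models and re-pull each one with its known digest, letting the server skip models whose manifests are unchanged. A hypothetical client-side sketch against the HTTP API (the response shapes and the JSON name for CurrentDigest are assumptions; only the /api/tags and /api/pull paths come from the project):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// Trimmed-down assumptions about the list and pull payloads.
type listResponse struct {
	Models []struct {
		Name   string `json:"name"`
		Digest string `json:"digest"`
	} `json:"models"`
}

type pullRequest struct {
	Name          string `json:"name"`
	CurrentDigest string `json:"current_digest,omitempty"` // assumed JSON name for CurrentDigest
}

func main() {
	resp, err := http.Get("http://localhost:11434/api/tags")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	var list listResponse
	if err := json.NewDecoder(resp.Body).Decode(&list); err != nil {
		fmt.Println(err)
		return
	}

	for _, m := range list.Models {
		// Re-pull with the local digest; an up-to-date model should return quickly.
		body, _ := json.Marshal(pullRequest{Name: m.Name, CurrentDigest: m.Digest})
		if _, err := http.Post("http://localhost:11434/api/pull", "application/json", bytes.NewReader(body)); err != nil {
			fmt.Println(m.Name, err)
		}
	}
}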
@@ -659,6 +672,8 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	}
 	modelDetails := api.ModelDetails{
+		ParentModel: model.ParentModel,
+		Digest: "sha256:" + model.Digest,
 		Format: model.Config.ModelFormat,
 		Family: model.Config.ModelFamily,
 		Families: model.Config.ModelFamilies,
@@ -674,11 +689,17 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 		model.Template = req.Template
 	}
+	msgs := make([]api.Message, 0)
+	for _, msg := range model.Messages {
+		msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content})
+	}
 	resp := &api.ShowResponse{
 		License: strings.Join(model.License, "\n"),
 		System: model.System,
 		Template: model.Template,
 		Details: modelDetails,
+		Messages: msgs,
 	}
 	var params []string
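Exposing the stored messages through the show response means a client can seed a conversation with whatever MESSAGE history the model was created with before appending the user's new turn. A small sketch of that merge, assuming the stored messages have already been fetched; the local Message type mirrors the role/content fields in the diff:

package main

import "fmt"

type Message struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

// seedChat prepends the model's stored messages (from the show response)
// to the message the user is about to send.
func seedChat(stored []Message, userTurn Message) []Message {
	out := make([]Message, 0, len(stored)+1)
	out = append(out, stored...)
	return append(out, userTurn)
}

func main() {
	stored := []Message{
		{Role: "user", Content: "Why is the sky blue?"},
		{Role: "assistant", Content: "Because of Rayleigh scattering."},
	}
	msgs := seedChat(stored, Message{Role: "user", Content: "And why are sunsets red?"})
	fmt.Println(len(msgs), "messages to send")
}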
@@ -1067,7 +1088,14 @@ func ChatHandler(c *gin.Context) {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
-	sessionDuration := defaultSessionDuration
+	var sessionDuration time.Duration
+	if req.KeepAlive == nil {
+		sessionDuration = defaultSessionDuration
+	} else {
+		sessionDuration = req.KeepAlive.Duration
+	}
 	if err := load(c, model, opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@@ -1075,7 +1103,13 @@ func ChatHandler(c *gin.Context) {
 	// an empty request loads the model
 	if len(req.Messages) == 0 {
-		c.JSON(http.StatusOK, api.ChatResponse{CreatedAt: time.Now().UTC(), Model: req.Model, Done: true, Message: api.Message{Role: "assistant"}})
+		resp := api.ChatResponse{
+			CreatedAt: time.Now().UTC(),
+			Model: req.Model,
+			Done: true,
+			Message: api.Message{Role: "assistant"},
+		}
+		c.JSON(http.StatusOK, resp)
 		return
 	}
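Combined with the keep_alive handling above, this empty-message path gives a cheap way to preload a model and keep it resident: post a chat request with no messages and the desired keep_alive. A sketch against a local server (the "10m" duration-string form of keep_alive and the model name are assumptions for illustration):

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// No messages: the server only loads the model. keep_alive controls how long
	// it stays in memory afterwards.
	body := []byte(`{"model": "llama2", "messages": [], "keep_alive": "10m"}`)

	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("preload status:", resp.Status)
}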