Compare commits
56 Commits
mattw/quan
...
cuda-searc
Author | SHA1 | Date | |
---|---|---|---|
![]() |
be721ca0df | ||
![]() |
34344d801c | ||
![]() |
e868c8a5c7 | ||
![]() |
c336693f07 | ||
![]() |
e89dc1d54b | ||
![]() |
1961a81f03 | ||
![]() |
8a8c7e7f8d | ||
![]() |
6df83e6daa | ||
![]() |
62023177f6 | ||
![]() |
6164f378f2 | ||
![]() |
f387e9631b | ||
![]() |
6566387ae3 | ||
![]() |
37708931fb | ||
![]() |
f6cb0a553c | ||
![]() |
2680078c13 | ||
![]() |
f1b7e5f560 | ||
![]() |
cb534e6ac2 | ||
![]() |
58ce2d8273 | ||
![]() |
18ddf6d57d | ||
![]() |
61e6502449 | ||
![]() |
08f1e18965 | ||
![]() |
7e8f7c8358 | ||
![]() |
3f3eb19a3b | ||
![]() |
059ae4585e | ||
![]() |
6347f501ca | ||
![]() |
5feec959ad | ||
![]() |
dbdd50b283 | ||
![]() |
d74ce6bd4f | ||
![]() |
57942b4676 | ||
![]() |
e0d05b0f1e | ||
![]() |
2d9dd14f27 | ||
![]() |
1caa56128f | ||
![]() |
0101e76dbe | ||
![]() |
2ef9352b94 | ||
![]() |
5580ae2472 | ||
![]() |
3a9f447141 | ||
![]() |
9c2941e61b | ||
![]() |
238ac5e765 | ||
![]() |
4f4980b66b | ||
![]() |
22e93efa41 | ||
![]() |
2909dce894 | ||
![]() |
df32537312 | ||
![]() |
3367b5f3df | ||
![]() |
46edbbc518 | ||
![]() |
d2ff18cd6b | ||
![]() |
df086d3c8c | ||
![]() |
8baaaa39c0 | ||
![]() |
f9961c70ae | ||
![]() |
cd8fad3398 | ||
![]() |
9983fa5f4e | ||
![]() |
dfda91c2ee | ||
![]() |
fac9060da5 | ||
![]() |
a554616f8e | ||
![]() |
77d96da94b | ||
![]() |
0d6e3565ae | ||
![]() |
e201efa14b |
@@ -2,7 +2,7 @@
|
||||
ollama
|
||||
app
|
||||
dist
|
||||
llm/llama.cpp/gguf
|
||||
llm/llama.cpp
|
||||
.env
|
||||
.cache
|
||||
test_data
|
9
.gitmodules
vendored
9
.gitmodules
vendored
@@ -1,5 +1,4 @@
|
||||
[submodule "llm/llama.cpp/gguf"]
|
||||
path = llm/llama.cpp/gguf
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
ignore = dirty
|
||||
shallow = true
|
||||
[submodule "llama.cpp"]
|
||||
path = llm/llama.cpp
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
shallow = true
|
159
Dockerfile.build
159
Dockerfile.build
@@ -1,74 +1,101 @@
|
||||
# Ubuntu 20.04 amd64 dependencies
|
||||
FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
|
||||
ARG CUDA_VERSION=11.3.1-1
|
||||
ARG CMAKE_VERSION=3.22.1
|
||||
# ROCm only supports amd64
|
||||
ARG ROCM_VERSION=6.0
|
||||
ARG CLBLAST_VER=1.6.1
|
||||
|
||||
# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
|
||||
RUN apt-get update && \
|
||||
apt-get install -y wget gnupg && \
|
||||
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
|
||||
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
|
||||
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
|
||||
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
|
||||
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
|
||||
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
|
||||
mkdir --parents --mode=0755 /etc/apt/keyrings && \
|
||||
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
|
||||
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
|
||||
echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
|
||||
echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
|
||||
echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
|
||||
apt-get update && \
|
||||
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
|
||||
|
||||
# CLBlast
|
||||
RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
|
||||
cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install
|
||||
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
|
||||
# Ubuntu 20.04 arm64 dependencies
|
||||
FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64
|
||||
ARG CUDA_VERSION=11.3.1-1
|
||||
ARG CMAKE_VERSION=3.27.6
|
||||
RUN apt-get update && \
|
||||
apt-get install -y wget gnupg && \
|
||||
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \
|
||||
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
|
||||
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa//3bf863cc.pub && \
|
||||
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \
|
||||
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
|
||||
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
|
||||
apt-get update && \
|
||||
apt-cache madison cuda && \
|
||||
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION}
|
||||
|
||||
FROM base-${TARGETARCH}
|
||||
ARG TARGETARCH
|
||||
ARG GOFLAGS="'-ldflags -w -s'"
|
||||
ARG CGO_CFLAGS
|
||||
ARG GOLANG_VERSION=1.21.3
|
||||
ARG CMAKE_VERSION=3.22.1
|
||||
ARG CUDA_VERSION=11.3.1
|
||||
ARG ROCM_VERSION=5.7.1
|
||||
|
||||
# Common toolchain
|
||||
RUN apt-get update && \
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
|
||||
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10
|
||||
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
|
||||
|
||||
# install go
|
||||
ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz
|
||||
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go${GOLANG_VERSION}.tar.gz
|
||||
ARG CMAKE_VERSION
|
||||
|
||||
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl \
|
||||
&& yum update -y \
|
||||
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
|
||||
ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-x86_64.tar.gz /tmp/cmake-$CMAKE_VERSION.tar.gz
|
||||
RUN tar -zx -C /usr --strip-components 1 </tmp/cmake-$CMAKE_VERSION.tar.gz
|
||||
|
||||
# build the final binary
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
|
||||
ENV GOOS=linux
|
||||
ENV GOARCH=$TARGETARCH
|
||||
ENV GOFLAGS=$GOFLAGS
|
||||
ENV CGO_CFLAGS=${CGO_CFLAGS}
|
||||
WORKDIR llm/generate
|
||||
RUN sh gen_linux.sh
|
||||
|
||||
RUN /usr/local/go/bin/go generate ./... && \
|
||||
/usr/local/go/bin/go build .
|
||||
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
|
||||
|
||||
ARG CMAKE_VERSION
|
||||
|
||||
RUN dnf install -y git cmake
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
|
||||
WORKDIR llm/generate
|
||||
RUN sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:$ROCM_VERSION-complete AS rocm-build-amd64
|
||||
|
||||
ARG CMAKE_VERSION
|
||||
|
||||
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl \
|
||||
&& yum update -y \
|
||||
&& yum remove -y git \
|
||||
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
ENV LIBRARY_PATH /opt/amdgpu/lib64
|
||||
|
||||
ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-x86_64.tar.gz /tmp/cmake-$CMAKE_VERSION.tar.gz
|
||||
RUN tar -zx -C /usr --strip-components 1 </tmp/cmake-$CMAKE_VERSION.tar.gz
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
|
||||
WORKDIR llm/generate
|
||||
RUN sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/amd64 centos:7 AS build-amd64
|
||||
ENV CGO_ENABLED 1
|
||||
|
||||
ARG GOLANG_VERSION
|
||||
ARG GOFLAGS
|
||||
ARG CGO_FLAGS
|
||||
|
||||
RUN yum install -y centos-release-scl \
|
||||
&& yum update -y \
|
||||
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
|
||||
ADD https://dl.google.com/go/go$GOLANG_VERSION.linux-amd64.tar.gz /tmp/go-$GOLANG_VERSION.tar.gz
|
||||
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go-$GOLANG_VERSION.tar.gz
|
||||
ENV PATH /usr/local/go/bin:$PATH
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cpu/lib llm/llama.cpp/build/linux/cpu/lib
|
||||
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cuda/lib llm/llama.cpp/build/linux/cuda/lib
|
||||
COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/rocm/lib llm/llama.cpp/build/linux/rocm/lib
|
||||
RUN go build .
|
||||
|
||||
FROM --platform=linux/arm64 centos:7 AS build-arm64
|
||||
ENV CGO_ENABLED 1
|
||||
|
||||
ARG GOLANG_VERSION
|
||||
ARG GOFLAGS
|
||||
ARG CGO_FLAGS
|
||||
|
||||
RUN yum install -y centos-release-scl \
|
||||
&& yum update -y \
|
||||
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
|
||||
ADD https://dl.google.com/go/go$GOLANG_VERSION.linux-arm64.tar.gz /tmp/go-$GOLANG_VERSION.tar.gz
|
||||
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go-$GOLANG_VERSION.tar.gz
|
||||
ENV PATH /usr/local/go/bin:$PATH
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cpu/lib llm/llama.cpp/build/linux/cpu/lib
|
||||
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cuda/lib llm/llama.cpp/build/linux/cuda/lib
|
||||
RUN go build .
|
||||
|
||||
FROM build-$TARGETARCH
|
||||
|
@@ -292,6 +292,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
|
||||
- [LiteLLM](https://github.com/BerriAI/litellm)
|
||||
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
|
||||
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
|
||||
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
|
||||
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
|
||||
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
|
||||
|
@@ -309,6 +309,13 @@ func (c *Client) Heartbeat(ctx context.Context) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
|
||||
var resp EmbeddingResponse
|
||||
if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &resp, nil
|
||||
}
|
||||
|
||||
func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
|
||||
if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {
|
||||
|
@@ -148,7 +148,12 @@ type DeleteRequest struct {
|
||||
}
|
||||
|
||||
type ShowRequest struct {
|
||||
Name string `json:"name"`
|
||||
Name string `json:"name"`
|
||||
Model string `json:"model"`
|
||||
System string `json:"system"`
|
||||
Template string `json:"template"`
|
||||
|
||||
Options map[string]interface{} `json:"options"`
|
||||
}
|
||||
|
||||
type ShowResponse struct {
|
||||
|
527
cmd/cmd.go
527
cmd/cmd.go
@@ -17,7 +17,6 @@ import (
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strings"
|
||||
"syscall"
|
||||
@@ -26,14 +25,12 @@ import (
|
||||
"github.com/olekukonko/tablewriter"
|
||||
"github.com/spf13/cobra"
|
||||
"golang.org/x/crypto/ssh"
|
||||
"golang.org/x/exp/slices"
|
||||
"golang.org/x/term"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
"github.com/jmorganca/ollama/format"
|
||||
"github.com/jmorganca/ollama/parser"
|
||||
"github.com/jmorganca/ollama/progress"
|
||||
"github.com/jmorganca/ollama/readline"
|
||||
"github.com/jmorganca/ollama/server"
|
||||
"github.com/jmorganca/ollama/version"
|
||||
)
|
||||
@@ -156,7 +153,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
var statusError api.StatusError
|
||||
switch {
|
||||
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
|
||||
if err := PullHandler(cmd, args); err != nil {
|
||||
if err := PullHandler(cmd, []string{name}); err != nil {
|
||||
return err
|
||||
}
|
||||
case err != nil:
|
||||
@@ -572,31 +569,12 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
|
||||
}
|
||||
|
||||
if err := client.Generate(ctx, &request, fn); err != nil {
|
||||
switch {
|
||||
case errors.Is(err, context.Canceled):
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return nil
|
||||
case strings.Contains(err.Error(), "unsupported model format"):
|
||||
// pull and retry to see if the model has been updated
|
||||
parts := strings.Split(opts.Model, string(os.PathSeparator))
|
||||
if len(parts) == 1 {
|
||||
// this is a library model, log some info
|
||||
fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
|
||||
}
|
||||
if err := PullHandler(cmd, []string{opts.Model}); err != nil {
|
||||
fmt.Printf("Error: %s\n", err)
|
||||
return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
|
||||
}
|
||||
// retry
|
||||
if err := client.Generate(ctx, &request, fn); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
default:
|
||||
return err
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
if opts.Prompt != "" {
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
@@ -621,459 +599,6 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
type MultilineState int
|
||||
|
||||
const (
|
||||
MultilineNone MultilineState = iota
|
||||
MultilinePrompt
|
||||
MultilineSystem
|
||||
MultilineTemplate
|
||||
)
|
||||
|
||||
func modelIsMultiModal(cmd *cobra.Command, name string) bool {
|
||||
// get model details
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
fmt.Println("error: couldn't connect to ollama server")
|
||||
return false
|
||||
}
|
||||
|
||||
req := api.ShowRequest{Name: name}
|
||||
resp, err := client.Show(cmd.Context(), &req)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return slices.Contains(resp.Details.Families, "clip")
|
||||
}
|
||||
|
||||
func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
|
||||
multiModal := modelIsMultiModal(cmd, opts.Model)
|
||||
|
||||
// load the model
|
||||
loadOpts := generateOptions{
|
||||
Model: opts.Model,
|
||||
Prompt: "",
|
||||
Images: []ImageData{},
|
||||
}
|
||||
if err := generate(cmd, loadOpts); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
usage := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /set Set session variables")
|
||||
fmt.Fprintln(os.Stderr, " /show Show model information")
|
||||
fmt.Fprintln(os.Stderr, " /bye Exit")
|
||||
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
|
||||
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
usageSet := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
|
||||
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
|
||||
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
|
||||
fmt.Fprintln(os.Stderr, " /set history Enable history")
|
||||
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
|
||||
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
|
||||
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
|
||||
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
|
||||
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
|
||||
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
|
||||
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
usageShortcuts := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + a Move to the beginning of the line (Home)")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + e Move to the end of the line (End)")
|
||||
fmt.Fprintln(os.Stderr, " Alt + b Move back (left) one word")
|
||||
fmt.Fprintln(os.Stderr, " Alt + f Move forward (right) one word")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + k Delete the sentence after the cursor")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + u Delete the sentence before the cursor")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + l Clear the screen")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + c Stop the model from responding")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + d Exit ollama (/bye)")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
usageShow := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /show license Show model license")
|
||||
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
|
||||
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
|
||||
fmt.Fprintln(os.Stderr, " /show system Show system message")
|
||||
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
// only list out the most common parameters
|
||||
usageParameters := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Parameters:")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter seed <int> Random number seed")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
scanner, err := readline.New(readline.Prompt{
|
||||
Prompt: ">>> ",
|
||||
AltPrompt: "... ",
|
||||
Placeholder: "Send a message (/? for help)",
|
||||
AltPlaceholder: `Use """ to end multi-line input`,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Print(readline.StartBracketedPaste)
|
||||
defer fmt.Printf(readline.EndBracketedPaste)
|
||||
|
||||
var multiline MultilineState
|
||||
var prompt string
|
||||
|
||||
for {
|
||||
line, err := scanner.Readline()
|
||||
switch {
|
||||
case errors.Is(err, io.EOF):
|
||||
fmt.Println()
|
||||
return nil
|
||||
case errors.Is(err, readline.ErrInterrupt):
|
||||
if line == "" {
|
||||
fmt.Println("\nUse Ctrl + d or /bye to exit.")
|
||||
}
|
||||
|
||||
scanner.Prompt.UseAlt = false
|
||||
prompt = ""
|
||||
|
||||
continue
|
||||
case err != nil:
|
||||
return err
|
||||
}
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(prompt, `"""`):
|
||||
// if the prompt so far starts with """ then we're in multiline mode
|
||||
// and we need to keep reading until we find a line that ends with """
|
||||
cut, found := strings.CutSuffix(line, `"""`)
|
||||
prompt += cut
|
||||
|
||||
if !found {
|
||||
prompt += "\n"
|
||||
continue
|
||||
}
|
||||
|
||||
prompt = strings.TrimPrefix(prompt, `"""`)
|
||||
scanner.Prompt.UseAlt = false
|
||||
|
||||
switch multiline {
|
||||
case MultilineSystem:
|
||||
opts.System = prompt
|
||||
prompt = ""
|
||||
fmt.Println("Set system message.")
|
||||
case MultilineTemplate:
|
||||
opts.Template = prompt
|
||||
prompt = ""
|
||||
fmt.Println("Set prompt template.")
|
||||
}
|
||||
multiline = MultilineNone
|
||||
case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
|
||||
scanner.Prompt.UseAlt = true
|
||||
multiline = MultilinePrompt
|
||||
prompt += line + "\n"
|
||||
continue
|
||||
case scanner.Pasting:
|
||||
prompt += line + "\n"
|
||||
continue
|
||||
case strings.HasPrefix(line, "/list"):
|
||||
args := strings.Fields(line)
|
||||
if err := ListHandler(cmd, args[1:]); err != nil {
|
||||
return err
|
||||
}
|
||||
case strings.HasPrefix(line, "/set"):
|
||||
args := strings.Fields(line)
|
||||
if len(args) > 1 {
|
||||
switch args[1] {
|
||||
case "history":
|
||||
scanner.HistoryEnable()
|
||||
case "nohistory":
|
||||
scanner.HistoryDisable()
|
||||
case "wordwrap":
|
||||
opts.WordWrap = true
|
||||
fmt.Println("Set 'wordwrap' mode.")
|
||||
case "nowordwrap":
|
||||
opts.WordWrap = false
|
||||
fmt.Println("Set 'nowordwrap' mode.")
|
||||
case "verbose":
|
||||
cmd.Flags().Set("verbose", "true")
|
||||
fmt.Println("Set 'verbose' mode.")
|
||||
case "quiet":
|
||||
cmd.Flags().Set("verbose", "false")
|
||||
fmt.Println("Set 'quiet' mode.")
|
||||
case "format":
|
||||
if len(args) < 3 || args[2] != "json" {
|
||||
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
|
||||
} else {
|
||||
opts.Format = args[2]
|
||||
fmt.Printf("Set format to '%s' mode.\n", args[2])
|
||||
}
|
||||
case "noformat":
|
||||
opts.Format = ""
|
||||
fmt.Println("Disabled format.")
|
||||
case "parameter":
|
||||
if len(args) < 4 {
|
||||
usageParameters()
|
||||
continue
|
||||
}
|
||||
var params []string
|
||||
for _, p := range args[3:] {
|
||||
params = append(params, p)
|
||||
}
|
||||
fp, err := api.FormatParams(map[string][]string{args[2]: params})
|
||||
if err != nil {
|
||||
fmt.Printf("Couldn't set parameter: %q\n\n", err)
|
||||
continue
|
||||
}
|
||||
fmt.Printf("Set parameter '%s' to '%s'\n\n", args[2], strings.Join(params, ", "))
|
||||
opts.Options[args[2]] = fp[args[2]]
|
||||
case "system", "template":
|
||||
if len(args) < 3 {
|
||||
usageSet()
|
||||
continue
|
||||
}
|
||||
line := strings.Join(args[2:], " ")
|
||||
line = strings.TrimPrefix(line, `"""`)
|
||||
if strings.HasPrefix(args[2], `"""`) {
|
||||
cut, found := strings.CutSuffix(line, `"""`)
|
||||
prompt += cut
|
||||
if found {
|
||||
if args[1] == "system" {
|
||||
opts.System = prompt
|
||||
fmt.Println("Set system message.")
|
||||
} else {
|
||||
opts.Template = prompt
|
||||
fmt.Println("Set prompt template.")
|
||||
}
|
||||
prompt = ""
|
||||
} else {
|
||||
prompt = `"""` + prompt + "\n"
|
||||
if args[1] == "system" {
|
||||
multiline = MultilineSystem
|
||||
} else {
|
||||
multiline = MultilineTemplate
|
||||
}
|
||||
scanner.Prompt.UseAlt = true
|
||||
}
|
||||
} else {
|
||||
opts.System = line
|
||||
fmt.Println("Set system message.")
|
||||
}
|
||||
default:
|
||||
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
|
||||
}
|
||||
} else {
|
||||
usageSet()
|
||||
}
|
||||
case strings.HasPrefix(line, "/show"):
|
||||
args := strings.Fields(line)
|
||||
if len(args) > 1 {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
fmt.Println("error: couldn't connect to ollama server")
|
||||
return err
|
||||
}
|
||||
resp, err := client.Show(cmd.Context(), &api.ShowRequest{Name: opts.Model})
|
||||
if err != nil {
|
||||
fmt.Println("error: couldn't get model")
|
||||
return err
|
||||
}
|
||||
|
||||
switch args[1] {
|
||||
case "license":
|
||||
if resp.License == "" {
|
||||
fmt.Print("No license was specified for this model.\n\n")
|
||||
} else {
|
||||
fmt.Println(resp.License)
|
||||
}
|
||||
case "modelfile":
|
||||
fmt.Println(resp.Modelfile)
|
||||
case "parameters":
|
||||
if resp.Parameters == "" {
|
||||
fmt.Print("No parameters were specified for this model.\n\n")
|
||||
} else {
|
||||
if len(opts.Options) > 0 {
|
||||
fmt.Println("User defined parameters:")
|
||||
for k, v := range opts.Options {
|
||||
fmt.Printf("%-*s %v\n", 30, k, v)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
fmt.Println("Model defined parameters:")
|
||||
fmt.Println(resp.Parameters)
|
||||
}
|
||||
case "system":
|
||||
switch {
|
||||
case opts.System != "":
|
||||
fmt.Println(opts.System + "\n")
|
||||
case resp.System != "":
|
||||
fmt.Println(resp.System + "\n")
|
||||
default:
|
||||
fmt.Print("No system message was specified for this model.\n\n")
|
||||
}
|
||||
case "template":
|
||||
switch {
|
||||
case opts.Template != "":
|
||||
fmt.Println(opts.Template + "\n")
|
||||
case resp.Template != "":
|
||||
fmt.Println(resp.Template)
|
||||
default:
|
||||
fmt.Print("No prompt template was specified for this model.\n\n")
|
||||
}
|
||||
default:
|
||||
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
|
||||
}
|
||||
} else {
|
||||
usageShow()
|
||||
}
|
||||
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
|
||||
args := strings.Fields(line)
|
||||
if len(args) > 1 {
|
||||
switch args[1] {
|
||||
case "set", "/set":
|
||||
usageSet()
|
||||
case "show", "/show":
|
||||
usageShow()
|
||||
case "shortcut", "shortcuts":
|
||||
usageShortcuts()
|
||||
}
|
||||
} else {
|
||||
usage()
|
||||
}
|
||||
case line == "/exit", line == "/bye":
|
||||
return nil
|
||||
case strings.HasPrefix(line, "/"):
|
||||
args := strings.Fields(line)
|
||||
isFile := false
|
||||
|
||||
if multiModal {
|
||||
for _, f := range extractFileNames(line) {
|
||||
if strings.HasPrefix(f, args[0]) {
|
||||
isFile = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if isFile {
|
||||
prompt += line
|
||||
} else {
|
||||
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
|
||||
continue
|
||||
}
|
||||
default:
|
||||
prompt += line
|
||||
}
|
||||
|
||||
if len(prompt) > 0 && multiline == MultilineNone {
|
||||
opts.Prompt = prompt
|
||||
if multiModal {
|
||||
newPrompt, images, err := extractFileData(prompt)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
opts.Prompt = newPrompt
|
||||
|
||||
// reset the context if we find another image
|
||||
if len(images) > 0 {
|
||||
opts.Images = images
|
||||
ctx := cmd.Context()
|
||||
ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
|
||||
cmd.SetContext(ctx)
|
||||
}
|
||||
if len(opts.Images) == 0 {
|
||||
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
|
||||
fmt.Println()
|
||||
prompt = ""
|
||||
continue
|
||||
}
|
||||
}
|
||||
if err := generate(cmd, opts); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
prompt = ""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// filePathUnescaper maps each supported backslash escape to the literal
// character it stands for. A strings.Replacer scans its input once from left
// to right without overlapping matches, so the result is deterministic and an
// escaped backslash is never re-read as the start of another escape.
var filePathUnescaper = strings.NewReplacer(
	`\\`, `\`, // escaped backslash
	`\ `, " ", // escaped space
	`\(`, "(", // escaped left parenthesis
	`\)`, ")", // escaped right parenthesis
	`\[`, "[", // escaped left square bracket
	`\]`, "]", // escaped right square bracket
	`\{`, "{", // escaped left curly brace
	`\}`, "}", // escaped right curly brace
	`\$`, "$", // escaped dollar sign
	`\&`, "&", // escaped ampersand
	`\;`, ";", // escaped semicolon
	`\'`, "'", // escaped single quote
	`\*`, "*", // escaped asterisk
	`\?`, "?", // escaped question mark
)

// normalizeFilePath returns fp with the backslash escapes listed in
// filePathUnescaper replaced by their literal characters. Text that contains
// none of those escapes is returned unchanged.
//
// The previous implementation applied the replacements one at a time while
// ranging over a map; Go randomizes map iteration order, so inputs such as
// `a\\ b` could produce different results on different runs.
func normalizeFilePath(fp string) string {
	return filePathUnescaper.Replace(fp)
}
|
||||
|
||||
// imageFilePathRegexp matches a path that begins with "./" or "/", continues
// with the shortest possible run of characters (the class admits non-space
// characters, backslashes and plain spaces), and ends in ".jpg", ".jpeg",
// ".png" or ".svg" (case-insensitive) followed by a word boundary.
//
// It is compiled once at package scope rather than on every call.
var imageFilePathRegexp = regexp.MustCompile(`(?:\./|/)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`)

// extractFileNames returns every substring of input that matches
// imageFilePathRegexp, in order of appearance. The matches are returned
// verbatim, so any backslash escapes are still present. It returns nil when
// nothing matches.
func extractFileNames(input string) []string {
	return imageFilePathRegexp.FindAllString(input, -1)
}
|
||||
|
||||
func extractFileData(input string) (string, []ImageData, error) {
|
||||
filePaths := extractFileNames(input)
|
||||
var imgs []ImageData
|
||||
|
||||
for _, fp := range filePaths {
|
||||
nfp := normalizeFilePath(fp)
|
||||
data, err := getImageData(nfp)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("Couldn't process image: %q\n", err)
|
||||
return "", imgs, err
|
||||
}
|
||||
fmt.Printf("Added image '%s'\n", nfp)
|
||||
input = strings.ReplaceAll(input, fp, "")
|
||||
imgs = append(imgs, data)
|
||||
}
|
||||
return input, imgs, nil
|
||||
}
|
||||
|
||||
func RunServer(cmd *cobra.Command, _ []string) error {
|
||||
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
|
||||
if err != nil {
|
||||
@@ -1095,50 +620,6 @@ func RunServer(cmd *cobra.Command, _ []string) error {
|
||||
return server.Serve(ln)
|
||||
}
|
||||
|
||||
// getImageData opens the file at filePath, checks that it looks like a
// supported image and is not too large, and returns its complete contents.
//
// The content type is sniffed with http.DetectContentType from at most the
// first 512 bytes; only the bytes actually read are passed to the sniffer, so
// a short file is not classified on zero padding. Files whose detected type is
// not one of the accepted image types, or whose size exceeds 100MB, are
// rejected with an error. Errors from opening, reading, stat-ing or seeking
// the file are returned unchanged (an empty file yields the read error).
func getImageData(filePath string) ([]byte, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	head := make([]byte, 512)
	n, err := file.Read(head)
	if err != nil {
		return nil, err
	}

	contentType := http.DetectContentType(head[:n])
	switch contentType {
	case "image/jpeg", "image/jpg", "image/svg+xml", "image/png":
		// accepted
	default:
		return nil, fmt.Errorf("invalid image type: %s", contentType)
	}

	info, err := file.Stat()
	if err != nil {
		return nil, err
	}

	// Reject oversized files before allocating a buffer for them.
	const maxSize = 100 * 1024 * 1024 // 100MB in bytes
	if info.Size() > maxSize {
		return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
	}

	// Rewind past the sniffed prefix and read the whole file.
	if _, err := file.Seek(0, io.SeekStart); err != nil {
		return nil, err
	}

	data := make([]byte, info.Size())
	if _, err := io.ReadFull(file, data); err != nil {
		return nil, err
	}

	return data, nil
}
|
||||
|
||||
func initializeKeypair() error {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
|
545
cmd/interactive.go
Normal file
545
cmd/interactive.go
Normal file
@@ -0,0 +1,545 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
"golang.org/x/exp/slices"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
"github.com/jmorganca/ollama/readline"
|
||||
)
|
||||
|
||||
type MultilineState int
|
||||
|
||||
const (
|
||||
MultilineNone MultilineState = iota
|
||||
MultilinePrompt
|
||||
MultilineSystem
|
||||
MultilineTemplate
|
||||
)
|
||||
|
||||
func modelIsMultiModal(cmd *cobra.Command, name string) bool {
|
||||
// get model details
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
fmt.Println("error: couldn't connect to ollama server")
|
||||
return false
|
||||
}
|
||||
|
||||
req := api.ShowRequest{Name: name}
|
||||
resp, err := client.Show(cmd.Context(), &req)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return slices.Contains(resp.Details.Families, "clip")
|
||||
}
|
||||
|
||||
func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
|
||||
multiModal := modelIsMultiModal(cmd, opts.Model)
|
||||
|
||||
// load the model
|
||||
loadOpts := generateOptions{
|
||||
Model: opts.Model,
|
||||
Prompt: "",
|
||||
Images: []ImageData{},
|
||||
}
|
||||
if err := generate(cmd, loadOpts); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
usage := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /set Set session variables")
|
||||
fmt.Fprintln(os.Stderr, " /show Show model information")
|
||||
fmt.Fprintln(os.Stderr, " /bye Exit")
|
||||
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
|
||||
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
usageSet := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
|
||||
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
|
||||
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
|
||||
fmt.Fprintln(os.Stderr, " /set history Enable history")
|
||||
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
|
||||
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
|
||||
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
|
||||
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
|
||||
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
|
||||
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
|
||||
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
usageShortcuts := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + a Move to the beginning of the line (Home)")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + e Move to the end of the line (End)")
|
||||
fmt.Fprintln(os.Stderr, " Alt + b Move back (left) one word")
|
||||
fmt.Fprintln(os.Stderr, " Alt + f Move forward (right) one word")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + k Delete the sentence after the cursor")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + u Delete the sentence before the cursor")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + l Clear the screen")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + c Stop the model from responding")
|
||||
fmt.Fprintln(os.Stderr, " Ctrl + d Exit ollama (/bye)")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
usageShow := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /show info Show details for this model")
|
||||
fmt.Fprintln(os.Stderr, " /show license Show model license")
|
||||
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
|
||||
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
|
||||
fmt.Fprintln(os.Stderr, " /show system Show system message")
|
||||
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
// only list out the most common parameters
|
||||
usageParameters := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Parameters:")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter seed <int> Random number seed")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
|
||||
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
scanner, err := readline.New(readline.Prompt{
|
||||
Prompt: ">>> ",
|
||||
AltPrompt: "... ",
|
||||
Placeholder: "Send a message (/? for help)",
|
||||
AltPlaceholder: `Use """ to end multi-line input`,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Print(readline.StartBracketedPaste)
|
||||
defer fmt.Printf(readline.EndBracketedPaste)
|
||||
|
||||
var sb strings.Builder
|
||||
var multiline MultilineState
|
||||
|
||||
for {
|
||||
line, err := scanner.Readline()
|
||||
switch {
|
||||
case errors.Is(err, io.EOF):
|
||||
fmt.Println()
|
||||
return nil
|
||||
case errors.Is(err, readline.ErrInterrupt):
|
||||
if line == "" {
|
||||
fmt.Println("\nUse Ctrl + d or /bye to exit.")
|
||||
}
|
||||
|
||||
scanner.Prompt.UseAlt = false
|
||||
sb.Reset()
|
||||
|
||||
continue
|
||||
case err != nil:
|
||||
return err
|
||||
}
|
||||
|
||||
switch {
|
||||
case multiline != MultilineNone:
|
||||
// check if there's a multiline terminating string
|
||||
before, ok := strings.CutSuffix(line, `"""`)
|
||||
sb.WriteString(before)
|
||||
if !ok {
|
||||
fmt.Fprintln(&sb)
|
||||
continue
|
||||
}
|
||||
|
||||
switch multiline {
|
||||
case MultilineSystem:
|
||||
opts.System = sb.String()
|
||||
fmt.Println("Set system message.")
|
||||
sb.Reset()
|
||||
case MultilineTemplate:
|
||||
opts.Template = sb.String()
|
||||
fmt.Println("Set prompt template.")
|
||||
sb.Reset()
|
||||
}
|
||||
|
||||
multiline = MultilineNone
|
||||
scanner.Prompt.UseAlt = false
|
||||
case strings.HasPrefix(line, `"""`):
|
||||
line := strings.TrimPrefix(line, `"""`)
|
||||
line, ok := strings.CutSuffix(line, `"""`)
|
||||
sb.WriteString(line)
|
||||
if !ok {
|
||||
// no multiline terminating string; need more input
|
||||
fmt.Fprintln(&sb)
|
||||
multiline = MultilinePrompt
|
||||
scanner.Prompt.UseAlt = true
|
||||
break
|
||||
}
|
||||
case scanner.Pasting:
|
||||
fmt.Fprintln(&sb, line)
|
||||
continue
|
||||
case strings.HasPrefix(line, "/list"):
|
||||
args := strings.Fields(line)
|
||||
if err := ListHandler(cmd, args[1:]); err != nil {
|
||||
return err
|
||||
}
|
||||
case strings.HasPrefix(line, "/set"):
|
||||
args := strings.Fields(line)
|
||||
if len(args) > 1 {
|
||||
switch args[1] {
|
||||
case "history":
|
||||
scanner.HistoryEnable()
|
||||
case "nohistory":
|
||||
scanner.HistoryDisable()
|
||||
case "wordwrap":
|
||||
opts.WordWrap = true
|
||||
fmt.Println("Set 'wordwrap' mode.")
|
||||
case "nowordwrap":
|
||||
opts.WordWrap = false
|
||||
fmt.Println("Set 'nowordwrap' mode.")
|
||||
case "verbose":
|
||||
cmd.Flags().Set("verbose", "true")
|
||||
fmt.Println("Set 'verbose' mode.")
|
||||
case "quiet":
|
||||
cmd.Flags().Set("verbose", "false")
|
||||
fmt.Println("Set 'quiet' mode.")
|
||||
case "format":
|
||||
if len(args) < 3 || args[2] != "json" {
|
||||
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
|
||||
} else {
|
||||
opts.Format = args[2]
|
||||
fmt.Printf("Set format to '%s' mode.\n", args[2])
|
||||
}
|
||||
case "noformat":
|
||||
opts.Format = ""
|
||||
fmt.Println("Disabled format.")
|
||||
case "parameter":
|
||||
if len(args) < 4 {
|
||||
usageParameters()
|
||||
continue
|
||||
}
|
||||
var params []string
|
||||
for _, p := range args[3:] {
|
||||
params = append(params, p)
|
||||
}
|
||||
fp, err := api.FormatParams(map[string][]string{args[2]: params})
|
||||
if err != nil {
|
||||
fmt.Printf("Couldn't set parameter: %q\n\n", err)
|
||||
continue
|
||||
}
|
||||
fmt.Printf("Set parameter '%s' to '%s'\n\n", args[2], strings.Join(params, ", "))
|
||||
opts.Options[args[2]] = fp[args[2]]
|
||||
case "system", "template":
|
||||
if len(args) < 3 {
|
||||
usageSet()
|
||||
continue
|
||||
}
|
||||
|
||||
if args[1] == "system" {
|
||||
multiline = MultilineSystem
|
||||
} else if args[1] == "template" {
|
||||
multiline = MultilineTemplate
|
||||
}
|
||||
|
||||
line := strings.Join(args[2:], " ")
|
||||
line, ok := strings.CutPrefix(line, `"""`)
|
||||
if !ok {
|
||||
multiline = MultilineNone
|
||||
} else {
|
||||
// only cut suffix if the line is multiline
|
||||
line, ok = strings.CutSuffix(line, `"""`)
|
||||
if ok {
|
||||
multiline = MultilineNone
|
||||
}
|
||||
}
|
||||
|
||||
sb.WriteString(line)
|
||||
if multiline != MultilineNone {
|
||||
scanner.Prompt.UseAlt = true
|
||||
continue
|
||||
}
|
||||
|
||||
if args[1] == "system" {
|
||||
opts.System = sb.String()
|
||||
fmt.Println("Set system message.")
|
||||
} else if args[1] == "template" {
|
||||
opts.Template = sb.String()
|
||||
fmt.Println("Set prompt template.")
|
||||
}
|
||||
|
||||
sb.Reset()
|
||||
continue
|
||||
default:
|
||||
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
|
||||
}
|
||||
} else {
|
||||
usageSet()
|
||||
}
|
||||
case strings.HasPrefix(line, "/show"):
|
||||
args := strings.Fields(line)
|
||||
if len(args) > 1 {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
fmt.Println("error: couldn't connect to ollama server")
|
||||
return err
|
||||
}
|
||||
req := &api.ShowRequest{
|
||||
Name: opts.Model,
|
||||
System: opts.System,
|
||||
Template: opts.Template,
|
||||
Options: opts.Options,
|
||||
}
|
||||
resp, err := client.Show(cmd.Context(), req)
|
||||
if err != nil {
|
||||
fmt.Println("error: couldn't get model")
|
||||
return err
|
||||
}
|
||||
|
||||
switch args[1] {
|
||||
case "info":
|
||||
fmt.Println("Model details:")
|
||||
if len(resp.Details.Families) > 0 {
|
||||
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
|
||||
} else if resp.Details.Family != "" {
|
||||
fmt.Printf("Family %s\n", resp.Details.Family)
|
||||
}
|
||||
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
|
||||
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
|
||||
fmt.Println("")
|
||||
case "license":
|
||||
if resp.License == "" {
|
||||
fmt.Print("No license was specified for this model.\n\n")
|
||||
} else {
|
||||
fmt.Println(resp.License)
|
||||
}
|
||||
case "modelfile":
|
||||
fmt.Println(resp.Modelfile)
|
||||
case "parameters":
|
||||
if resp.Parameters == "" {
|
||||
fmt.Print("No parameters were specified for this model.\n\n")
|
||||
} else {
|
||||
if len(opts.Options) > 0 {
|
||||
fmt.Println("User defined parameters:")
|
||||
for k, v := range opts.Options {
|
||||
fmt.Printf("%-*s %v\n", 30, k, v)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
fmt.Println("Model defined parameters:")
|
||||
fmt.Println(resp.Parameters)
|
||||
}
|
||||
case "system":
|
||||
switch {
|
||||
case opts.System != "":
|
||||
fmt.Println(opts.System + "\n")
|
||||
case resp.System != "":
|
||||
fmt.Println(resp.System + "\n")
|
||||
default:
|
||||
fmt.Print("No system message was specified for this model.\n\n")
|
||||
}
|
||||
case "template":
|
||||
switch {
|
||||
case opts.Template != "":
|
||||
fmt.Println(opts.Template + "\n")
|
||||
case resp.Template != "":
|
||||
fmt.Println(resp.Template)
|
||||
default:
|
||||
fmt.Print("No prompt template was specified for this model.\n\n")
|
||||
}
|
||||
default:
|
||||
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
|
||||
}
|
||||
} else {
|
||||
usageShow()
|
||||
}
|
||||
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
|
||||
args := strings.Fields(line)
|
||||
if len(args) > 1 {
|
||||
switch args[1] {
|
||||
case "set", "/set":
|
||||
usageSet()
|
||||
case "show", "/show":
|
||||
usageShow()
|
||||
case "shortcut", "shortcuts":
|
||||
usageShortcuts()
|
||||
}
|
||||
} else {
|
||||
usage()
|
||||
}
|
||||
case line == "/exit", line == "/bye":
|
||||
return nil
|
||||
case strings.HasPrefix(line, "/"):
|
||||
args := strings.Fields(line)
|
||||
isFile := false
|
||||
|
||||
if multiModal {
|
||||
for _, f := range extractFileNames(line) {
|
||||
if strings.HasPrefix(f, args[0]) {
|
||||
isFile = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !isFile {
|
||||
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
|
||||
continue
|
||||
}
|
||||
|
||||
sb.WriteString(line)
|
||||
default:
|
||||
sb.WriteString(line)
|
||||
}
|
||||
|
||||
if sb.Len() > 0 && multiline == MultilineNone {
|
||||
opts.Prompt = sb.String()
|
||||
if multiModal {
|
||||
newPrompt, images, err := extractFileData(sb.String())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
opts.Prompt = newPrompt
|
||||
|
||||
// reset the context if we find another image
|
||||
if len(images) > 0 {
|
||||
opts.Images = images
|
||||
ctx := cmd.Context()
|
||||
ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
|
||||
cmd.SetContext(ctx)
|
||||
}
|
||||
if len(opts.Images) == 0 {
|
||||
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
|
||||
fmt.Println()
|
||||
sb.Reset()
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if err := generate(cmd, opts); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sb.Reset()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// filePathUnescaper maps each backslash-escaped character to its literal
// form. A strings.Replacer scans the input once, left to right, without
// overlapping matches, so the result is deterministic and an escaped
// backslash can never combine with the character after it to form a second
// escape sequence.
var filePathUnescaper = strings.NewReplacer(
	"\\ ", " ", // Escaped space
	"\\(", "(", // Escaped left parenthesis
	"\\)", ")", // Escaped right parenthesis
	"\\[", "[", // Escaped left square bracket
	"\\]", "]", // Escaped right square bracket
	"\\{", "{", // Escaped left curly brace
	"\\}", "}", // Escaped right curly brace
	"\\$", "$", // Escaped dollar sign
	"\\&", "&", // Escaped ampersand
	"\\;", ";", // Escaped semicolon
	"\\'", "'", // Escaped single quote
	"\\\\", "\\", // Escaped backslash
	"\\*", "*", // Escaped asterisk
	"\\?", "?", // Escaped question mark
)

// normalizeFilePath returns fp with the backslash escapes listed in
// filePathUnescaper replaced by the characters they stand for. All other
// text, including backslashes that do not start a listed escape, is left
// untouched.
func normalizeFilePath(fp string) string {
	return filePathUnescaper.Replace(fp)
}
|
||||
|
||||
// imagePathPattern matches candidate image file paths: an optional drive
// letter, then ./ or / or \, then a lazy run of non-whitespace characters,
// backslashes and spaces, ending in a jpg, jpeg, png or svg extension
// (case-insensitive) at a word boundary. It is compiled once at package
// initialization rather than on every call.
var imagePathPattern = regexp.MustCompile(`(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`)

// extractFileNames returns every substring of input that looks like an image
// file path, in order of appearance, or nil when there is none.
//
// This will capture non filename strings too; callers are expected to check
// for file existence to remove mismatches.
func extractFileNames(input string) []string {
	return imagePathPattern.FindAllString(input, -1)
}
|
||||
|
||||
func extractFileData(input string) (string, []ImageData, error) {
|
||||
filePaths := extractFileNames(input)
|
||||
var imgs []ImageData
|
||||
|
||||
for _, fp := range filePaths {
|
||||
nfp := normalizeFilePath(fp)
|
||||
data, err := getImageData(nfp)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("Couldn't process image: %q\n", err)
|
||||
return "", imgs, err
|
||||
}
|
||||
fmt.Printf("Added image '%s'\n", nfp)
|
||||
input = strings.ReplaceAll(input, fp, "")
|
||||
imgs = append(imgs, data)
|
||||
}
|
||||
return input, imgs, nil
|
||||
}
|
||||
|
||||
// getImageData reads the file at filePath and returns its complete contents,
// provided the file is no larger than 100MB and its leading bytes are sniffed
// as one of the allowed image content types.
//
// Errors from the os and io calls are returned unwrapped so that callers can
// still classify them with helpers such as os.IsNotExist.
func getImageData(filePath string) ([]byte, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	info, err := file.Stat()
	if err != nil {
		return nil, err
	}

	// Check if the file size exceeds 100MB
	const maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
	if info.Size() > maxSize {
		return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
	}

	// http.DetectContentType looks at no more than the first 512 bytes. Sniff
	// only the bytes the file really has, so that short files are not
	// classified with trailing zero padding.
	sniffLen := info.Size()
	if sniffLen > 512 {
		sniffLen = 512
	}
	header := make([]byte, sniffLen)
	if _, err := io.ReadFull(file, header); err != nil {
		return nil, err
	}

	contentType := http.DetectContentType(header)
	allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
	if !slices.Contains(allowedTypes, contentType) {
		return nil, fmt.Errorf("invalid image type: %s", contentType)
	}

	// Reuse the bytes that were already sniffed and read only the remainder,
	// instead of seeking back and reading the whole file a second time.
	buf := make([]byte, info.Size())
	copy(buf, header)
	if _, err := io.ReadFull(file, buf[len(header):]); err != nil {
		return nil, err
	}

	return buf, nil
}
|
51
cmd/interactive_test.go
Normal file
51
cmd/interactive_test.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestExtractFilenames(t *testing.T) {
|
||||
// Unix style paths
|
||||
input := ` some preamble
|
||||
./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2
|
||||
/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg`
|
||||
res := extractFileNames(input)
|
||||
assert.Len(t, res, 5)
|
||||
assert.Contains(t, res[0], "one.png")
|
||||
assert.Contains(t, res[1], "two.jpg")
|
||||
assert.Contains(t, res[2], "three.jpeg")
|
||||
assert.Contains(t, res[3], "four.png")
|
||||
assert.Contains(t, res[4], "five.svg")
|
||||
assert.NotContains(t, res[4], '"')
|
||||
assert.NotContains(t, res, "inbtween")
|
||||
|
||||
// Windows style paths
|
||||
input = ` some preamble
|
||||
c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2
|
||||
/absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
|
||||
./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6
|
||||
d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
|
||||
d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending
|
||||
`
|
||||
res = extractFileNames(input)
|
||||
assert.Len(t, res, 10)
|
||||
assert.NotContains(t, res, "inbtween")
|
||||
assert.Contains(t, res[0], "one.png")
|
||||
assert.Contains(t, res[0], "c:")
|
||||
assert.Contains(t, res[1], "two.jpg")
|
||||
assert.Contains(t, res[1], "c:")
|
||||
assert.Contains(t, res[2], "three.jpeg")
|
||||
assert.Contains(t, res[3], "four.png")
|
||||
assert.Contains(t, res[4], "five.svg")
|
||||
assert.Contains(t, res[5], "six.png")
|
||||
assert.Contains(t, res[6], "seven.svg")
|
||||
assert.Contains(t, res[6], "d:")
|
||||
assert.Contains(t, res[7], "eight.png")
|
||||
assert.Contains(t, res[7], "c:")
|
||||
assert.Contains(t, res[8], "nine.png")
|
||||
assert.Contains(t, res[8], "d:")
|
||||
assert.Contains(t, res[9], "ten.svg")
|
||||
assert.Contains(t, res[9], "E:")
|
||||
}
|
@@ -12,7 +12,7 @@ Import models using source model weights found on Hugging Face and similar sites
|
||||
|
||||
Installing on Linux in most cases is easy using the script on Ollama.ai. To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
|
||||
|
||||
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](./docker.md)**.
|
||||
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
|
||||
|
||||
It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.
|
||||
|
||||
|
@@ -409,7 +409,7 @@ A stream of JSON objects is returned:
|
||||
"model": "llama2",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"message": {
|
||||
"role": "assisant",
|
||||
"role": "assistant",
|
||||
"content": "The",
|
||||
"images": null
|
||||
},
|
||||
@@ -505,7 +505,7 @@ A stream of JSON objects is returned:
|
||||
"model": "llama2",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"message": {
|
||||
"role": "assisant",
|
||||
"role": "assistant",
|
||||
"content": "The"
|
||||
},
|
||||
"done": false
|
||||
|
@@ -38,37 +38,71 @@ Now you can run `ollama`:
|
||||
./ollama
|
||||
```
|
||||
|
||||
## Building on Linux with GPU support
|
||||
### Linux
|
||||
|
||||
#### Linux CUDA (NVIDIA)
|
||||
|
||||
### Linux/Windows CUDA (NVIDIA)
|
||||
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
|
||||
|
||||
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
|
||||
Then generate dependencies:
|
||||
|
||||
```
|
||||
go generate ./...
|
||||
```
|
||||
|
||||
Then build the binary:
|
||||
|
||||
```
|
||||
go build .
|
||||
```
|
||||
|
||||
### Linux ROCm (AMD)
|
||||
#### Linux ROCm (AMD)
|
||||
|
||||
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
|
||||
|
||||
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
|
||||
Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:
|
||||
|
||||
```
|
||||
CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
|
||||
```
|
||||
|
||||
Then build the binary:
|
||||
|
||||
```
|
||||
go build .
|
||||
```
|
||||
|
||||
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
|
||||
|
||||
## Containerized Build
|
||||
#### Containerized Linux Build
|
||||
|
||||
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.
|
||||
If you have Docker available, you can build Linux binaries with `./scripts/build_linux.sh`, which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`.
|
||||
|
||||
|
||||
### Windows
|
||||
|
||||
Note: The Windows build for Ollama is still under development.
|
||||
|
||||
Install required tools:
|
||||
|
||||
- MSVC toolchain - C/C++ and cmake as minimal requirements
|
||||
- go version 1.20 or higher
|
||||
- MinGW (pick one variant) with GCC.
|
||||
- <https://www.mingw-w64.org/>
|
||||
- <https://www.msys2.org/>
|
||||
|
||||
```powershell
|
||||
$env:CGO_ENABLED="1"
|
||||
|
||||
go generate ./...
|
||||
|
||||
go build .
|
||||
```
|
||||
|
||||
#### Windows CUDA (NVIDIA)
|
||||
|
||||
In addition to the common Windows development tools described above, install:
|
||||
|
||||
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
|
||||
|
23
docs/faq.md
23
docs/faq.md
@@ -112,26 +112,3 @@ This can impact both installing Ollama, as well as downloading models.
|
||||
Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernet (WSL)` adapter, right click and select `Properties`.
|
||||
Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these
|
||||
properties.
|
||||
|
||||
## What does the q in the model tag mean? What is quantization?
|
||||
|
||||
Whenever you pull a model without a tag, Ollama will actually pull the q4_0 quantization of the model. You can verify this on the tags page. On https://ollama.ai/library/llama2/tags you can see that the hash for the latest tag matches the hash for the 7b model. 
|
||||
|
||||
Looking at that page for any model, you can see several quantization options available. Quantization is a method of compression that allows the model to fit in less space and thus use less RAM and VRAM on your machine.
|
||||
|
||||
At a high level, a model is made of an enormous collection of nodes that determine how to generate text. These nodes are connected at different levels with weights. The training process adjusts these weights to be able to output the right text every time.
|
||||
|
||||
Most of the source models that we use start with weights that are 32bit floating-point numbers. Those weights, and another concept called biases, add up to be the parameters. So a source model with 7 billion parameters has 7 billion 32bit floating-point numbers, plus a description of all the nodes and more. That adds up to needing at least 28 Gigabytes of memory to load, if you choose to load one of those source models.
|
||||
|
||||
Quantization turns those 32bit floating point weights into much smaller integers. The number next to the q indicates the bit size of the weights. So a q4 model converted those 32bit floats into 4bit integers. A 4bit quantization takes up the space for 7 billion 4bit integers, plus a little overhead. That comes out to almost 4 Gigabytes. Obviously, there is some loss of information in this process of going from roughly 28GB to 4GB, but it turns out in most cases it isn't really noticeable. In fact, even the 2bit quantization which fits in less than 3GB can be very useful.
|
||||
|
||||
There are three major sets of quantizations you will see in the Ollama Library of models: **fp16**, models with just a q and a number, like **q4_0**, and then models with a **K** in the tag. The **fp16** model is one that has been converted and quantized from the source 32bit to 16bit. This will be about half the size of the 32bit source model and is the largest quantization we deliver in the library. The **q4_0**, **q4_1**, **q5_0**, etc. models use two different quantization methods that were the original methods.
|
||||
|
||||
The models with a **K** are often referred to as K Quants. This is a method that allows for models of a similar quality but smaller than the original method used. Essentially, it finds clusters of weights and quantizes those together, allowing for higher precision while using the same bit sizes as the regular quantization options. But this requires a set of maps for the model to figure out the original values which have a computational cost. You may see some impact on the speed of models with K quants compared to the regular quantizations.
|
||||
|
||||
## What is context, can I increase it, and why doesn't every model support a huge context?
|
||||
|
||||
Context refers to the size of the input you can send to a model and get sensible output back. Many models have a context size of 2048 tokens. It's sometimes possible to give it more using the **num_ctx** parameter, but the answers start to degrade. This is because half of the context is "freed" up to allow for more memory. Newer models have been able to increase that context size using different methods. This increase in context size results in a corresponding increase in memory required, sometimes by orders of magnitude.
|
||||
|
||||
> [!WARNING]
|
||||
> Currently, over-allocating context size may result in model quality or stability issues.
|
||||
|
@@ -156,11 +156,12 @@ PARAMETER <parameter> <parametervalue>
|
||||
|
||||
#### Template Variables
|
||||
|
||||
| Variable | Description |
|
||||
| --------------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
|
||||
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
|
||||
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
|
||||
| Variable | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
|
||||
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
|
||||
| `{{ .Response }}` | The response from the LLM, if not specified response is appended to the end of the template. |
|
||||
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
|
||||
|
||||
```modelfile
|
||||
TEMPLATE """
|
||||
|
52
gpu/gpu.go
52
gpu/gpu.go
@@ -16,8 +16,6 @@ import (
|
||||
"runtime"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
type handles struct {
|
||||
@@ -28,6 +26,9 @@ type handles struct {
|
||||
var gpuMutex sync.Mutex
|
||||
var gpuHandles *handles = nil
|
||||
|
||||
// With our current CUDA compile flags, 5.2 and older will not work properly
|
||||
const CudaComputeMajorMin = 6
|
||||
|
||||
// Note: gpuMutex must already be held
|
||||
func initGPUHandles() {
|
||||
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
||||
@@ -73,7 +74,18 @@ func GetGPUInfo() GpuInfo {
|
||||
log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
} else {
|
||||
resp.Library = "cuda"
|
||||
// Verify minimum compute capability
|
||||
var cc C.cuda_compute_capability_t
|
||||
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
|
||||
if cc.err != nil {
|
||||
log.Printf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err))
|
||||
C.free(unsafe.Pointer(cc.err))
|
||||
} else if cc.major >= CudaComputeMajorMin {
|
||||
log.Printf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)
|
||||
resp.Library = "cuda"
|
||||
} else {
|
||||
log.Printf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)
|
||||
}
|
||||
}
|
||||
} else if gpuHandles.rocm != nil {
|
||||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
||||
@@ -98,6 +110,8 @@ func GetGPUInfo() GpuInfo {
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
return resp
|
||||
}
|
||||
|
||||
resp.DeviceCount = uint32(memInfo.count)
|
||||
resp.FreeMemory = uint64(memInfo.free)
|
||||
resp.TotalMemory = uint64(memInfo.total)
|
||||
return resp
|
||||
@@ -119,31 +133,13 @@ func getCPUMem() (memInfo, error) {
|
||||
func CheckVRAM() (int64, error) {
|
||||
gpuInfo := GetGPUInfo()
|
||||
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
|
||||
return int64(gpuInfo.FreeMemory), nil
|
||||
// leave 10% or 384Mi of VRAM free for unaccounted for overhead
|
||||
overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
|
||||
if overhead < 384*1024*1024 {
|
||||
overhead = 384 * 1024 * 1024
|
||||
}
|
||||
return int64(gpuInfo.FreeMemory - overhead), nil
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
|
||||
}
|
||||
|
||||
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
|
||||
if opts.NumGPU != -1 {
|
||||
return opts.NumGPU
|
||||
}
|
||||
info := GetGPUInfo()
|
||||
if info.Library == "cpu" || info.Library == "default" {
|
||||
return 0
|
||||
}
|
||||
|
||||
/*
|
||||
Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
|
||||
We can store the model weights and the kv cache in vram,
|
||||
to enable kv cache vram storage add two additional layers to the number of layers retrieved from the model file.
|
||||
*/
|
||||
bytesPerLayer := uint64(fileSizeBytes / numLayer)
|
||||
|
||||
// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
|
||||
layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
|
||||
|
||||
log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Library, numLayer)
|
||||
|
||||
return layers
|
||||
}
|
||||
|
@@ -6,18 +6,31 @@ import "C"
|
||||
import (
|
||||
"runtime"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
"github.com/pbnjay/memory"
|
||||
)
|
||||
|
||||
// CheckVRAM returns an estimate, in bytes, of the memory available to the GPU on macOS (0 on amd64, where the GPU is not used)
|
||||
func CheckVRAM() (int64, error) {
|
||||
// TODO - assume metal, and return free memory?
|
||||
return 0, nil
|
||||
if runtime.GOARCH == "amd64" {
|
||||
// gpu not supported, this may not be metal
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// on macOS, there's already a buffer for available vram (see below) so just return the total
|
||||
systemMemory := int64(memory.TotalMemory())
|
||||
|
||||
// macOS limits how much memory is available to the GPU based on the amount of system memory
|
||||
// TODO: handle case where iogpu.wired_limit_mb is set to a higher value
|
||||
if systemMemory <= 36*1024*1024*1024 {
|
||||
systemMemory = systemMemory * 2 / 3
|
||||
} else {
|
||||
systemMemory = systemMemory * 3 / 4
|
||||
}
|
||||
|
||||
return systemMemory, nil
|
||||
}
|
||||
|
||||
func GetGPUInfo() GpuInfo {
|
||||
// TODO - Metal vs. x86 macs...
|
||||
mem, _ := getCPUMem()
|
||||
return GpuInfo{
|
||||
Library: "default",
|
||||
@@ -29,22 +42,10 @@ func getCPUMem() (memInfo, error) {
|
||||
return memInfo{
|
||||
TotalMemory: 0,
|
||||
FreeMemory: 0,
|
||||
DeviceCount: 0,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
|
||||
if opts.NumGPU != -1 {
|
||||
return opts.NumGPU
|
||||
}
|
||||
|
||||
// metal only supported on arm64
|
||||
if runtime.GOARCH == "arm64" {
|
||||
return 1
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func nativeInit() error {
|
||||
return nil
|
||||
}
|
||||
|
@@ -34,6 +34,7 @@ extern "C" {
|
||||
typedef struct mem_info {
|
||||
uint64_t total;
|
||||
uint64_t free;
|
||||
unsigned int count;
|
||||
char *err; // If non-NULL, caller responsible for freeing
|
||||
} mem_info_t;
|
||||
|
||||
|
@@ -8,6 +8,7 @@ void cpu_check_ram(mem_info_t *resp) {
|
||||
MEMORYSTATUSEX info;
|
||||
info.dwLength = sizeof(info);
|
||||
if (GlobalMemoryStatusEx(&info) != 0) {
|
||||
resp->count = 1;
|
||||
resp->total = info.ullTotalPhys;
|
||||
resp->free = info.ullAvailPhys;
|
||||
} else {
|
||||
@@ -26,6 +27,7 @@ void cpu_check_ram(mem_info_t *resp) {
|
||||
if (sysinfo(&info) != 0) {
|
||||
resp->err = strdup(strerror(errno));
|
||||
} else {
|
||||
resp->count = 1;
|
||||
resp->total = info.totalram * info.mem_unit;
|
||||
resp->free = info.freeram * info.mem_unit;
|
||||
}
|
||||
|
@@ -7,9 +7,17 @@
|
||||
#ifndef _WIN32
|
||||
const char *cuda_lib_paths[] = {
|
||||
"libnvidia-ml.so",
|
||||
"/usr/lib/wsl/lib/libnvidia-ml.so", // TODO Maybe glob?
|
||||
"/usr/lib/wsl/lib/libnvidia-ml.so.1",
|
||||
"/usr/local/cuda/lib64/libnvidia-ml.so",
|
||||
"/usr/lib/libnvidia-ml.so",
|
||||
"/usr/lib/libnvidia-ml.so.1",
|
||||
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
|
||||
"/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob?
|
||||
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so",
|
||||
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
|
||||
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so",
|
||||
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so",
|
||||
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
|
||||
NULL,
|
||||
};
|
||||
#else
|
||||
@@ -20,6 +28,8 @@ const char *cuda_lib_paths[] = {
|
||||
};
|
||||
#endif
|
||||
|
||||
#define CUDA_LOOKUP_SIZE 6
|
||||
|
||||
void cuda_init(cuda_init_resp_t *resp) {
|
||||
nvmlReturn_t ret;
|
||||
resp->err = NULL;
|
||||
@@ -30,11 +40,13 @@ void cuda_init(cuda_init_resp_t *resp) {
|
||||
struct lookup {
|
||||
char *s;
|
||||
void **p;
|
||||
} l[4] = {
|
||||
} l[CUDA_LOOKUP_SIZE] = {
|
||||
{"nvmlInit_v2", (void *)&resp->ch.initFn},
|
||||
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
|
||||
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
|
||||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
|
||||
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
|
||||
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
|
||||
};
|
||||
|
||||
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
|
||||
@@ -52,7 +64,7 @@ void cuda_init(cuda_init_resp_t *resp) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
|
||||
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
|
||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
||||
if (!l[i].p) {
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
@@ -89,22 +101,81 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO - handle multiple GPUs
|
||||
ret = (*h.getHandle)(0, &device);
|
||||
ret = (*h.getCount)(&resp->count);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device handle: %d", ret);
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.getMemInfo)(device, &memInfo);
|
||||
resp->total = 0;
|
||||
resp->free = 0;
|
||||
for (i = 0; i < resp->count; i++) {
|
||||
ret = (*h.getHandle)(i, &device);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.getMemInfo)(device, &memInfo);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
resp->total += memInfo.total;
|
||||
resp->free += memInfo.free;
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
|
||||
resp->err = NULL;
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
nvmlDevice_t device;
|
||||
int major = 0;
|
||||
int minor = 0;
|
||||
nvmlReturn_t ret;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("nvml handle not initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned int devices;
|
||||
ret = (*h.getCount)(&devices);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
resp->total = memInfo.total;
|
||||
resp->free = memInfo.free;
|
||||
return;
|
||||
|
||||
for (i = 0; i < devices; i++) {
|
||||
ret = (*h.getHandle)(i, &device);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.getComputeCapability)(device, &major, &minor);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
// Report the lowest major.minor we detect as that limits our compatibility
|
||||
if (resp->major == 0 || resp->major > major ) {
|
||||
resp->major = major;
|
||||
resp->minor = minor;
|
||||
} else if ( resp->major == major && resp->minor > minor ) {
|
||||
resp->minor = minor;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // __APPLE__
|
@@ -21,6 +21,8 @@ typedef struct cuda_handle {
|
||||
nvmlReturn_t (*shutdownFn)(void);
|
||||
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
|
||||
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
|
||||
nvmlReturn_t (*getCount)(unsigned int *);
|
||||
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
|
||||
} cuda_handle_t;
|
||||
|
||||
typedef struct cuda_init_resp {
|
||||
@@ -28,8 +30,15 @@ typedef struct cuda_init_resp {
|
||||
cuda_handle_t ch;
|
||||
} cuda_init_resp_t;
|
||||
|
||||
typedef struct cuda_compute_capability {
|
||||
char *err;
|
||||
int major;
|
||||
int minor;
|
||||
} cuda_compute_capability_t;
|
||||
|
||||
void cuda_init(cuda_init_resp_t *resp);
|
||||
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
|
||||
void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
|
||||
|
||||
#endif // __GPU_INFO_CUDA_H__
|
||||
#endif // __APPLE__
|
@@ -110,6 +110,8 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: set this to the actual number of devices
|
||||
resp->count = 1;
|
||||
resp->total = totalMem;
|
||||
resp->free = totalMem - usedMem;
|
||||
return;
|
||||
|
@@ -18,6 +18,7 @@ func TestBasicGetGPUInfo(t *testing.T) {
|
||||
case "linux", "windows":
|
||||
assert.Greater(t, info.TotalMemory, uint64(0))
|
||||
assert.Greater(t, info.FreeMemory, uint64(0))
|
||||
assert.Greater(t, info.DeviceCount, uint64(0))
|
||||
default:
|
||||
return
|
||||
}
|
||||
@@ -35,7 +36,6 @@ func TestCPUMemInfo(t *testing.T) {
|
||||
default:
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
|
||||
|
@@ -3,6 +3,7 @@ package gpu
|
||||
type memInfo struct {
|
||||
TotalMemory uint64 `json:"total_memory,omitempty"`
|
||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||
DeviceCount uint32 `json:"device_count,omitempty"`
|
||||
}
|
||||
|
||||
// Beginning of an `ollama info` command
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
set(TARGET ext_server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
add_library(${TARGET} STATIC ../../../ext_server.cpp)
|
||||
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp)
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_include_directories(${TARGET} PRIVATE ../..)
|
||||
target_include_directories(${TARGET} PRIVATE ../../..)
|
4
llm/ext_server/README.md
Normal file
4
llm/ext_server/README.md
Normal file
@@ -0,0 +1,4 @@
|
||||
# Extern C Server
|
||||
|
||||
This directory contains a thin facade we layer on top of the Llama.cpp server
|
||||
to expose `extern C` interfaces to access the functionality through direct API calls in-process.
|
@@ -12,6 +12,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
|
||||
#if SERVER_VERBOSE != 1
|
||||
log_disable();
|
||||
#endif
|
||||
LOG_TEE("system info: %s", llama_print_system_info());
|
||||
assert(err != NULL && sparams != NULL);
|
||||
err->id = 0;
|
||||
err->msg[0] = '\0';
|
@@ -1,26 +1,26 @@
|
||||
package llm
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
|
||||
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
|
||||
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
|
||||
#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
|
||||
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
|
||||
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
|
||||
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
|
||||
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
|
||||
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
|
||||
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
||||
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
|
||||
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libcommon.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libext_server.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libllama.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libggml_static.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
|
||||
#cgo linux CFLAGS: -D_GNU_SOURCE
|
||||
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
|
||||
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libext_server.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libcommon.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libllama.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libggml_static.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
|
||||
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
|
||||
#cgo linux windows LDFLAGS: -lpthread
|
||||
|
||||
@@ -35,14 +35,12 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
"github.com/jmorganca/ollama/gpu"
|
||||
)
|
||||
|
||||
type extServer interface {
|
||||
@@ -82,25 +80,20 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
|
||||
return fmt.Errorf(C.GoString(resp.msg))
|
||||
}
|
||||
|
||||
func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||
func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
if !mutex.TryLock() {
|
||||
log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
|
||||
mutex.Lock()
|
||||
}
|
||||
fileInfo, err := os.Stat(model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var sparams C.ext_server_params_t
|
||||
sparams.model = C.CString(model)
|
||||
defer C.free(unsafe.Pointer(sparams.model))
|
||||
|
||||
numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)
|
||||
|
||||
sparams.embedding = true
|
||||
sparams.n_ctx = C.uint(opts.NumCtx)
|
||||
sparams.n_batch = C.uint(opts.NumBatch)
|
||||
sparams.n_gpu_layers = C.int(numGPU)
|
||||
sparams.n_gpu_layers = C.int(opts.NumGPU)
|
||||
sparams.main_gpu = C.int(opts.MainGPU)
|
||||
sparams.n_parallel = 1 // TODO - wire up concurrency
|
||||
|
||||
|
@@ -54,9 +54,9 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
||||
C.llama_server_release_json_resp(json_resp)
|
||||
}
|
||||
|
||||
func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
server := &llamaExtServer{opts}
|
||||
return newExtServer(server, model, adapters, projectors, numLayers, opts)
|
||||
return newExtServer(server, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
|
||||
|
@@ -4,9 +4,9 @@ import (
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
// On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
|
||||
// This ensures we can update the PATH at runtime to get everything loaded
|
||||
|
||||
return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, numLayers, opts)
|
||||
return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, opts)
|
||||
}
|
||||
|
@@ -1,8 +1,7 @@
|
||||
# common logic across linux and darwin
|
||||
|
||||
init_vars() {
|
||||
LLAMACPP_DIR=gguf
|
||||
PATCHES="0001-Expose-callable-API-for-server.patch"
|
||||
LLAMACPP_DIR=../llama.cpp
|
||||
CMAKE_DEFS=""
|
||||
CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
|
||||
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
|
||||
@@ -18,19 +17,24 @@ git_module_setup() {
|
||||
echo "Skipping submodule initialization"
|
||||
return
|
||||
fi
|
||||
# Make sure the tree is clean after the directory moves
|
||||
if [ -d "${LLAMACPP_DIR}/gguf" ]; then
|
||||
echo "Cleaning up old submodule"
|
||||
rm -rf ${LLAMACPP_DIR}
|
||||
fi
|
||||
git submodule init
|
||||
git submodule update --force gguf
|
||||
git submodule update --force ${LLAMACPP_DIR}
|
||||
|
||||
}
|
||||
|
||||
apply_patches() {
|
||||
# Wire up our CMakefile
|
||||
if ! grep ollama gguf/examples/server/CMakeLists.txt; then
|
||||
echo 'include (../../../CMakeLists.txt) # ollama' >>gguf/examples/server/CMakeLists.txt
|
||||
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
|
||||
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
|
||||
fi
|
||||
# Avoid duplicate main symbols when we link into the cgo binary
|
||||
sed -e 's/int main(/int __main(/g' <./gguf/examples/server/server.cpp >./gguf/examples/server/server.cpp.tmp &&
|
||||
mv ./gguf/examples/server/server.cpp.tmp ./gguf/examples/server/server.cpp
|
||||
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
|
||||
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
|
||||
}
|
||||
|
||||
build() {
|
||||
@@ -49,5 +53,5 @@ install() {
|
||||
|
||||
# Keep the local tree clean after we're done with the build
|
||||
cleanup() {
|
||||
(cd gguf/examples/server/ && git checkout CMakeLists.txt server.cpp)
|
||||
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
|
||||
}
|
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
# This script is intended to run inside the go generate
|
||||
# working directory must be ../llm/llama.cpp
|
||||
# working directory must be ./llm/generate/
|
||||
|
||||
# TODO - add hardening to detect missing tools (cmake, etc.)
|
||||
|
||||
@@ -9,14 +9,14 @@ set -o pipefail
|
||||
echo "Starting darwin generate script"
|
||||
source $(dirname $0)/gen_common.sh
|
||||
init_vars
|
||||
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="gguf/build/darwin/metal"
|
||||
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
|
||||
case "${GOARCH}" in
|
||||
"amd64")
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
;;
|
||||
"arm64")
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL=on ${CMAKE_DEFS}"
|
||||
;;
|
||||
*)
|
||||
echo "GOARCH must be set"
|
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
# This script is intended to run inside the go generate
|
||||
# working directory must be llm/llama.cpp
|
||||
# working directory must be llm/generate/
|
||||
|
||||
# First we build our default built-in library which will be linked into the CGO
|
||||
# binary as a normal dependency. This default build is CPU based.
|
||||
@@ -52,7 +52,7 @@ apply_patches
|
||||
# CPU first for the default library
|
||||
#
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
||||
BUILD_DIR="gguf/build/linux/cpu"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
|
||||
|
||||
build
|
||||
install
|
||||
@@ -64,7 +64,7 @@ if [ -d /usr/local/cuda/lib64/ ]; then
|
||||
echo "CUDA libraries detected - building dynamic CUDA library"
|
||||
init_vars
|
||||
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
||||
BUILD_DIR="gguf/build/linux/cuda"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda"
|
||||
CUDA_LIB_DIR=/usr/local/cuda/lib64
|
||||
build
|
||||
install
|
||||
@@ -98,7 +98,7 @@ if [ -d "${ROCM_PATH}" ]; then
|
||||
echo "ROCm libraries detected - building dynamic ROCm library"
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
||||
BUILD_DIR="gguf/build/linux/rocm"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm"
|
||||
build
|
||||
install
|
||||
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
|
@@ -3,7 +3,7 @@
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
function init_vars {
|
||||
$script:patches = @("0001-Expose-callable-API-for-server.patch")
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
|
||||
$script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
|
||||
if ($env:CGO_CFLAGS -contains "-g") {
|
||||
@@ -19,25 +19,25 @@ function git_module_setup {
|
||||
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
|
||||
& git submodule init
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
& git submodule update --force gguf
|
||||
& git submodule update --force "${script:llamacppDir}"
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
|
||||
function apply_patches {
|
||||
# Wire up our CMakefile
|
||||
if (!(Select-String -Path "gguf/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
|
||||
Add-Content -Path "gguf/examples/server/CMakeLists.txt" -Value 'include (../../../CMakeLists.txt) # ollama'
|
||||
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
|
||||
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
|
||||
}
|
||||
# Avoid duplicate main symbols when we link into the cgo binary
|
||||
$content = Get-Content -Path "./gguf/examples/server/server.cpp"
|
||||
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
|
||||
$content = $content -replace 'int main\(', 'int __main('
|
||||
Set-Content -Path "./gguf/examples/server/server.cpp" -Value $content
|
||||
Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
|
||||
}
|
||||
|
||||
function build {
|
||||
write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs"
|
||||
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
|
||||
& cmake --version
|
||||
& cmake -S gguf -B $script:buildDir $script:cmakeDefs
|
||||
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
|
||||
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
|
||||
@@ -55,7 +55,7 @@ function install {
|
||||
}
|
||||
|
||||
function cleanup {
|
||||
Set-Location "gguf/examples/server"
|
||||
Set-Location "${script:llamacppDir}/examples/server"
|
||||
git checkout CMakeLists.txt server.cpp
|
||||
}
|
||||
|
||||
@@ -64,20 +64,20 @@ git_module_setup
|
||||
apply_patches
|
||||
|
||||
# first build CPU based
|
||||
$script:buildDir="gguf/build/windows/cpu"
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cpu"
|
||||
|
||||
build
|
||||
install
|
||||
|
||||
# Then build cuda as a dynamically loaded library
|
||||
init_vars
|
||||
$script:buildDir="gguf/build/windows/cuda"
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cuda"
|
||||
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
|
||||
build
|
||||
install
|
||||
|
||||
# TODO - actually implement ROCm support on windows
|
||||
$script:buildDir="gguf/build/windows/rocm"
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/rocm"
|
||||
|
||||
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
|
||||
md "${script:buildDir}/lib" -ea 0 > $null
|
@@ -1,3 +1,3 @@
|
||||
package llm
|
||||
package generate
|
||||
|
||||
//go:generate sh ./gen_darwin.sh
|
@@ -1,3 +1,3 @@
|
||||
package llm
|
||||
package generate
|
||||
|
||||
//go:generate bash ./gen_linux.sh
|
@@ -1,3 +1,3 @@
|
||||
package llm
|
||||
package generate
|
||||
|
||||
//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
|
@@ -78,7 +78,11 @@ type model interface {
|
||||
ModelFamily() string
|
||||
ModelType() string
|
||||
FileType() string
|
||||
NumLayers() int64
|
||||
NumLayers() uint32
|
||||
NumGQA() uint32
|
||||
NumEmbed() uint32
|
||||
NumHead() uint32
|
||||
NumHeadKv() uint32
|
||||
}
|
||||
|
||||
type container interface {
|
||||
|
41
llm/gguf.go
41
llm/gguf.go
@@ -272,14 +272,49 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumLayers() int64 {
|
||||
func (llm *ggufModel) NumLayers() uint32 {
|
||||
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
|
||||
if !exists {
|
||||
return 0
|
||||
}
|
||||
|
||||
v := value.(uint32)
|
||||
return int64(v)
|
||||
return value.(uint32)
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumHead() uint32 {
|
||||
value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
|
||||
if !exists {
|
||||
return 0
|
||||
}
|
||||
|
||||
return value.(uint32)
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumEmbed() uint32 {
|
||||
value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
|
||||
if !exists {
|
||||
return 0
|
||||
}
|
||||
|
||||
return value.(uint32)
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumHeadKv() uint32 {
|
||||
value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
|
||||
if !exists {
|
||||
return 0
|
||||
}
|
||||
|
||||
return value.(uint32)
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumGQA() uint32 {
|
||||
numHeadKv := llm.NumHeadKv()
|
||||
if numHeadKv == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
return llm.NumHead() / numHeadKv
|
||||
}
|
||||
|
||||
func (llm ggufModel) readU8(r io.Reader) uint8 {
|
||||
|
61
llm/llama.go
61
llm/llama.go
@@ -8,7 +8,6 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
@@ -43,69 +42,11 @@ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
|
||||
ws ::= ([ \t\n] ws)?
|
||||
`
|
||||
|
||||
type llamaModel struct {
|
||||
hyperparameters llamaHyperparameters
|
||||
}
|
||||
|
||||
func (llm *llamaModel) ModelFamily() string {
|
||||
return "llama"
|
||||
}
|
||||
|
||||
func llamaModelType(numLayer uint32) string {
|
||||
switch numLayer {
|
||||
case 26:
|
||||
return "3B"
|
||||
case 32:
|
||||
return "7B"
|
||||
case 40:
|
||||
return "13B"
|
||||
case 48:
|
||||
return "34B"
|
||||
case 60:
|
||||
return "30B"
|
||||
case 80:
|
||||
return "65B"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
func (llm *llamaModel) ModelType() string {
|
||||
return llamaModelType(llm.hyperparameters.NumLayer)
|
||||
}
|
||||
|
||||
func (llm *llamaModel) FileType() string {
|
||||
return fileType(llm.hyperparameters.FileType)
|
||||
}
|
||||
|
||||
func (llm *llamaModel) NumLayers() int64 {
|
||||
return int64(llm.hyperparameters.NumLayer)
|
||||
}
|
||||
|
||||
type llamaHyperparameters struct {
|
||||
// NumVocab is the size of the model's vocabulary.
|
||||
NumVocab uint32
|
||||
|
||||
// NumEmbd is the size of the model's embedding layer.
|
||||
NumEmbd uint32
|
||||
NumMult uint32
|
||||
NumHead uint32
|
||||
|
||||
// NumLayer is the number of layers in the model.
|
||||
NumLayer uint32
|
||||
NumRot uint32
|
||||
|
||||
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
|
||||
FileType uint32
|
||||
}
|
||||
|
||||
type Running struct {
|
||||
Port int
|
||||
Cmd *exec.Cmd
|
||||
Cancel context.CancelFunc
|
||||
exitOnce sync.Once
|
||||
exitCh chan error // channel to receive the exit status of the subprocess
|
||||
*StatusWriter // captures error messages from the llama runner process
|
||||
*StatusWriter // captures error messages from the llama runner process
|
||||
}
|
||||
|
||||
type ImageData struct {
|
||||
|
115
llm/llm.go
115
llm/llm.go
@@ -7,10 +7,7 @@ import (
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
"github.com/pbnjay/memory"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
"github.com/jmorganca/ollama/format"
|
||||
"github.com/jmorganca/ollama/gpu"
|
||||
)
|
||||
|
||||
@@ -40,40 +37,95 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if runtime.GOOS == "darwin" {
|
||||
var requiredMemory int64
|
||||
var f16Multiplier int64 = 2
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
switch ggml.ModelType() {
|
||||
case "3B", "7B":
|
||||
requiredMemory = 8 * format.GigaByte
|
||||
case "13B":
|
||||
requiredMemory = 16 * format.GigaByte
|
||||
case "30B", "34B", "40B":
|
||||
requiredMemory = 32 * format.GigaByte
|
||||
case "47B":
|
||||
requiredMemory = 48 * format.GigaByte
|
||||
case "65B", "70B":
|
||||
requiredMemory = 64 * format.GigaByte
|
||||
case "180B":
|
||||
requiredMemory = 128 * format.GigaByte
|
||||
f16Multiplier = 4
|
||||
}
|
||||
fmt.Println("size", ggml.Size)
|
||||
fmt.Println("filetype", ggml.FileType())
|
||||
fmt.Println("architecture", ggml.ModelFamily())
|
||||
fmt.Println("type", ggml.ModelType())
|
||||
fmt.Println("name", ggml.Name())
|
||||
fmt.Println("embd", ggml.NumEmbed())
|
||||
fmt.Println("head", ggml.NumHead())
|
||||
fmt.Println("head_kv", ggml.NumHeadKv())
|
||||
fmt.Println("gqa", ggml.NumGQA())
|
||||
|
||||
systemMemory := int64(memory.TotalMemory())
|
||||
available, _ := gpu.CheckVRAM()
|
||||
|
||||
if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
|
||||
return nil, fmt.Errorf("F16 model requires at least %s of memory", format.HumanBytes(requiredMemory))
|
||||
} else if requiredMemory > systemMemory {
|
||||
return nil, fmt.Errorf("model requires at least %s of memory", format.HumanBytes(requiredMemory))
|
||||
// For now assume filesize = model size
|
||||
// TODO: use actual model size
|
||||
requiredModel := ggml.Size
|
||||
|
||||
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
|
||||
requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
|
||||
|
||||
// this amount is the overhead + tensors in memory
|
||||
// TODO: get this from llama.cpp's graph calculations instead of
|
||||
// estimating it's 1/6 * kv_cache_size * num_gqa
|
||||
requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
|
||||
|
||||
requiredTotal := requiredModel + requiredKv + requiredAlloc
|
||||
|
||||
log.Println("system memory bytes:", available)
|
||||
log.Println("required model bytes:", requiredModel)
|
||||
log.Println("required kv bytes:", requiredKv)
|
||||
log.Println("required alloc bytes:", requiredAlloc)
|
||||
log.Println("required total bytes:", requiredTotal)
|
||||
|
||||
info := gpu.GetGPUInfo()
|
||||
library := info.Library
|
||||
|
||||
if opts.NumGPU == -1 {
|
||||
// default to offloading all layers
|
||||
opts.NumGPU = int(ggml.NumLayers()) + 1
|
||||
}
|
||||
|
||||
// decide how many layers to put on the GPU
|
||||
if opts.NumGPU > 0 {
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
if requiredTotal > available {
|
||||
log.Println("not enough vram available, falling back to CPU only")
|
||||
opts.NumGPU = 0
|
||||
}
|
||||
default:
|
||||
if library == "cpu" || library == "default" {
|
||||
opts.NumGPU = 0
|
||||
break
|
||||
}
|
||||
|
||||
// no offloading required
|
||||
if requiredTotal <= available {
|
||||
break
|
||||
}
|
||||
|
||||
// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
|
||||
if requiredAlloc > available {
|
||||
log.Printf("not enough vram available, falling back to CPU only")
|
||||
library = "cpu"
|
||||
opts.NumGPU = 0
|
||||
break
|
||||
}
|
||||
|
||||
available -= requiredAlloc
|
||||
|
||||
// fill remaining vram with layers
|
||||
log.Println("splitting", available, "of available memory bytes into layers")
|
||||
bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
|
||||
log.Println("bytes per layer:", bytesPerLayer)
|
||||
layers := available / bytesPerLayer
|
||||
log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
|
||||
if layers < int64(opts.NumGPU) {
|
||||
opts.NumGPU = int(layers)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
opts.NumGQA = 0
|
||||
opts.RopeFrequencyBase = 0.0
|
||||
opts.RopeFrequencyScale = 0.0
|
||||
gpuInfo := gpu.GetGPUInfo()
|
||||
return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)
|
||||
return newLlmServer(library, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
// Give any native cgo implementations an opportunity to initialize
|
||||
@@ -81,9 +133,9 @@ func Init(workdir string) error {
|
||||
return nativeInit(workdir)
|
||||
}
|
||||
|
||||
func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||
func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
|
||||
srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts)
|
||||
srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts)
|
||||
if err == nil {
|
||||
return srv, nil
|
||||
}
|
||||
@@ -91,6 +143,5 @@ func newLlmServer(library, model string, adapters, projectors []string, numLayer
|
||||
// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
|
||||
}
|
||||
|
||||
return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
|
||||
|
||||
return newDefaultExtServer(model, adapters, projectors, opts)
|
||||
}
|
||||
|
@@ -13,16 +13,16 @@ import (
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/gguf/ggml-metal.metal
|
||||
//go:embed llama.cpp/ggml-metal.metal
|
||||
var libEmbed embed.FS
|
||||
|
||||
func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||
func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
// should never happen...
|
||||
return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
|
||||
}
|
||||
|
||||
func nativeInit(workdir string) error {
|
||||
err := extractPayloadFiles(workdir, "llama.cpp/gguf/ggml-metal.metal")
|
||||
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
// TODO perhaps consider this a hard failure on arm macs?
|
||||
|
@@ -34,6 +34,8 @@ type shimExtServer struct {
|
||||
var shimMutex sync.Mutex
|
||||
var llm *shimExtServer
|
||||
|
||||
// pathComponentCount is the number of "/"-separated components expected in an
// embedded library path of the form llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY.
const pathComponentCount = 6
|
||||
|
||||
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_init(llm.s, sparams, err)
|
||||
}
|
||||
@@ -70,7 +72,7 @@ func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
||||
C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
|
||||
}
|
||||
|
||||
func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||
func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
shimMutex.Lock()
|
||||
defer shimMutex.Unlock()
|
||||
updatePath(filepath.Dir(library))
|
||||
@@ -88,7 +90,7 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin
|
||||
options: opts,
|
||||
}
|
||||
log.Printf("Loading Dynamic Shim llm server: %s", library)
|
||||
return newExtServer(llm, model, adapters, projectors, numLayers, opts)
|
||||
return newExtServer(llm, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
|
||||
@@ -112,7 +114,7 @@ func (llm *shimExtServer) Close() {
|
||||
}
|
||||
|
||||
func nativeInit(workdir string) error {
|
||||
libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/*/*/lib/*")
|
||||
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
log.Printf("%s", payloadMissing)
|
||||
@@ -151,13 +153,13 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
|
||||
|
||||
for _, file := range files {
|
||||
pathComps := strings.Split(file, "/")
|
||||
if len(pathComps) != 7 {
|
||||
if len(pathComps) != pathComponentCount {
|
||||
log.Printf("unexpected payload components: %v", pathComps)
|
||||
continue
|
||||
}
|
||||
// llama.cpp/gguf/build/$OS/$VARIANT/lib/$LIBRARY
|
||||
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
|
||||
// Include the variant in the path to avoid conflicts between multiple server libs
|
||||
targetDir := filepath.Join(workDir, pathComps[4])
|
||||
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read payload %s: %v", file, err)
|
||||
|
@@ -10,7 +10,7 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/gguf/build/*/*/lib/*.so
|
||||
//go:embed llama.cpp/build/*/*/lib/*.so
|
||||
var libEmbed embed.FS
|
||||
|
||||
func updatePath(dir string) {
|
||||
|
@@ -8,7 +8,7 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/gguf/build/windows/*/lib/*.dll
|
||||
//go:embed llama.cpp/build/windows/*/lib/*.dll
|
||||
var libEmbed embed.FS
|
||||
|
||||
func updatePath(dir string) {
|
||||
|
63
parser/parser_test.go
Normal file
63
parser/parser_test.go
Normal file
@@ -0,0 +1,63 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// Test_Parser feeds Parse a modelfile containing FROM, ADAPTER, LICENSE,
// PARAMETER and TEMPLATE lines and checks the commands it returns.
func Test_Parser(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM model1
|
||||
ADAPTER adapter1
|
||||
LICENSE MIT
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
TEMPLATE template1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
commands, err := Parse(reader)
|
||||
assert.Nil(t, err)
|
||||
|
||||
// FROM is expected to surface as the "model" command, and each PARAMETER as
// a command named after the parameter itself.
expectedCommands := []Command{
|
||||
{Name: "model", Args: "model1"},
|
||||
{Name: "adapter", Args: "adapter1"},
|
||||
{Name: "license", Args: "MIT"},
|
||||
{Name: "param1", Args: "value1"},
|
||||
{Name: "param2", Args: "value2"},
|
||||
{Name: "template", Args: "template1"},
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedCommands, commands)
|
||||
}
|
||||
|
||||
// Test_Parser_NoFromLine checks that Parse returns an error containing
// "no FROM line" when the input has only PARAMETER lines.
func Test_Parser_NoFromLine(t *testing.T) {
|
||||
|
||||
input := `
|
||||
PARAMETER param1 value1
|
||||
PARAMETER param2 value2
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
_, err := Parse(reader)
|
||||
assert.ErrorContains(t, err, "no FROM line")
|
||||
}
|
||||
|
||||
// Test_Parser_MissingValue checks that Parse returns an error containing
// "missing value for [param1]" when a PARAMETER line names a parameter but
// gives no value.
func Test_Parser_MissingValue(t *testing.T) {
|
||||
|
||||
input := `
|
||||
FROM foo
|
||||
PARAMETER param1
|
||||
`
|
||||
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
_, err := Parse(reader)
|
||||
assert.ErrorContains(t, err, "missing value for [param1]")
|
||||
|
||||
}
|
@@ -25,10 +25,7 @@ func NewBuffer(prompt *Prompt) (*Buffer, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
lwidth := width - len(prompt.Prompt)
|
||||
if prompt.UseAlt {
|
||||
lwidth = width - len(prompt.AltPrompt)
|
||||
}
|
||||
lwidth := width - len(prompt.prompt())
|
||||
|
||||
b := &Buffer{
|
||||
Pos: 0,
|
||||
@@ -78,7 +75,7 @@ func (b *Buffer) MoveRight() {
|
||||
if b.Pos < b.Size() {
|
||||
b.Pos += 1
|
||||
if b.Pos%b.LineWidth == 0 {
|
||||
fmt.Printf(CursorDown + CursorBOL + cursorRightN(b.PromptSize()))
|
||||
fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())))
|
||||
} else {
|
||||
fmt.Print(CursorRight)
|
||||
}
|
||||
@@ -109,7 +106,7 @@ func (b *Buffer) MoveToStart() {
|
||||
fmt.Print(CursorUp)
|
||||
}
|
||||
}
|
||||
fmt.Printf(CursorBOL + cursorRightN(b.PromptSize()))
|
||||
fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())))
|
||||
b.Pos = 0
|
||||
}
|
||||
}
|
||||
@@ -123,7 +120,7 @@ func (b *Buffer) MoveToEnd() {
|
||||
fmt.Print(CursorDown)
|
||||
}
|
||||
remainder := b.Size() % b.LineWidth
|
||||
fmt.Printf(CursorBOL + cursorRightN(b.PromptSize()+remainder))
|
||||
fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())+remainder))
|
||||
} else {
|
||||
fmt.Print(cursorRightN(b.Size() - b.Pos))
|
||||
}
|
||||
@@ -143,13 +140,6 @@ func min(n, m int) int {
|
||||
return n
|
||||
}
|
||||
|
||||
func (b *Buffer) PromptSize() int {
|
||||
if b.Prompt.UseAlt {
|
||||
return len(b.Prompt.AltPrompt)
|
||||
}
|
||||
return len(b.Prompt.Prompt)
|
||||
}
|
||||
|
||||
func (b *Buffer) Add(r rune) {
|
||||
if b.Pos == b.Buf.Size() {
|
||||
fmt.Printf("%c", r)
|
||||
@@ -232,7 +222,7 @@ func (b *Buffer) Remove() {
|
||||
remainingLines := (b.Size() - b.Pos) / b.LineWidth
|
||||
fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL)
|
||||
place := b.Pos % b.LineWidth
|
||||
fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.Prompt)))
|
||||
fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.prompt())))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -247,7 +237,7 @@ func (b *Buffer) Delete() {
|
||||
remainingLines := (b.Size() - b.Pos) / b.LineWidth
|
||||
fmt.Printf(cursorDownN(remainingLines) + CursorBOL + ClearToEOL)
|
||||
place := b.Pos % b.LineWidth
|
||||
fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.Prompt)))
|
||||
fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.prompt())))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -294,15 +284,15 @@ func (b *Buffer) DeleteWord() {
|
||||
}
|
||||
|
||||
func (b *Buffer) ClearScreen() {
|
||||
fmt.Printf(ClearScreen + CursorReset + b.Prompt.Prompt)
|
||||
fmt.Printf(ClearScreen + CursorReset + b.Prompt.prompt())
|
||||
if b.IsEmpty() {
|
||||
ph := b.Prompt.Placeholder
|
||||
ph := b.Prompt.placeholder()
|
||||
fmt.Printf(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault)
|
||||
} else {
|
||||
currPos := b.Pos
|
||||
b.Pos = 0
|
||||
b.drawRemaining()
|
||||
fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.Prompt)))
|
||||
fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.prompt())))
|
||||
if currPos > 0 {
|
||||
targetLine := currPos / b.LineWidth
|
||||
if targetLine > 0 {
|
||||
@@ -329,7 +319,7 @@ func (b *Buffer) IsEmpty() bool {
|
||||
func (b *Buffer) Replace(r []rune) {
|
||||
b.Pos = 0
|
||||
b.Buf.Clear()
|
||||
fmt.Printf(ClearLine + CursorBOL + b.Prompt.Prompt)
|
||||
fmt.Printf(ClearLine + CursorBOL + b.Prompt.prompt())
|
||||
for _, c := range r {
|
||||
b.Add(c)
|
||||
}
|
||||
|
@@ -16,6 +16,20 @@ type Prompt struct {
|
||||
UseAlt bool
|
||||
}
|
||||
|
||||
// prompt returns AltPrompt when UseAlt is set, and Prompt otherwise.
func (p *Prompt) prompt() string {
|
||||
if p.UseAlt {
|
||||
return p.AltPrompt
|
||||
}
|
||||
return p.Prompt
|
||||
}
|
||||
|
||||
// placeholder returns AltPlaceholder when UseAlt is set, and Placeholder
// otherwise.
func (p *Prompt) placeholder() string {
|
||||
if p.UseAlt {
|
||||
return p.AltPlaceholder
|
||||
}
|
||||
return p.Placeholder
|
||||
}
|
||||
|
||||
type Terminal struct {
|
||||
outchan chan rune
|
||||
}
|
||||
@@ -46,8 +60,9 @@ func New(prompt Prompt) (*Instance, error) {
|
||||
}
|
||||
|
||||
func (i *Instance) Readline() (string, error) {
|
||||
prompt := i.Prompt.Prompt
|
||||
if i.Prompt.UseAlt || i.Pasting {
|
||||
prompt := i.Prompt.prompt()
|
||||
if i.Pasting {
|
||||
// force alt prompt when pasting
|
||||
prompt = i.Prompt.AltPrompt
|
||||
}
|
||||
fmt.Print(prompt)
|
||||
@@ -71,10 +86,7 @@ func (i *Instance) Readline() (string, error) {
|
||||
// don't show placeholder when pasting unless we're in multiline mode
|
||||
showPlaceholder := !i.Pasting || i.Prompt.UseAlt
|
||||
if buf.IsEmpty() && showPlaceholder {
|
||||
ph := i.Prompt.Placeholder
|
||||
if i.Prompt.UseAlt {
|
||||
ph = i.Prompt.AltPlaceholder
|
||||
}
|
||||
ph := i.Prompt.placeholder()
|
||||
fmt.Printf(ColorGrey + ph + fmt.Sprintf(CursorLeftN, len(ph)) + ColorDefault)
|
||||
}
|
||||
|
||||
|
@@ -8,9 +8,9 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
|
||||
mkdir -p dist
|
||||
|
||||
for TARGETARCH in arm64 amd64; do
|
||||
rm -rf llm/llama.cpp/build
|
||||
GOOS=darwin GOARCH=$TARGETARCH go generate ./...
|
||||
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
|
||||
rm -rf llm/llama.cpp/*/build
|
||||
done
|
||||
|
||||
lipo -create -output dist/ollama dist/ollama-darwin-*
|
||||
|
@@ -8,7 +8,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
|
||||
mkdir -p dist
|
||||
|
||||
for TARGETARCH in amd64 arm64; do
|
||||
docker build --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
|
||||
docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
|
||||
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
|
||||
docker rm builder-$TARGETARCH
|
||||
|
@@ -478,32 +478,6 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
|
||||
return err
|
||||
}
|
||||
|
||||
// if the model is not in gguf format, pull the base model to try and get it in gguf format
|
||||
if fromConfig.ModelFormat != "gguf" {
|
||||
fn(api.ProgressResponse{Status: "updating base model"})
|
||||
parent, err := GetModel(c.Args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
originalModel := parent.OriginalModel
|
||||
if originalModel == "" {
|
||||
originalModel = parent.ShortName
|
||||
}
|
||||
if err := PullModel(ctx, originalModel, &RegistryOptions{}, fn); err != nil {
|
||||
log.Printf("error pulling parent model: %v", err)
|
||||
}
|
||||
|
||||
// Reset the file pointer to the beginning of the file
|
||||
_, err = fromConfigFile.Seek(0, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update from config after pull: %w", err)
|
||||
}
|
||||
if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// if the model is still not in gguf format, error out
|
||||
if fromConfig.ModelFormat != "gguf" {
|
||||
return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args)
|
||||
|
@@ -610,12 +610,18 @@ func ShowModelHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
if req.Name == "" {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
|
||||
switch {
|
||||
case req.Model == "" && req.Name == "":
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
|
||||
return
|
||||
case req.Model != "" && req.Name != "":
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "both model and name are set"})
|
||||
return
|
||||
case req.Model == "" && req.Name != "":
|
||||
req.Model = req.Name
|
||||
}
|
||||
|
||||
resp, err := GetModelInfo(req.Name)
|
||||
resp, err := GetModelInfo(req)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
|
||||
@@ -628,8 +634,8 @@ func ShowModelHandler(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
func GetModelInfo(name string) (*api.ShowResponse, error) {
|
||||
model, err := GetModel(name)
|
||||
func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
model, err := GetModel(req.Model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -642,6 +648,14 @@ func GetModelInfo(name string) (*api.ShowResponse, error) {
|
||||
QuantizationLevel: model.Config.FileType,
|
||||
}
|
||||
|
||||
if req.System != "" {
|
||||
model.System = req.System
|
||||
}
|
||||
|
||||
if req.Template != "" {
|
||||
model.Template = req.Template
|
||||
}
|
||||
|
||||
resp := &api.ShowResponse{
|
||||
License: strings.Join(model.License, "\n"),
|
||||
System: model.System,
|
||||
@@ -649,13 +663,6 @@ func GetModelInfo(name string) (*api.ShowResponse, error) {
|
||||
Details: modelDetails,
|
||||
}
|
||||
|
||||
mf, err := ShowModelfile(model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp.Modelfile = mf
|
||||
|
||||
var params []string
|
||||
cs := 30
|
||||
for k, v := range model.Options {
|
||||
@@ -685,6 +692,19 @@ func GetModelInfo(name string) (*api.ShowResponse, error) {
|
||||
}
|
||||
resp.Parameters = strings.Join(params, "\n")
|
||||
|
||||
for k, v := range req.Options {
|
||||
if _, ok := req.Options[k]; ok {
|
||||
model.Options[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
mf, err := ShowModelfile(model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp.Modelfile = mf
|
||||
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
@@ -837,6 +857,7 @@ func (s *Server) GenerateRoutes() http.Handler {
|
||||
|
||||
config := cors.DefaultConfig()
|
||||
config.AllowWildcard = true
|
||||
config.AllowBrowserExtensions = true
|
||||
|
||||
config.AllowOrigins = origins
|
||||
for _, allowOrigin := range defaultAllowOrigins {
|
||||
|
Reference in New Issue
Block a user