update submodule to cd4fddb29f81d6a1f6d51a0c016bc6b486d68def

Fix clearing kv cache between requests with the same prompt (#2186)
* Fix clearing kv cache between requests with the same prompt * fix powershell script
2024-01-25 13:54:11 -08:00 · 2024-01-25 13:46:20 -08:00 · 2024-01-25 12:12:36 -08:00 · 2024-01-25 11:55:15 -08:00 · 2024-01-25 09:22:42 -08:00 · 2024-01-24 21:36:56 -08:00
147 changed files with 9145 additions and 3288 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -2,8 +2,7 @@
 ollama
 app
 dist
-scripts
-llm/llama.cpp/ggml
-llm/llama.cpp/gguf
+llm/llama.cpp
 .env
 .cache
+test_data
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -0,0 +1,114 @@
+name: test
+
+on:
+  pull_request:
+
+jobs:
+  generate:  # runs `go generate` for each OS/arch pair and uploads the resulting libraries as an artifact
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [amd64, arm64]
+        exclude:  # leaves arm64 enabled for macos-latest only
+          - os: ubuntu-latest
+            arch: arm64
+          - os: windows-latest
+            arch: arm64
+    runs-on: ${{ matrix.os }}
+    env:
+      GOARCH: ${{ matrix.arch }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+          cache: true
+      - if: ${{ startsWith(matrix.os, 'windows-') }}  # Windows only: export the vsdevcmd.bat environment to GITHUB_ENV and add Git's usr\bin to GITHUB_PATH
+        shell: pwsh
+        run: |
+          $path = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
+          if ($path) {
+              $path = join-path $path 'Common7\Tools\vsdevcmd.bat'
+              if (test-path $path) {
+                  cmd /s /c """$path"" $args && set" | where { $_ -match '(\w+)=(.*)' } | foreach {
+                      echo "$($Matches[1])=$($Matches[2])" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+                  }
+              }
+          }
+
+          echo "C:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
+      - run: go get ./...
+      - run: go generate -x ./...
+      - uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries  # same name pattern the test job passes to download-artifact
+          path: |
+            llm/llama.cpp/build/**/lib/*
+  lint:  # runs golangci-lint; no generate step here, placeholder library files are created instead
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [amd64, arm64]
+        exclude:  # remaining pairs: ubuntu/amd64, windows/amd64, macos/arm64
+          - os: ubuntu-latest
+            arch: arm64
+          - os: windows-latest
+            arch: arm64
+          - os: macos-latest
+            arch: amd64
+    runs-on: ${{ matrix.os }}
+    env:
+      GOARCH: ${{ matrix.arch }}
+      CGO_ENABLED: "1"
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+          cache: false
+      - run: |
+          mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
+          touch llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/stub.so
+        if: ${{ startsWith(matrix.os, 'ubuntu-') }}  # Linux: empty stub.so (presumably so the Go package compiles for linting - TODO confirm)
+      - run: |
+          mkdir -p llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/
+          touch llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/stub.dylib
+          touch llm/llama.cpp/ggml-metal.metal
+        if: ${{ startsWith(matrix.os, 'macos-') }}  # macOS: empty stub.dylib plus an empty ggml-metal.metal
+      - run: |
+          mkdir -p llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/
+          touch llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/stub.dll
+        if: ${{ startsWith(matrix.os, 'windows-') }}  # Windows: empty stub.dll
+      - uses: golangci/golangci-lint-action@v3
+  test:  # downloads the libraries uploaded by the generate job, then runs `go build` and `go test`
+    needs: generate
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [amd64]
+        exclude:  # NOTE(review): these entries match nothing because arch only contains amd64
+          - os: ubuntu-latest
+            arch: arm64
+          - os: windows-latest
+            arch: arm64
+    runs-on: ${{ matrix.os }}
+    env:
+      GOARCH: ${{ matrix.arch }}
+      CGO_ENABLED: "1"
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+          cache: true
+      - run: go get
+      - uses: actions/download-artifact@v4
+        with:
+          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries  # same name pattern the generate job uploads
+          path: llm/llama.cpp/build
+      - run: go build
+      - run: go test -v ./...
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ ollama
 ggml-metal.metal
 .cache
 *.exe
-.idea
+.idea
+test_data
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,10 +1,4 @@
-[submodule "llm/llama.cpp/ggml"]
-    path = llm/llama.cpp/ggml
-    url = https://github.com/ggerganov/llama.cpp.git
-    ignore = dirty
-    shallow = true
-[submodule "llm/llama.cpp/gguf"]
-    path = llm/llama.cpp/gguf
-    url = https://github.com/ggerganov/llama.cpp.git
-    ignore = dirty
-    shallow = true
+[submodule "llama.cpp"]
+	path = llm/llama.cpp
+	url = https://github.com/ggerganov/llama.cpp.git
+	shallow = true
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -0,0 +1,27 @@
+run:
+  timeout: 5m
+linters:
+  enable:
+    - asasalint
+    - bidichk
+    - bodyclose
+    - containedctx
+    - contextcheck
+    - exportloopref
+    - gocheckcompilerdirectives
+    # FIXME: for some reason this errors on windows
+    # - gofmt
+    # - goimports
+    - misspell
+    - nilerr
+    - unused
+linters-settings:
+  errcheck:
+    # exclude the following functions since we don't generally
+    # need to be concerned with the returned errors
+    exclude-functions:
+      - encoding/binary.Read
+      - (*os.File).Seek
+      - (*bufio.Writer).WriteString
+      - (*github.com/spf13/pflag.FlagSet).Set
+      - (*github.com/jmorganca/ollama/llm.readSeekOffset).Seek
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,23 +1,126 @@
-FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
+ARG GOLANG_VERSION=1.21.3
+ARG CMAKE_VERSION=3.22.1
+ARG CUDA_VERSION=11.3.1

-ARG TARGETARCH
-ARG GOFLAGS="'-ldflags=-w -s'"
+# Copy the minimal context we need to run the generate scripts
+FROM scratch AS llm-code
+COPY .git .git
+COPY .gitmodules .gitmodules
+COPY llm llm

+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
+ARG CGO_CFLAGS
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
+ARG CGO_CFLAGS
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+
+FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV LIBRARY_PATH /opt/amdgpu/lib64
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
+ARG CGO_CFLAGS
+ARG AMDGPU_TARGETS
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+
+FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV LIBRARY_PATH /opt/amdgpu/lib64
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
+ARG CGO_CFLAGS
+ARG AMDGPU_TARGETS
+RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+
+FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
+ARG CMAKE_VERSION
+ARG GOLANG_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
+ARG OLLAMA_CUSTOM_CPU_DEFS
+ARG CGO_CFLAGS
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
+
+FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
+RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
+RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
+FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
+RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
+
+FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
+ARG CMAKE_VERSION
+ARG GOLANG_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
+WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
+# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
+ARG OLLAMA_CUSTOM_CPU_DEFS
+ARG CGO_CFLAGS
+RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+
+# Intermediate stage used for ./scripts/build_linux.sh
+FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
+ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/jmorganca/ollama
-RUN apt-get update && apt-get install -y git build-essential cmake
-ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
-RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
-
 COPY . .
-ENV GOARCH=$TARGETARCH
-ENV GOFLAGS=$GOFLAGS
-RUN /usr/local/go/bin/go generate ./... \
-    && /usr/local/go/bin/go build .
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN go build .

-FROM ubuntu:22.04
+# Intermediate stage used for ./scripts/build_linux.sh
+FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
+ENV CGO_ENABLED 1
+ARG GOLANG_VERSION
+WORKDIR /go/src/github.com/jmorganca/ollama
+COPY . .
+COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN go build .
+
+# Runtime stages: one per architecture; the final stage selects runtime-$TARGETARCH
+FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS runtime-amd64
+COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
+FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
+COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
+
+FROM runtime-$TARGETARCH
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
+ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/rocm/lib:
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -1,31 +0,0 @@
-# centos7 amd64 dependencies
-FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
-RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
-    yum update -y && \
-    yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
-RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-
-# centos8 arm64 dependencies
-FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
-RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
-RUN yum install -y git cmake
-
-FROM base-${TARGETARCH}
-ARG TARGETARCH
-ARG GOFLAGS="'-ldflags -w -s'"
-
-# install go
-ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
-RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
-
-# build the final binary
-WORKDIR /go/src/github.com/jmorganca/ollama
-COPY . .
-
-ENV GOOS=linux
-ENV GOARCH=$TARGETARCH
-ENV GOFLAGS=$GOFLAGS
-
-RUN /usr/local/go/bin/go generate ./... && \
-    /usr/local/go/bin/go build .
--- a/README.md
+++ b/README.md
@@ -1,8 +1,5 @@
 <div align="center">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" height="200px" srcset="https://github.com/jmorganca/ollama/assets/3325447/56ea1849-1284-4645-8970-956de6e51c3c">
-    <img alt="logo" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
-  </picture>
+  <img alt="ollama" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
 </div>

 # Ollama
@@ -17,7 +14,7 @@ Get up and running with large language models locally.

 ### Windows

-Coming soon!
+Coming soon! For now, you can install Ollama on Windows via WSL2.

 ### Linux & WSL2

@@ -31,6 +28,11 @@ curl https://ollama.ai/install.sh | sh

 The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub.

+### Libraries
+
+- [ollama-python](https://github.com/ollama/ollama-python)
+- [ollama-js](https://github.com/ollama/ollama-js)
+
 ## Quickstart

 To run and chat with [Llama 2](https://ollama.ai/library/llama2):
@@ -47,20 +49,23 @@ Here are some example open-source models that can be downloaded:

 | Model              | Parameters | Size  | Download                       |
 | ------------------ | ---------- | ----- | ------------------------------ |
+| Llama 2            | 7B         | 3.8GB | `ollama run llama2`            |
+| Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
+| Dolphin Phi        | 2.7B       | 1.6GB | `ollama run dolphin-phi`       |
+| Phi-2              | 2.7B       | 1.7GB | `ollama run phi`               |
 | Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
 | Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
-| Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
-| Llama 2            | 7B         | 3.8GB | `ollama run llama2`            |
 | Code Llama         | 7B         | 3.8GB | `ollama run codellama`         |
 | Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored` |
 | Llama 2 13B        | 13B        | 7.3GB | `ollama run llama2:13b`        |
 | Llama 2 70B        | 70B        | 39GB  | `ollama run llama2:70b`        |
 | Orca Mini          | 3B         | 1.9GB | `ollama run orca-mini`         |
 | Vicuna             | 7B         | 3.8GB | `ollama run vicuna`            |
+| LLaVA              | 7B         | 4.5GB | `ollama run llava`             |

-> Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.
+> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.

-## Customize your own model
+## Customize a model

 ### Import from GGUF

@@ -104,7 +109,7 @@ FROM llama2
 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1

-# set the system prompt
+# set the system message
 SYSTEM """
 You are Mario from Super Mario Bros. Answer as Mario, the assistant, only.
 """
@@ -127,6 +132,10 @@ For more examples, see the [examples](examples) directory. For more information

 `ollama create` is used to create a model from a Modelfile.

+```
+ollama create mymodel -f ./Modelfile
+```
+
 ### Pull a model

 ```
@@ -158,6 +167,13 @@ For multiline input, you can wrap text with `"""`:
 I'm a basic program that prints the famous "Hello, world!" message to the console.
 ```

+### Multimodal models
+
+```
+>>> What's in this image? /Users/jmorgan/Desktop/smile.png
+The image features a yellow smiley face, which is likely the central focus of the picture.
+```
+
 ### Pass in prompt as arguments

 ```
@@ -183,13 +199,19 @@ Install `cmake` and `go`:
 brew install cmake go
 ```

-Then generate dependencies and build:
-
+Then generate dependencies:
 ```
 go generate ./...
+```
+Then build the binary:
+```
 go build .
 ```

+More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md).
+
+
+### Running local builds
 Next, start the server:

 ```
@@ -205,7 +227,8 @@ Finally, in a separate shell, run a model:
 ## REST API

 Ollama has a REST API for running and managing models.
-For example, to generate text from a model:
+
+### Generate a response

 ```
 curl http://localhost:11434/api/generate -d '{
@@ -214,16 +237,23 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

+### Chat with a model
+
+```
+curl http://localhost:11434/api/chat -d '{
+  "model": "mistral",
+  "messages": [
+    { "role": "user", "content": "why is the sky blue?" }
+  ]
+}'
+```
+
 See the [API documentation](./docs/api.md) for all endpoints.

 ## Community Integrations

-### Mobile
-
- [Mobile Artificial Intelligence Distribution](https://github.com/MaidFoundation/Maid) (Maid)
-
 ### Web & Desktop
-
+- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
@@ -233,6 +263,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [big-AGI](https://github.com/enricoros/big-agi/blob/main/docs/config-ollama.md)
 - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
 - [Amica](https://github.com/semperai/amica)
+- [chatd](https://github.com/BruceMacD/chatd)
+- [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
+

 ### Terminal

@@ -244,6 +277,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ogpt.nvim](https://github.com/huynle/ogpt.nvim)
 - [gptel Emacs client](https://github.com/karthink/gptel)
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)
+- [cmdh](https://github.com/pgibler/cmdh)
+
+### Database
+
+- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md)

 ### Package managers

@@ -256,16 +294,22 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
+- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
 - [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
 - [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
 - [Ollama for Dart](https://github.com/breitburg/dart-ollama)
 - [Ollama for Laravel](https://github.com/cloudstudio/ollama-laravel)
+- [LangChainDart](https://github.com/davidmigloz/langchain_dart)
+- [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
+- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
+

 ### Mobile

- [Maid](https://github.com/danemadsen/Maid) (Mobile Artificial Intelligence Distribution)
+- [Enchanted](https://github.com/AugustDev/enchanted)
+- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)

 ### Extensions & Plugins

@@ -276,7 +320,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
 - [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
 - [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
+- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
 - [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
 - [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
+- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
--- a/api/client.go
+++ b/api/client.go
@@ -221,6 +221,19 @@ func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn Generate
 	})
 }

+type ChatResponseFunc func(ChatResponse) error // callback type accepted by Client.Chat
+
+func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc) error { // sends req as a POST to /api/chat through c.stream
+	return c.stream(ctx, http.MethodPost, "/api/chat", req, func(bts []byte) error {
+		var resp ChatResponse
+		if err := json.Unmarshal(bts, &resp); err != nil {
+			return err // a payload that is not a valid ChatResponse is reported back to c.stream
+		}
+
+		return fn(resp) // fn's error becomes the result of this callback
+	})
+}
+
 type PullProgressFunc func(ProgressResponse) error

 func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
@@ -296,6 +309,13 @@ func (c *Client) Heartbeat(ctx context.Context) error {
 	}
 	return nil
 }
+func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) { // sends req as a POST to /api/embeddings
+	var resp EmbeddingResponse // passed to c.do by pointer as the response destination
+	if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
+		return nil, err // no partial response is returned on failure
+	}
+	return &resp, nil
+}

 func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
 	if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {
@@ -311,3 +331,15 @@ func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) err

 	return nil
 }
+
+func (c *Client) Version(ctx context.Context) (string, error) { // issues a GET to /api/version and returns the "version" value
+	var version struct {
+		Version string `json:"version"` // the only field read from the reply
+	}
+
+	if err := c.do(ctx, http.MethodGet, "/api/version", nil, &version); err != nil { // nil: no request body is sent
+		return "", err
+	}
+
+	return version.Version, nil
+}
--- a/api/client.py
+++ b/api/client.py
@@ -1,284 +0,0 @@
-import os
-import json
-import requests
-import os
-import hashlib
-import json
-from pathlib import Path
-
-BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
-
-# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
-# The final response object will include statistics and additional data from the request. Use the callback function to override
-# the default handler.
-def generate(model_name, prompt, system=None, template=None, format="", context=None, options=None, callback=None):
-    try:
-        url = f"{BASE_URL}/api/generate"
-        payload = {
-            "model": model_name, 
-            "prompt": prompt, 
-            "system": system, 
-            "template": template, 
-            "context": context, 
-            "options": options,
-            "format": format,
-        }
-        
-        # Remove keys with None values
-        payload = {k: v for k, v in payload.items() if v is not None}
-        
-        with requests.post(url, json=payload, stream=True) as response:
-            response.raise_for_status()
-            
-            # Creating a variable to hold the context history of the final chunk
-            final_context = None
-            
-            # Variable to hold concatenated response strings if no callback is provided
-            full_response = ""
-
-            # Iterating over the response line by line and displaying the details
-            for line in response.iter_lines():
-                if line:
-                    # Parsing each line (JSON chunk) and extracting the details
-                    chunk = json.loads(line)
-                    
-                    # If a callback function is provided, call it with the chunk
-                    if callback:
-                        callback(chunk)
-                    else:
-                        # If this is not the last chunk, add the "response" field value to full_response and print it
-                        if not chunk.get("done"):
-                            response_piece = chunk.get("response", "")
-                            full_response += response_piece
-                            print(response_piece, end="", flush=True)
-                    
-                    # Check if it's the last chunk (done is true)
-                    if chunk.get("done"):
-                        final_context = chunk.get("context")
-            
-            # Return the full response and the final context
-            return full_response, final_context
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None, None
-    
-
-# Create a blob file on the server if it doesn't exist.
-def create_blob(digest, file_path):
-    url = f"{BASE_URL}/api/blobs/{digest}"
-
-    # Check if the blob exists
-    response = requests.head(url)
-    if response.status_code != 404:
-        return  # Blob already exists, no need to upload
-    response.raise_for_status()
-
-    # Upload the blob
-    with open(file_path, 'rb') as file_data:
-        requests.post(url, data=file_data)
-
-
-# Create a model from a Modelfile. Use the callback function to override the default handler.
-def create(model_name, filename, callback=None):
-    try:
-        file_path = Path(filename).expanduser().resolve()
-        processed_lines = []
-
-        # Read and process the modelfile
-        with open(file_path, 'r') as f:
-            for line in f:            
-                # Skip empty or whitespace-only lines
-                if not line.strip():
-                    continue
-            
-                command, args = line.split(maxsplit=1)
-
-                if command.upper() in ["FROM", "ADAPTER"]:
-                    path = Path(args.strip()).expanduser()
-
-                    # Check if path is relative and resolve it
-                    if not path.is_absolute():
-                        path = (file_path.parent / path)
-
-                    # Skip if file does not exist for "model", this is handled by the server
-                    if not path.exists():
-                        processed_lines.append(line)
-                        continue
-
-                    # Calculate SHA-256 hash
-                    with open(path, 'rb') as bin_file:
-                        hash = hashlib.sha256()
-                        hash.update(bin_file.read())
-                        blob = f"sha256:{hash.hexdigest()}"
-                
-                    # Add the file to the remote server
-                    create_blob(blob, path)
-
-                    # Replace path with digest in the line
-                    line = f"{command} @{blob}\n"
-
-                processed_lines.append(line)
-
-        # Combine processed lines back into a single string
-        modelfile_content = '\n'.join(processed_lines)
-
-        url = f"{BASE_URL}/api/create"
-        payload = {"name": model_name, "modelfile": modelfile_content}
-
-        # Making a POST request with the stream parameter set to True to handle streaming responses
-        with requests.post(url, json=payload, stream=True) as response:
-            response.raise_for_status()
-            # Iterating over the response line by line and displaying the status
-            for line in response.iter_lines():
-                if line:
-                    chunk = json.loads(line)
-                    if callback:
-                        callback(chunk)
-                    else:
-                        print(f"Status: {chunk.get('status')}")
-
-    except Exception as e:
-        print(f"An error occurred: {e}")
-
-
-# Pull a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple
-# calls to will share the same download progress. Use the callback function to override the default handler.
-def pull(model_name, insecure=False, callback=None):
-    try:
-        url = f"{BASE_URL}/api/pull"
-        payload = {
-            "name": model_name,
-            "insecure": insecure
-        }
-
-        # Making a POST request with the stream parameter set to True to handle streaming responses
-        with requests.post(url, json=payload, stream=True) as response:
-            response.raise_for_status()
-
-            # Iterating over the response line by line and displaying the details
-            for line in response.iter_lines():
-                if line:
-                    # Parsing each line (JSON chunk) and extracting the details
-                    chunk = json.loads(line)
-
-                    # If a callback function is provided, call it with the chunk
-                    if callback:
-                        callback(chunk)
-                    else:
-                        # Print the status message directly to the console
-                        print(chunk.get('status', ''), end='', flush=True)
-                    
-                    # If there's layer data, you might also want to print that (adjust as necessary)
-                    if 'digest' in chunk:
-                        print(f" - Digest: {chunk['digest']}", end='', flush=True)
-                        print(f" - Total: {chunk['total']}", end='', flush=True)
-                        print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
-                    else:
-                        print()
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-
-# Push a model to the model registry. Use the callback function to override the default handler.
-def push(model_name, insecure=False, callback=None):
-    try:
-        url = f"{BASE_URL}/api/push"
-        payload = {
-            "name": model_name,
-            "insecure": insecure
-        }
-
-        # Making a POST request with the stream parameter set to True to handle streaming responses
-        with requests.post(url, json=payload, stream=True) as response:
-            response.raise_for_status()
-
-            # Iterating over the response line by line and displaying the details
-            for line in response.iter_lines():
-                if line:
-                    # Parsing each line (JSON chunk) and extracting the details
-                    chunk = json.loads(line)
-
-                    # If a callback function is provided, call it with the chunk
-                    if callback:
-                        callback(chunk)
-                    else:
-                        # Print the status message directly to the console
-                        print(chunk.get('status', ''), end='', flush=True)
-                    
-                    # If there's layer data, you might also want to print that (adjust as necessary)
-                    if 'digest' in chunk:
-                        print(f" - Digest: {chunk['digest']}", end='', flush=True)
-                        print(f" - Total: {chunk['total']}", end='', flush=True)
-                        print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
-                    else:
-                        print()
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-
-# List models that are available locally.
-def list():
-    try:
-        response = requests.get(f"{BASE_URL}/api/tags")
-        response.raise_for_status()
-        data = response.json()
-        models = data.get('models', [])
-        return models
-
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
-
-# Copy a model. Creates a model with another name from an existing model.
-def copy(source, destination):
-    try:
-        # Create the JSON payload
-        payload = {
-            "source": source,
-            "destination": destination
-        }
-        
-        response = requests.post(f"{BASE_URL}/api/copy", json=payload)
-        response.raise_for_status()
-        
-        # If the request was successful, return a message indicating that the copy was successful
-        return "Copy successful"
-
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
-
-# Delete a model and its data.
-def delete(model_name):
-    try:
-        url = f"{BASE_URL}/api/delete"
-        payload = {"name": model_name}
-        response = requests.delete(url, json=payload)
-        response.raise_for_status()
-        return "Delete successful"
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
-
-# Show info about a model.
-def show(model_name):
-    try:
-        url = f"{BASE_URL}/api/show"
-        payload = {"name": model_name}
-        response = requests.post(url, json=payload)
-        response.raise_for_status()
-        
-        # Parse the JSON response and return it
-        data = response.json()
-        return data
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
-
-def heartbeat():
-    try:
-        url = f"{BASE_URL}/"
-        response = requests.head(url)
-        response.raise_for_status()
-        return "Ollama is running"
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return "Ollama is not running"
--- a/api/types.go
+++ b/api/types.go
@@ -6,6 +6,7 @@ import (
 	"math"
 	"os"
 	"reflect"
+	"strconv"
 	"strings"
 	"time"
 )
@@ -30,19 +31,56 @@ func (e StatusError) Error() string {
 	}
 }

+type ImageData []byte
+
 type GenerateRequest struct {
-	Model    string `json:"model"`
-	Prompt   string `json:"prompt"`
-	System   string `json:"system"`
-	Template string `json:"template"`
-	Context  []int  `json:"context,omitempty"`
-	Stream   *bool  `json:"stream,omitempty"`
-	Raw      bool   `json:"raw,omitempty"`
-	Format   string `json:"format"`
+	Model    string      `json:"model"`
+	Prompt   string      `json:"prompt"`
+	System   string      `json:"system"`
+	Template string      `json:"template"`
+	Context  []int       `json:"context,omitempty"`
+	Stream   *bool       `json:"stream,omitempty"`
+	Raw      bool        `json:"raw,omitempty"`
+	Format   string      `json:"format"`
+	Images   []ImageData `json:"images,omitempty"`

 	Options map[string]interface{} `json:"options"`
 }

+type ChatRequest struct {
+	Model    string    `json:"model"`
+	Messages []Message `json:"messages"`
+	Stream   *bool     `json:"stream,omitempty"`
+	Format   string    `json:"format"`
+
+	Options map[string]interface{} `json:"options"`
+}
+
+type Message struct {
+	Role    string      `json:"role"` // one of ["system", "user", "assistant"]
+	Content string      `json:"content"`
+	Images  []ImageData `json:"images,omitempty"`
+}
+
+type ChatResponse struct {
+	Model     string    `json:"model"`
+	CreatedAt time.Time `json:"created_at"`
+	Message   Message   `json:"message"`
+
+	Done bool `json:"done"`
+
+	Metrics
+}
+
+type Metrics struct {
+	TotalDuration      time.Duration `json:"total_duration,omitempty"`
+	LoadDuration       time.Duration `json:"load_duration,omitempty"`
+	PromptEvalCount    int           `json:"prompt_eval_count,omitempty"`
+	PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
+	EvalCount          int           `json:"eval_count,omitempty"`
+	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
+}
+
 // Options specfied in GenerateRequest, if you add a new option here add it to the API docs also
 type Options struct {
 	Runner
@@ -99,26 +137,41 @@ type EmbeddingResponse struct {
 }

 type CreateRequest struct {
-	Name      string `json:"name"`
+	Model     string `json:"model"`
 	Path      string `json:"path"`
 	Modelfile string `json:"modelfile"`
 	Stream    *bool  `json:"stream,omitempty"`
+
+	// Name is deprecated, see Model
+	Name string `json:"name"`
 }

 type DeleteRequest struct {
+	Model string `json:"model"`
+
+	// Name is deprecated, see Model
 	Name string `json:"name"`
 }

 type ShowRequest struct {
+	Model    string `json:"model"`
+	System   string `json:"system"`
+	Template string `json:"template"`
+
+	Options map[string]interface{} `json:"options"`
+
+	// Name is deprecated, see Model
 	Name string `json:"name"`
 }

 type ShowResponse struct {
-	License    string `json:"license,omitempty"`
-	Modelfile  string `json:"modelfile,omitempty"`
-	Parameters string `json:"parameters,omitempty"`
-	Template   string `json:"template,omitempty"`
-	System     string `json:"system,omitempty"`
+	License    string       `json:"license,omitempty"`
+	Modelfile  string       `json:"modelfile,omitempty"`
+	Parameters string       `json:"parameters,omitempty"`
+	Template   string       `json:"template,omitempty"`
+	System     string       `json:"system,omitempty"`
+	Details    ModelDetails `json:"details,omitempty"`
+	Messages   []Message    `json:"messages,omitempty"`
 }

 type CopyRequest struct {
@@ -127,11 +180,14 @@ type CopyRequest struct {
 }

 type PullRequest struct {
-	Name     string `json:"name"`
+	Model    string `json:"model"`
 	Insecure bool   `json:"insecure,omitempty"`
 	Username string `json:"username"`
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`
+
+	// Name is deprecated, see Model
+	Name string `json:"name"`
 }

 type ProgressResponse struct {
@@ -142,11 +198,14 @@ type ProgressResponse struct {
 }

 type PushRequest struct {
-	Name     string `json:"name"`
+	Model    string `json:"model"`
 	Insecure bool   `json:"insecure,omitempty"`
 	Username string `json:"username"`
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`
+
+	// Name is deprecated, see Model
+	Name string `json:"name"`
 }

 type ListResponse struct {
@@ -154,10 +213,12 @@ type ListResponse struct {
 }

 type ModelResponse struct {
-	Name       string    `json:"name"`
-	ModifiedAt time.Time `json:"modified_at"`
-	Size       int64     `json:"size"`
-	Digest     string    `json:"digest"`
+	Name       string       `json:"name"`
+	Model      string       `json:"model"`
+	ModifiedAt time.Time    `json:"modified_at"`
+	Size       int64        `json:"size"`
+	Digest     string       `json:"digest"`
+	Details    ModelDetails `json:"details,omitempty"`
 }

 type TokenResponse struct {
@@ -172,39 +233,43 @@ type GenerateResponse struct {
 	Done    bool  `json:"done"`
 	Context []int `json:"context,omitempty"`

-	TotalDuration      time.Duration `json:"total_duration,omitempty"`
-	LoadDuration       time.Duration `json:"load_duration,omitempty"`
-	PromptEvalCount    int           `json:"prompt_eval_count,omitempty"`
-	PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
-	EvalCount          int           `json:"eval_count,omitempty"`
-	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
+	Metrics
 }

-func (r *GenerateResponse) Summary() {
-	if r.TotalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "total duration:       %v\n", r.TotalDuration)
+type ModelDetails struct {
+	ParentModel       string   `json:"parent_model"`
+	Format            string   `json:"format"`
+	Family            string   `json:"family"`
+	Families          []string `json:"families"`
+	ParameterSize     string   `json:"parameter_size"`
+	QuantizationLevel string   `json:"quantization_level"`
+}
+
+func (m *Metrics) Summary() {
+	if m.TotalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "total duration:       %v\n", m.TotalDuration)
 	}

-	if r.LoadDuration > 0 {
-		fmt.Fprintf(os.Stderr, "load duration:        %v\n", r.LoadDuration)
+	if m.LoadDuration > 0 {
+		fmt.Fprintf(os.Stderr, "load duration:        %v\n", m.LoadDuration)
 	}

-	if r.PromptEvalCount > 0 {
-		fmt.Fprintf(os.Stderr, "prompt eval count:    %d token(s)\n", r.PromptEvalCount)
+	if m.PromptEvalCount > 0 {
+		fmt.Fprintf(os.Stderr, "prompt eval count:    %d token(s)\n", m.PromptEvalCount)
 	}

-	if r.PromptEvalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "prompt eval duration: %s\n", r.PromptEvalDuration)
-		fmt.Fprintf(os.Stderr, "prompt eval rate:     %.2f tokens/s\n", float64(r.PromptEvalCount)/r.PromptEvalDuration.Seconds())
+	if m.PromptEvalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "prompt eval duration: %s\n", m.PromptEvalDuration)
+		fmt.Fprintf(os.Stderr, "prompt eval rate:     %.2f tokens/s\n", float64(m.PromptEvalCount)/m.PromptEvalDuration.Seconds())
 	}

-	if r.EvalCount > 0 {
-		fmt.Fprintf(os.Stderr, "eval count:           %d token(s)\n", r.EvalCount)
+	if m.EvalCount > 0 {
+		fmt.Fprintf(os.Stderr, "eval count:           %d token(s)\n", m.EvalCount)
 	}

-	if r.EvalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "eval duration:        %s\n", r.EvalDuration)
-		fmt.Fprintf(os.Stderr, "eval rate:            %.2f tokens/s\n", float64(r.EvalCount)/r.EvalDuration.Seconds())
+	if m.EvalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "eval duration:        %s\n", m.EvalDuration)
+		fmt.Fprintf(os.Stderr, "eval rate:            %.2f tokens/s\n", float64(m.EvalCount)/m.EvalDuration.Seconds())
 	}
 }

@@ -360,3 +425,63 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {

 	return nil
 }
+
+// FormatParams converts specified parameter options to their correct types
+func FormatParams(params map[string][]string) (map[string]interface{}, error) {
+	opts := Options{}
+	valueOpts := reflect.ValueOf(&opts).Elem() // names of the fields in the options struct
+	typeOpts := reflect.TypeOf(opts)           // types of the fields in the options struct
+
+	// build map of json struct tags to their types
+	jsonOpts := make(map[string]reflect.StructField)
+	for _, field := range reflect.VisibleFields(typeOpts) {
+		jsonTag := strings.Split(field.Tag.Get("json"), ",")[0]
+		if jsonTag != "" {
+			jsonOpts[jsonTag] = field
+		}
+	}
+
+	out := make(map[string]interface{})
+	// iterate params and set values based on json struct tags
+	for key, vals := range params {
+		if opt, ok := jsonOpts[key]; !ok {
+			return nil, fmt.Errorf("unknown parameter '%s'", key)
+		} else {
+			field := valueOpts.FieldByName(opt.Name)
+			if field.IsValid() && field.CanSet() {
+				switch field.Kind() {
+				case reflect.Float32:
+					floatVal, err := strconv.ParseFloat(vals[0], 32)
+					if err != nil {
+						return nil, fmt.Errorf("invalid float value %s", vals)
+					}
+
+					out[key] = float32(floatVal)
+				case reflect.Int:
+					intVal, err := strconv.ParseInt(vals[0], 10, 64)
+					if err != nil {
+						return nil, fmt.Errorf("invalid int value %s", vals)
+					}
+
+					out[key] = intVal
+				case reflect.Bool:
+					boolVal, err := strconv.ParseBool(vals[0])
+					if err != nil {
+						return nil, fmt.Errorf("invalid bool value %s", vals)
+					}
+
+					out[key] = boolVal
+				case reflect.String:
+					out[key] = vals[0]
+				case reflect.Slice:
+					// TODO: only string slices are supported right now
+					out[key] = vals
+				default:
+					return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
+				}
+			}
+		}
+	}
+
+	return out, nil
+}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -31,7 +31,6 @@ import (
 	"github.com/jmorganca/ollama/format"
 	"github.com/jmorganca/ollama/parser"
 	"github.com/jmorganca/ollama/progress"
-	"github.com/jmorganca/ollama/readline"
 	"github.com/jmorganca/ollama/server"
 	"github.com/jmorganca/ollama/version"
 )
@@ -133,7 +132,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	}

 	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile)}
-	if err := client.Create(context.Background(), &request, fn); err != nil {
+	if err := client.Create(cmd.Context(), &request, fn); err != nil {
 		return err
 	}

@@ -148,11 +147,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 	name := args[0]
 	// check if the model exists on the server
-	_, err = client.Show(context.Background(), &api.ShowRequest{Name: name})
+	_, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
 	var statusError api.StatusError
 	switch {
 	case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
-		if err := PullHandler(cmd, args); err != nil {
+		if err := PullHandler(cmd, []string{name}); err != nil {
 			return err
 		}
 	case err != nil:
@@ -208,7 +207,7 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 	}

 	request := api.PushRequest{Name: args[0], Insecure: insecure}
-	if err := client.Push(context.Background(), &request, fn); err != nil {
+	if err := client.Push(cmd.Context(), &request, fn); err != nil {
 		return err
 	}

@@ -222,7 +221,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	models, err := client.List(context.Background())
+	models, err := client.List(cmd.Context())
 	if err != nil {
 		return err
 	}
@@ -257,7 +256,7 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {

 	for _, name := range args {
 		req := api.DeleteRequest{Name: name}
-		if err := client.Delete(context.Background(), &req); err != nil {
+		if err := client.Delete(cmd.Context(), &req); err != nil {
 			return err
 		}
 		fmt.Printf("deleted '%s'\n", name)
@@ -322,7 +321,7 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	}

 	req := api.ShowRequest{Name: args[0]}
-	resp, err := client.Show(context.Background(), &req)
+	resp, err := client.Show(cmd.Context(), &req)
 	if err != nil {
 		return err
 	}
@@ -350,7 +349,7 @@ func CopyHandler(cmd *cobra.Command, args []string) error {
 	}

 	req := api.CopyRequest{Source: args[0], Destination: args[1]}
-	if err := client.Copy(context.Background(), &req); err != nil {
+	if err := client.Copy(cmd.Context(), &req); err != nil {
 		return err
 	}
 	fmt.Printf("copied '%s' to '%s'\n", args[0], args[1])
@@ -404,7 +403,7 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 	}

 	request := api.PullRequest{Name: args[0], Insecure: insecure}
-	if err := client.Pull(context.Background(), &request, fn); err != nil {
+	if err := client.Pull(cmd.Context(), &request, fn); err != nil {
 		return err
 	}

@@ -412,13 +411,21 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 }

 func RunGenerate(cmd *cobra.Command, args []string) error {
+	interactive := true
+
+	opts := runOptions{
+		Model:    args[0],
+		WordWrap: os.Getenv("TERM") == "xterm-256color",
+		Options:  map[string]interface{}{},
+	}
+
 	format, err := cmd.Flags().GetString("format")
 	if err != nil {
 		return err
 	}
+	opts.Format = format

 	prompts := args[1:]
-
 	// prepend stdin to the prompt if provided
 	if !term.IsTerminal(int(os.Stdin.Fd())) {
 		in, err := io.ReadAll(os.Stdin)
@@ -427,34 +434,160 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 		}

 		prompts = append([]string{string(in)}, prompts...)
+		opts.WordWrap = false
+		interactive = false
 	}
-
-	// output is being piped
-	if !term.IsTerminal(int(os.Stdout.Fd())) {
-		return generate(cmd, args[0], strings.Join(prompts, " "), false, format)
+	opts.Prompt = strings.Join(prompts, " ")
+	if len(prompts) > 0 {
+		interactive = false
 	}

-	wordWrap := os.Getenv("TERM") == "xterm-256color"
-
 	nowrap, err := cmd.Flags().GetBool("nowordwrap")
 	if err != nil {
 		return err
 	}
-	if nowrap {
-		wordWrap = false
+	opts.WordWrap = !nowrap
+
+	if !interactive {
+		return generate(cmd, opts)
 	}

-	// prompts are provided via stdin or args so don't enter interactive mode
-	if len(prompts) > 0 {
-		return generate(cmd, args[0], strings.Join(prompts, " "), wordWrap, format)
-	}
-
-	return generateInteractive(cmd, args[0], wordWrap, format)
+	return generateInteractive(cmd, opts)
 }

 type generateContextKey string

-func generate(cmd *cobra.Command, model, prompt string, wordWrap bool, format string) error {
+type runOptions struct {
+	Model       string
+	ParentModel string
+	Prompt      string
+	Messages    []api.Message
+	WordWrap    bool
+	Format      string
+	System      string
+	Template    string
+	Images      []api.ImageData
+	Options     map[string]interface{}
+	MultiModal  bool
+}
+
+type displayResponseState struct {
+	lineLength int
+	wordBuffer string
+}
+
+func displayResponse(content string, wordWrap bool, state *displayResponseState) {
+	termWidth, _, _ := term.GetSize(int(os.Stdout.Fd()))
+	if wordWrap && termWidth >= 10 {
+		for _, ch := range content {
+			if state.lineLength+1 > termWidth-5 {
+				if len(state.wordBuffer) > termWidth-10 {
+					fmt.Printf("%s%c", state.wordBuffer, ch)
+					state.wordBuffer = ""
+					state.lineLength = 0
+					continue
+				}
+
+				// backtrack the length of the last word and clear to the end of the line
+				fmt.Printf("\x1b[%dD\x1b[K\n", len(state.wordBuffer))
+				fmt.Printf("%s%c", state.wordBuffer, ch)
+				state.lineLength = len(state.wordBuffer) + 1
+			} else {
+				fmt.Print(string(ch))
+				state.lineLength += 1
+
+				switch ch {
+				case ' ':
+					state.wordBuffer = ""
+				case '\n':
+					state.lineLength = 0
+				default:
+					state.wordBuffer += string(ch)
+				}
+			}
+		}
+	} else {
+		fmt.Printf("%s%s", state.wordBuffer, content)
+		if len(state.wordBuffer) > 0 {
+			state.wordBuffer = ""
+		}
+	}
+}
+
+func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return nil, err
+	}
+
+	p := progress.NewProgress(os.Stderr)
+	defer p.StopAndClear()
+
+	spinner := progress.NewSpinner("")
+	p.Add("", spinner)
+
+	cancelCtx, cancel := context.WithCancel(cmd.Context())
+	defer cancel()
+
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT)
+
+	go func() {
+		<-sigChan
+		cancel()
+	}()
+
+	var state *displayResponseState = &displayResponseState{}
+	var latest api.ChatResponse
+	var fullResponse strings.Builder
+	var role string
+
+	fn := func(response api.ChatResponse) error {
+		p.StopAndClear()
+
+		latest = response
+
+		role = response.Message.Role
+		content := response.Message.Content
+		fullResponse.WriteString(content)
+
+		displayResponse(content, opts.WordWrap, state)
+
+		return nil
+	}
+
+	req := &api.ChatRequest{
+		Model:    opts.Model,
+		Messages: opts.Messages,
+		Format:   opts.Format,
+		Options:  opts.Options,
+	}
+
+	if err := client.Chat(cancelCtx, req, fn); err != nil {
+		if errors.Is(err, context.Canceled) {
+			return nil, nil
+		}
+		return nil, err
+	}
+
+	if len(opts.Messages) > 0 {
+		fmt.Println()
+		fmt.Println()
+	}
+
+	verbose, err := cmd.Flags().GetBool("verbose")
+	if err != nil {
+		return nil, err
+	}
+
+	if verbose {
+		latest.Summary()
+	}
+
+	return &api.Message{Role: role, Content: fullResponse.String()}, nil
+}
+
+func generate(cmd *cobra.Command, opts runOptions) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
@@ -473,77 +606,54 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool, format st
 		generateContext = []int{}
 	}

-	termWidth, _, err := term.GetSize(int(os.Stdout.Fd()))
-	if err != nil {
-		wordWrap = false
-	}
-
-	cancelCtx, cancel := context.WithCancel(context.Background())
+	ctx, cancel := context.WithCancel(cmd.Context())
 	defer cancel()

 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT)
-	var abort bool

 	go func() {
 		<-sigChan
 		cancel()
-		abort = true
 	}()

-	var currentLineLength int
-	var wordBuffer string
+	var state *displayResponseState = &displayResponseState{}

-	request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext, Format: format}
 	fn := func(response api.GenerateResponse) error {
 		p.StopAndClear()

 		latest = response
+		content := response.Response

-		if wordWrap {
-			for _, ch := range response.Response {
-				if currentLineLength+1 > termWidth-5 {
-					// backtrack the length of the last word and clear to the end of the line
-					fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
-					fmt.Printf("%s%c", wordBuffer, ch)
-					currentLineLength = len(wordBuffer) + 1
-				} else {
-					fmt.Print(string(ch))
-					currentLineLength += 1
-
-					switch ch {
-					case ' ':
-						wordBuffer = ""
-					case '\n':
-						currentLineLength = 0
-					default:
-						wordBuffer += string(ch)
-					}
-				}
-			}
-		} else {
-			fmt.Print(response.Response)
-		}
+		displayResponse(content, opts.WordWrap, state)

 		return nil
 	}

-	if err := client.Generate(cancelCtx, &request, fn); err != nil {
-		if strings.Contains(err.Error(), "context canceled") && abort {
+	request := api.GenerateRequest{
+		Model:    opts.Model,
+		Prompt:   opts.Prompt,
+		Context:  generateContext,
+		Format:   opts.Format,
+		System:   opts.System,
+		Template: opts.Template,
+		Options:  opts.Options,
+	}
+
+	if err := client.Generate(ctx, &request, fn); err != nil {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		return err
 	}
-	if prompt != "" {
+
+	if opts.Prompt != "" {
 		fmt.Println()
 		fmt.Println()
 	}

 	if !latest.Done {
-		if abort {
-			return nil
-		}
-		return errors.New("unexpected end of response")
+		return nil
 	}

 	verbose, err := cmd.Flags().GetBool("verbose")
@@ -555,227 +665,12 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool, format st
 		latest.Summary()
 	}

-	ctx := cmd.Context()
-	ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
+	ctx = context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context)
 	cmd.SetContext(ctx)

 	return nil
 }

-func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format string) error {
-	// load the model
-	if err := generate(cmd, model, "", false, ""); err != nil {
-		return err
-	}
-
-	usage := func() {
-		fmt.Fprintln(os.Stderr, "Available Commands:")
-		fmt.Fprintln(os.Stderr, "  /set         Set session variables")
-		fmt.Fprintln(os.Stderr, "  /show        Show model information")
-		fmt.Fprintln(os.Stderr, "  /bye         Exit")
-		fmt.Fprintln(os.Stderr, "  /?, /help    Help for a command")
-		fmt.Fprintln(os.Stderr, "")
-		fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
-		fmt.Fprintln(os.Stderr, "")
-	}
-
-	usageSet := func() {
-		fmt.Fprintln(os.Stderr, "Available Commands:")
-		fmt.Fprintln(os.Stderr, "  /set history      Enable history")
-		fmt.Fprintln(os.Stderr, "  /set nohistory    Disable history")
-		fmt.Fprintln(os.Stderr, "  /set wordwrap     Enable wordwrap")
-		fmt.Fprintln(os.Stderr, "  /set nowordwrap   Disable wordwrap")
-		fmt.Fprintln(os.Stderr, "  /set format json  Enable JSON mode")
-		fmt.Fprintln(os.Stderr, "  /set noformat     Disable formatting")
-		fmt.Fprintln(os.Stderr, "  /set verbose      Show LLM stats")
-		fmt.Fprintln(os.Stderr, "  /set quiet        Disable LLM stats")
-		fmt.Fprintln(os.Stderr, "")
-	}
-
-	usageShow := func() {
-		fmt.Fprintln(os.Stderr, "Available Commands:")
-		fmt.Fprintln(os.Stderr, "  /show license      Show model license")
-		fmt.Fprintln(os.Stderr, "  /show modelfile    Show Modelfile for this model")
-		fmt.Fprintln(os.Stderr, "  /show parameters   Show parameters for this model")
-		fmt.Fprintln(os.Stderr, "  /show system       Show system prompt")
-		fmt.Fprintln(os.Stderr, "  /show template     Show prompt template")
-		fmt.Fprintln(os.Stderr, "")
-	}
-
-	scanner, err := readline.New(readline.Prompt{
-		Prompt:         ">>> ",
-		AltPrompt:      "... ",
-		Placeholder:    "Send a message (/? for help)",
-		AltPlaceholder: `Use """ to end multi-line input`,
-	})
-	if err != nil {
-		return err
-	}
-
-	fmt.Print(readline.StartBracketedPaste)
-	defer fmt.Printf(readline.EndBracketedPaste)
-
-	var prompt string
-
-	for {
-		line, err := scanner.Readline()
-		switch {
-		case errors.Is(err, io.EOF):
-			fmt.Println()
-			return nil
-		case errors.Is(err, readline.ErrInterrupt):
-			if line == "" {
-				fmt.Println("\nUse Ctrl-D or /bye to exit.")
-			}
-
-			scanner.Prompt.UseAlt = false
-			prompt = ""
-
-			continue
-		case err != nil:
-			return err
-		}
-
-		switch {
-		case strings.HasPrefix(prompt, `"""`):
-			// if the prompt so far starts with """ then we're in multiline mode
-			// and we need to keep reading until we find a line that ends with """
-			cut, found := strings.CutSuffix(line, `"""`)
-			prompt += cut + "\n"
-
-			if !found {
-				continue
-			}
-
-			prompt = strings.TrimPrefix(prompt, `"""`)
-			scanner.Prompt.UseAlt = false
-		case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
-			scanner.Prompt.UseAlt = true
-			prompt += line + "\n"
-			continue
-		case scanner.Pasting:
-			prompt += line + "\n"
-			continue
-		case strings.HasPrefix(line, "/list"):
-			args := strings.Fields(line)
-			if err := ListHandler(cmd, args[1:]); err != nil {
-				return err
-			}
-		case strings.HasPrefix(line, "/set"):
-			args := strings.Fields(line)
-			if len(args) > 1 {
-				switch args[1] {
-				case "history":
-					scanner.HistoryEnable()
-				case "nohistory":
-					scanner.HistoryDisable()
-				case "wordwrap":
-					wordWrap = true
-					fmt.Println("Set 'wordwrap' mode.")
-				case "nowordwrap":
-					wordWrap = false
-					fmt.Println("Set 'nowordwrap' mode.")
-				case "verbose":
-					cmd.Flags().Set("verbose", "true")
-					fmt.Println("Set 'verbose' mode.")
-				case "quiet":
-					cmd.Flags().Set("verbose", "false")
-					fmt.Println("Set 'quiet' mode.")
-				case "format":
-					if len(args) < 3 || args[2] != "json" {
-						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
-					} else {
-						format = args[2]
-						fmt.Printf("Set format to '%s' mode.\n", args[2])
-					}
-				case "noformat":
-					format = ""
-					fmt.Println("Disabled format.")
-				default:
-					fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
-				}
-			} else {
-				usageSet()
-			}
-		case strings.HasPrefix(line, "/show"):
-			args := strings.Fields(line)
-			if len(args) > 1 {
-				client, err := api.ClientFromEnvironment()
-				if err != nil {
-					fmt.Println("error: couldn't connect to ollama server")
-					return err
-				}
-				resp, err := client.Show(cmd.Context(), &api.ShowRequest{Name: model})
-				if err != nil {
-					fmt.Println("error: couldn't get model")
-					return err
-				}
-
-				switch args[1] {
-				case "license":
-					if resp.License == "" {
-						fmt.Print("No license was specified for this model.\n\n")
-					} else {
-						fmt.Println(resp.License)
-					}
-				case "modelfile":
-					fmt.Println(resp.Modelfile)
-				case "parameters":
-					if resp.Parameters == "" {
-						fmt.Print("No parameters were specified for this model.\n\n")
-					} else {
-						fmt.Println(resp.Parameters)
-					}
-				case "system":
-					if resp.System == "" {
-						fmt.Print("No system prompt was specified for this model.\n\n")
-					} else {
-						fmt.Println(resp.System)
-					}
-				case "template":
-					if resp.Template == "" {
-						fmt.Print("No prompt template was specified for this model.\n\n")
-					} else {
-						fmt.Println(resp.Template)
-					}
-				default:
-					fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
-				}
-			} else {
-				usageShow()
-			}
-		case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
-			args := strings.Fields(line)
-			if len(args) > 1 {
-				switch args[1] {
-				case "set", "/set":
-					usageSet()
-				case "show", "/show":
-					usageShow()
-				}
-			} else {
-				usage()
-			}
-		case line == "/exit", line == "/bye":
-			return nil
-		case strings.HasPrefix(line, "/"):
-			args := strings.Fields(line)
-			fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
-			continue
-		default:
-			prompt += line
-		}
-
-		if len(prompt) > 0 && prompt[0] != '/' {
-			if err := generate(cmd, model, prompt, wordWrap, format); err != nil {
-				return err
-			}
-
-			prompt = ""
-		}
-	}
-}
-
 func RunServer(cmd *cobra.Command, _ []string) error {
 	host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
 	if err != nil {
@@ -794,12 +689,7 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 		return err
 	}

-	var origins []string
-	if o := os.Getenv("OLLAMA_ORIGINS"); o != "" {
-		origins = strings.Split(o, ",")
-	}
-
-	return server.Serve(ln, origins)
+	return server.Serve(ln)
 }

 func initializeKeypair() error {
@@ -851,7 +741,7 @@ func initializeKeypair() error {
 	return nil
 }

-func startMacApp(client *api.Client) error {
+func startMacApp(ctx context.Context, client *api.Client) error {
 	exe, err := os.Executable()
 	if err != nil {
 		return err
@@ -875,24 +765,24 @@ func startMacApp(client *api.Client) error {
 		case <-timeout:
 			return errors.New("timed out waiting for server to start")
 		case <-tick:
-			if err := client.Heartbeat(context.Background()); err == nil {
+			if err := client.Heartbeat(ctx); err == nil {
 				return nil // server has started
 			}
 		}
 	}
 }

-func checkServerHeartbeat(_ *cobra.Command, _ []string) error {
+func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
 	}
-	if err := client.Heartbeat(context.Background()); err != nil {
+	if err := client.Heartbeat(cmd.Context()); err != nil {
 		if !strings.Contains(err.Error(), "connection refused") {
 			return err
 		}
 		if runtime.GOOS == "darwin" {
-			if err := startMacApp(client); err != nil {
+			if err := startMacApp(cmd.Context(), client); err != nil {
 				return fmt.Errorf("could not connect to ollama app, is it running?")
 			}
 		} else {
@@ -902,8 +792,29 @@ func checkServerHeartbeat(_ *cobra.Command, _ []string) error {
 	return nil
 }

+func versionHandler(cmd *cobra.Command, _ []string) {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return
+	}
+
+	serverVersion, err := client.Version(cmd.Context())
+	if err != nil {
+		fmt.Println("Warning: could not connect to a running Ollama instance")
+	}
+
+	if serverVersion != "" {
+		fmt.Printf("ollama version is %s\n", serverVersion)
+	}
+
+	if serverVersion != version.Version {
+		fmt.Printf("Warning: client version is %s\n", version.Version)
+	}
+}
+
 func NewCLI() *cobra.Command {
 	log.SetFlags(log.LstdFlags | log.Lshortfile)
+	cobra.EnableCommandSorting = false

 	rootCmd := &cobra.Command{
 		Use:           "ollama",
@@ -913,10 +824,17 @@ func NewCLI() *cobra.Command {
 		CompletionOptions: cobra.CompletionOptions{
 			DisableDefaultCmd: true,
 		},
-		Version: version.Version,
+		Run: func(cmd *cobra.Command, args []string) {
+			if version, _ := cmd.Flags().GetBool("version"); version {
+				versionHandler(cmd, args)
+				return
+			}
+
+			cmd.Print(cmd.UsageString())
+		},
 	}

-	cobra.EnableCommandSorting = false
+	rootCmd.Flags().BoolP("version", "v", false, "Show version information")

 	createCmd := &cobra.Command{
 		Use:     "create MODEL",
@@ -940,7 +858,7 @@ func NewCLI() *cobra.Command {
 	showCmd.Flags().Bool("modelfile", false, "Show Modelfile of a model")
 	showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
 	showCmd.Flags().Bool("template", false, "Show template of a model")
-	showCmd.Flags().Bool("system", false, "Show system prompt of a model")
+	showCmd.Flags().Bool("system", false, "Show system message of a model")

 	runCmd := &cobra.Command{
 		Use:     "run MODEL [PROMPT]",
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -0,0 +1,658 @@
+package cmd
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"regexp"
+	"sort"
+	"strings"
+
+	"github.com/spf13/cobra"
+	"golang.org/x/exp/slices"
+
+	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/progress"
+	"github.com/jmorganca/ollama/readline"
+)
+
+type MultilineState int
+
+const (
+	MultilineNone MultilineState = iota
+	MultilinePrompt
+	MultilineSystem
+	MultilineTemplate
+)
+
+// loadModel fetches details for opts.Model through the Show API and records
+// on opts whether the model is multimodal (its families include "clip"), its
+// parent model, and any messages returned with it. It then issues a chat
+// request with no messages and, for each response received, prints the
+// recorded user and assistant messages. A spinner is displayed on stderr
+// until the first response arrives or the function returns.
+func loadModel(cmd *cobra.Command, opts *runOptions) error {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	p := progress.NewProgress(os.Stderr)
+	defer p.StopAndClear()
+	p.Add("", progress.NewSpinner(""))
+
+	showResp, err := client.Show(cmd.Context(), &api.ShowRequest{Name: opts.Model})
+	if err != nil {
+		return err
+	}
+
+	details := showResp.Details
+	opts.MultiModal = slices.Contains(details.Families, "clip")
+	opts.ParentModel = details.ParentModel
+	// Appending an empty list is a no-op, so no length check is needed.
+	opts.Messages = append(opts.Messages, showResp.Messages...)
+
+	// printHistory replays the recorded conversation in the same form the
+	// interactive prompt would have shown it.
+	printHistory := func(api.ChatResponse) error {
+		p.StopAndClear()
+		for _, msg := range opts.Messages {
+			switch msg.Role {
+			case "user":
+				fmt.Printf(">>> %s\n", msg.Content)
+			case "assistant":
+				displayResponse(msg.Content, opts.WordWrap, &displayResponseState{})
+				fmt.Println()
+				fmt.Println()
+			}
+		}
+		return nil
+	}
+
+	req := &api.ChatRequest{
+		Model:    opts.Model,
+		Messages: []api.Message{},
+	}
+	return client.Chat(cmd.Context(), req, printHistory)
+}
+
+// generateInteractive runs the interactive prompt loop for opts.Model.
+//
+// It loads the model, then repeatedly reads a line from the terminal and
+// either handles it as a slash command (/set, /show, /load, /save, /list,
+// /help, /?, /bye, /exit) or accumulates it as message text. Text wrapped in
+// """ may span several lines. Once a complete message is available it is
+// appended to opts.Messages, sent with chat, and the assistant reply (if any)
+// is appended as well.
+//
+// It returns nil on end of input or /bye, /exit, and otherwise the first error
+// from reading input, talking to the server, or reading an image file.
+func generateInteractive(cmd *cobra.Command, opts runOptions) error {
+	opts.Messages = make([]api.Message, 0)
+
+	err := loadModel(cmd, &opts)
+	if err != nil {
+		return err
+	}
+
+	usage := func() {
+		fmt.Fprintln(os.Stderr, "Available Commands:")
+		fmt.Fprintln(os.Stderr, "  /set            Set session variables")
+		fmt.Fprintln(os.Stderr, "  /show           Show model information")
+		fmt.Fprintln(os.Stderr, "  /load <model>   Load a session or model")
+		fmt.Fprintln(os.Stderr, "  /save <model>   Save your current session")
+		fmt.Fprintln(os.Stderr, "  /bye            Exit")
+		fmt.Fprintln(os.Stderr, "  /?, /help       Help for a command")
+		fmt.Fprintln(os.Stderr, "  /? shortcuts    Help for keyboard shortcuts")
+		fmt.Fprintln(os.Stderr, "")
+		fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
+		fmt.Fprintln(os.Stderr, "")
+	}
+
+	usageSet := func() {
+		fmt.Fprintln(os.Stderr, "Available Commands:")
+		fmt.Fprintln(os.Stderr, "  /set parameter ...     Set a parameter")
+		fmt.Fprintln(os.Stderr, "  /set system <string>   Set system message")
+		fmt.Fprintln(os.Stderr, "  /set template <string> Set prompt template")
+		fmt.Fprintln(os.Stderr, "  /set history           Enable history")
+		fmt.Fprintln(os.Stderr, "  /set nohistory         Disable history")
+		fmt.Fprintln(os.Stderr, "  /set wordwrap          Enable wordwrap")
+		fmt.Fprintln(os.Stderr, "  /set nowordwrap        Disable wordwrap")
+		fmt.Fprintln(os.Stderr, "  /set format json       Enable JSON mode")
+		fmt.Fprintln(os.Stderr, "  /set noformat          Disable formatting")
+		fmt.Fprintln(os.Stderr, "  /set verbose           Show LLM stats")
+		fmt.Fprintln(os.Stderr, "  /set quiet             Disable LLM stats")
+		fmt.Fprintln(os.Stderr, "")
+	}
+
+	usageShortcuts := func() {
+		fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
+		fmt.Fprintln(os.Stderr, "  Ctrl + a            Move to the beginning of the line (Home)")
+		fmt.Fprintln(os.Stderr, "  Ctrl + e            Move to the end of the line (End)")
+		fmt.Fprintln(os.Stderr, "   Alt + b            Move back (left) one word")
+		fmt.Fprintln(os.Stderr, "   Alt + f            Move forward (right) one word")
+		fmt.Fprintln(os.Stderr, "  Ctrl + k            Delete the sentence after the cursor")
+		fmt.Fprintln(os.Stderr, "  Ctrl + u            Delete the sentence before the cursor")
+		fmt.Fprintln(os.Stderr, "")
+		fmt.Fprintln(os.Stderr, "  Ctrl + l            Clear the screen")
+		fmt.Fprintln(os.Stderr, "  Ctrl + c            Stop the model from responding")
+		fmt.Fprintln(os.Stderr, "  Ctrl + d            Exit ollama (/bye)")
+		fmt.Fprintln(os.Stderr, "")
+	}
+
+	usageShow := func() {
+		fmt.Fprintln(os.Stderr, "Available Commands:")
+		fmt.Fprintln(os.Stderr, "  /show info         Show details for this model")
+		fmt.Fprintln(os.Stderr, "  /show license      Show model license")
+		fmt.Fprintln(os.Stderr, "  /show modelfile    Show Modelfile for this model")
+		fmt.Fprintln(os.Stderr, "  /show parameters   Show parameters for this model")
+		fmt.Fprintln(os.Stderr, "  /show system       Show system message")
+		fmt.Fprintln(os.Stderr, "  /show template     Show prompt template")
+		fmt.Fprintln(os.Stderr, "")
+	}
+
+	// only list out the most common parameters
+	usageParameters := func() {
+		fmt.Fprintln(os.Stderr, "Available Parameters:")
+		fmt.Fprintln(os.Stderr, "  /set parameter seed <int>             Random number seed")
+		fmt.Fprintln(os.Stderr, "  /set parameter num_predict <int>      Max number of tokens to predict")
+		fmt.Fprintln(os.Stderr, "  /set parameter top_k <int>            Pick from top k num of tokens")
+		fmt.Fprintln(os.Stderr, "  /set parameter top_p <float>          Pick token based on sum of probabilities")
+		fmt.Fprintln(os.Stderr, "  /set parameter num_ctx <int>          Set the context size")
+		fmt.Fprintln(os.Stderr, "  /set parameter temperature <float>    Set creativity level")
+		fmt.Fprintln(os.Stderr, "  /set parameter repeat_penalty <float> How strongly to penalize repetitions")
+		fmt.Fprintln(os.Stderr, "  /set parameter repeat_last_n <int>    Set how far back to look for repetitions")
+		fmt.Fprintln(os.Stderr, "  /set parameter num_gpu <int>          The number of layers to send to the GPU")
+		fmt.Fprintln(os.Stderr, "  /set parameter stop \"<string>\", ...   Set the stop parameters")
+		fmt.Fprintln(os.Stderr, "")
+	}
+
+	scanner, err := readline.New(readline.Prompt{
+		Prompt:         ">>> ",
+		AltPrompt:      "... ",
+		Placeholder:    "Send a message (/? for help)",
+		AltPlaceholder: `Use """ to end multi-line input`,
+	})
+	if err != nil {
+		return err
+	}
+
+	fmt.Print(readline.StartBracketedPaste)
+	// Print, not Printf: the value is emitted verbatim and is not a format string.
+	defer fmt.Print(readline.EndBracketedPaste)
+
+	// sb accumulates the text of the message (or system/template value)
+	// currently being entered; multiline records what that text is for.
+	var sb strings.Builder
+	var multiline MultilineState
+
+	for {
+		line, err := scanner.Readline()
+		switch {
+		case errors.Is(err, io.EOF):
+			fmt.Println()
+			return nil
+		case errors.Is(err, readline.ErrInterrupt):
+			if line == "" {
+				fmt.Println("\nUse Ctrl + d or /bye to exit.")
+			}
+
+			// Abandon any multi-line input in progress. Without clearing
+			// multiline, the next line typed at the normal prompt would still
+			// be consumed as a continuation of the cancelled block.
+			multiline = MultilineNone
+			scanner.Prompt.UseAlt = false
+			sb.Reset()
+
+			continue
+		case err != nil:
+			return err
+		}
+
+		switch {
+		case multiline != MultilineNone:
+			// check if there's a multiline terminating string
+			before, ok := strings.CutSuffix(line, `"""`)
+			sb.WriteString(before)
+			if !ok {
+				fmt.Fprintln(&sb)
+				continue
+			}
+
+			// A multi-line prompt is left in sb so it is sent below; system
+			// and template text is stored on opts and the buffer is cleared.
+			switch multiline {
+			case MultilineSystem:
+				opts.System = sb.String()
+				fmt.Println("Set system message.")
+				sb.Reset()
+			case MultilineTemplate:
+				opts.Template = sb.String()
+				fmt.Println("Set prompt template.")
+				sb.Reset()
+			}
+
+			multiline = MultilineNone
+			scanner.Prompt.UseAlt = false
+		case strings.HasPrefix(line, `"""`):
+			line := strings.TrimPrefix(line, `"""`)
+			line, ok := strings.CutSuffix(line, `"""`)
+			sb.WriteString(line)
+			if !ok {
+				// no multiline terminating string; need more input
+				fmt.Fprintln(&sb)
+				multiline = MultilinePrompt
+				scanner.Prompt.UseAlt = true
+				break
+			}
+		case scanner.Pasting:
+			fmt.Fprintln(&sb, line)
+			continue
+		case strings.HasPrefix(line, "/list"):
+			args := strings.Fields(line)
+			if err := ListHandler(cmd, args[1:]); err != nil {
+				return err
+			}
+		case strings.HasPrefix(line, "/load"):
+			args := strings.Fields(line)
+			if len(args) != 2 {
+				fmt.Println("Usage:\n  /load <modelname>")
+				continue
+			}
+			opts.Model = args[1]
+			opts.Messages = []api.Message{}
+			fmt.Printf("Loading model '%s'\n", opts.Model)
+			if err := loadModel(cmd, &opts); err != nil {
+				return err
+			}
+			continue
+		case strings.HasPrefix(line, "/save"):
+			args := strings.Fields(line)
+			if len(args) != 2 {
+				fmt.Println("Usage:\n  /save <modelname>")
+				continue
+			}
+
+			client, err := api.ClientFromEnvironment()
+			if err != nil {
+				fmt.Println("error: couldn't connect to ollama server")
+				return err
+			}
+
+			req := &api.CreateRequest{
+				Name:      args[1],
+				Modelfile: buildModelfile(opts),
+			}
+			fn := func(resp api.ProgressResponse) error { return nil }
+			err = client.Create(cmd.Context(), req, fn)
+			if err != nil {
+				fmt.Println("error: couldn't save model")
+				return err
+			}
+			fmt.Printf("Created new model '%s'\n", args[1])
+			continue
+		case strings.HasPrefix(line, "/set"):
+			args := strings.Fields(line)
+			if len(args) > 1 {
+				switch args[1] {
+				case "history":
+					scanner.HistoryEnable()
+				case "nohistory":
+					scanner.HistoryDisable()
+				case "wordwrap":
+					opts.WordWrap = true
+					fmt.Println("Set 'wordwrap' mode.")
+				case "nowordwrap":
+					opts.WordWrap = false
+					fmt.Println("Set 'nowordwrap' mode.")
+				case "verbose":
+					cmd.Flags().Set("verbose", "true")
+					fmt.Println("Set 'verbose' mode.")
+				case "quiet":
+					cmd.Flags().Set("verbose", "false")
+					fmt.Println("Set 'quiet' mode.")
+				case "format":
+					if len(args) < 3 || args[2] != "json" {
+						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
+					} else {
+						opts.Format = args[2]
+						fmt.Printf("Set format to '%s' mode.\n", args[2])
+					}
+				case "noformat":
+					opts.Format = ""
+					fmt.Println("Disabled format.")
+				case "parameter":
+					if len(args) < 4 {
+						usageParameters()
+						continue
+					}
+					params := args[3:]
+					fp, err := api.FormatParams(map[string][]string{args[2]: params})
+					if err != nil {
+						fmt.Printf("Couldn't set parameter: %q\n", err)
+						continue
+					}
+					fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", "))
+					if opts.Options == nil {
+						// Assigning into a nil map panics; start with an empty one.
+						opts.Options = map[string]interface{}{}
+					}
+					opts.Options[args[2]] = fp[args[2]]
+				case "system", "template":
+					if len(args) < 3 {
+						usageSet()
+						continue
+					}
+
+					if args[1] == "system" {
+						multiline = MultilineSystem
+					} else if args[1] == "template" {
+						multiline = MultilineTemplate
+					}
+
+					line := strings.Join(args[2:], " ")
+					line, ok := strings.CutPrefix(line, `"""`)
+					if !ok {
+						multiline = MultilineNone
+					} else {
+						// only cut suffix if the line is multiline
+						line, ok = strings.CutSuffix(line, `"""`)
+						if ok {
+							multiline = MultilineNone
+						}
+					}
+
+					sb.WriteString(line)
+					if multiline != MultilineNone {
+						scanner.Prompt.UseAlt = true
+						continue
+					}
+
+					if args[1] == "system" {
+						opts.System = sb.String()
+						fmt.Println("Set system message.")
+					} else if args[1] == "template" {
+						opts.Template = sb.String()
+						fmt.Println("Set prompt template.")
+					}
+
+					sb.Reset()
+					continue
+				default:
+					fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
+				}
+			} else {
+				usageSet()
+			}
+		case strings.HasPrefix(line, "/show"):
+			args := strings.Fields(line)
+			if len(args) > 1 {
+				client, err := api.ClientFromEnvironment()
+				if err != nil {
+					fmt.Println("error: couldn't connect to ollama server")
+					return err
+				}
+				req := &api.ShowRequest{
+					Name:     opts.Model,
+					System:   opts.System,
+					Template: opts.Template,
+					Options:  opts.Options,
+				}
+				resp, err := client.Show(cmd.Context(), req)
+				if err != nil {
+					fmt.Println("error: couldn't get model")
+					return err
+				}
+
+				switch args[1] {
+				case "info":
+					fmt.Println("Model details:")
+					if len(resp.Details.Families) > 0 {
+						fmt.Printf("Family              %s\n", strings.Join(resp.Details.Families, ", "))
+					} else if resp.Details.Family != "" {
+						fmt.Printf("Family              %s\n", resp.Details.Family)
+					}
+					fmt.Printf("Parameter Size      %s\n", resp.Details.ParameterSize)
+					fmt.Printf("Quantization Level  %s\n", resp.Details.QuantizationLevel)
+					fmt.Println("")
+				case "license":
+					if resp.License == "" {
+						fmt.Println("No license was specified for this model.")
+					} else {
+						fmt.Println(resp.License)
+					}
+				case "modelfile":
+					fmt.Println(resp.Modelfile)
+				case "parameters":
+					if resp.Parameters == "" {
+						fmt.Println("No parameters were specified for this model.")
+					} else {
+						if len(opts.Options) > 0 {
+							fmt.Println("User defined parameters:")
+							for k, v := range opts.Options {
+								fmt.Printf("%-*s %v\n", 30, k, v)
+							}
+							fmt.Println()
+						}
+						fmt.Println("Model defined parameters:")
+						fmt.Println(resp.Parameters)
+					}
+				case "system":
+					// A value set in this session takes precedence over the model's own.
+					switch {
+					case opts.System != "":
+						fmt.Println(opts.System + "\n")
+					case resp.System != "":
+						fmt.Println(resp.System + "\n")
+					default:
+						fmt.Println("No system message was specified for this model.")
+					}
+				case "template":
+					switch {
+					case opts.Template != "":
+						fmt.Println(opts.Template + "\n")
+					case resp.Template != "":
+						fmt.Println(resp.Template)
+					default:
+						fmt.Println("No prompt template was specified for this model.")
+					}
+				default:
+					fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
+				}
+			} else {
+				usageShow()
+			}
+		case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
+			args := strings.Fields(line)
+			if len(args) > 1 {
+				switch args[1] {
+				case "set", "/set":
+					usageSet()
+				case "show", "/show":
+					usageShow()
+				case "shortcut", "shortcuts":
+					usageShortcuts()
+				}
+			} else {
+				usage()
+			}
+		case line == "/exit", line == "/bye":
+			return nil
+		case strings.HasPrefix(line, "/"):
+			// For multimodal models a leading "/" may be an absolute image
+			// path rather than a command; only reject it when no extracted
+			// file name starts with the first word of the line.
+			args := strings.Fields(line)
+			isFile := false
+
+			if opts.MultiModal {
+				for _, f := range extractFileNames(line) {
+					if strings.HasPrefix(f, args[0]) {
+						isFile = true
+						break
+					}
+				}
+			}
+
+			if !isFile {
+				fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
+				continue
+			}
+
+			sb.WriteString(line)
+		default:
+			sb.WriteString(line)
+		}
+
+		if sb.Len() > 0 && multiline == MultilineNone {
+			newMessage := api.Message{Role: "user", Content: sb.String()}
+
+			if opts.MultiModal {
+				msg, images, err := extractFileData(sb.String())
+				if err != nil {
+					return err
+				}
+				newMessage.Content = msg
+
+				// reset the context if we find another image
+				if len(images) > 0 {
+					newMessage.Images = append(newMessage.Images, images...)
+					// reset the context for the new image
+					opts.Messages = []api.Message{}
+				} else {
+					// No new image: reuse the images attached to the message
+					// two entries back in the history.
+					if len(opts.Messages) > 1 {
+						newMessage.Images = append(newMessage.Images, opts.Messages[len(opts.Messages)-2].Images...)
+					}
+				}
+				if len(newMessage.Images) == 0 {
+					fmt.Println("This model requires you to add a jpeg, png, or svg image.")
+					fmt.Println()
+					sb.Reset()
+					continue
+				}
+			}
+
+			if opts.System != "" {
+				opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
+			}
+			opts.Messages = append(opts.Messages, newMessage)
+
+			assistant, err := chat(cmd, opts)
+			if err != nil {
+				return err
+			}
+			if assistant != nil {
+				opts.Messages = append(opts.Messages, *assistant)
+			}
+
+			sb.Reset()
+		}
+	}
+}
+
+// buildModelfile renders the session held in opts as Modelfile text: a FROM
+// line (the parent model when set, otherwise the model itself), optional
+// SYSTEM and TEMPLATE entries, one PARAMETER line per option in sorted key
+// order, a blank line, and one MESSAGE line per recorded message.
+func buildModelfile(opts runOptions) string {
+	base := opts.Model
+	if opts.ParentModel != "" {
+		base = opts.ParentModel
+	}
+
+	var out strings.Builder
+	fmt.Fprintf(&out, "FROM %s\n", base)
+	if sys := opts.System; sys != "" {
+		fmt.Fprintf(&out, "SYSTEM \"\"\"%s\"\"\"\n", sys)
+	}
+	if tmpl := opts.Template; tmpl != "" {
+		fmt.Fprintf(&out, "TEMPLATE \"\"\"%s\"\"\"\n", tmpl)
+	}
+
+	// Map iteration order is random; sort the keys so output is stable.
+	names := make([]string, 0, len(opts.Options))
+	for name := range opts.Options {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+	for _, name := range names {
+		fmt.Fprintf(&out, "PARAMETER %s %v\n", name, opts.Options[name])
+	}
+	fmt.Fprintln(&out)
+
+	for i := range opts.Messages {
+		m := opts.Messages[i]
+		fmt.Fprintf(&out, "MESSAGE %s \"\"\"%s\"\"\"\n", m.Role, m.Content)
+	}
+
+	return out.String()
+}
+
+// normalizeFilePath removes shell-style backslash escapes from fp, turning
+// sequences such as `\ ` or `\(` back into the literal character.
+//
+// A strings.Replacer is used instead of ranging over a map and calling
+// strings.ReplaceAll once per entry. Map iteration order is randomized, so
+// with overlapping escapes (for example `\\` followed by a space) the result
+// depended on which replacement happened to run first. The Replacer makes a
+// single left-to-right pass and never rescans text it has already produced,
+// so every escape is resolved exactly once and the result is deterministic.
+func normalizeFilePath(fp string) string {
+	return strings.NewReplacer(
+		"\\ ", " ", // Escaped space
+		"\\(", "(", // Escaped left parenthesis
+		"\\)", ")", // Escaped right parenthesis
+		"\\[", "[", // Escaped left square bracket
+		"\\]", "]", // Escaped right square bracket
+		"\\{", "{", // Escaped left curly brace
+		"\\}", "}", // Escaped right curly brace
+		"\\$", "$", // Escaped dollar sign
+		"\\&", "&", // Escaped ampersand
+		"\\;", ";", // Escaped semicolon
+		"\\'", "'", // Escaped single quote
+		"\\\\", "\\", // Escaped backslash
+		"\\*", "*", // Escaped asterisk
+		"\\?", "?", // Escaped question mark
+	).Replace(fp)
+}
+
+// extractFileNames returns every substring of input that looks like a path to
+// a jpg, jpeg, png or svg file; the extension is matched case-insensitively.
+//
+// A candidate may begin with a drive letter, must then start with "./", "/"
+// or "\", and may contain spaces and backslashes before the extension. The
+// pattern deliberately over-matches: strings that are not real file names can
+// be returned, and are expected to be weeded out later by a file-existence check.
+func extractFileNames(input string) []string {
+	const imagePathPattern = `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
+
+	return regexp.MustCompile(imagePathPattern).FindAllString(input, -1)
+}
+
+// extractFileData looks for image paths in input, loads each one that exists
+// on disk, and returns input with the loaded paths removed alongside the
+// loaded image data. Candidates whose file does not exist are skipped and left
+// in the text. Any other load failure is printed and returned together with an
+// empty string and the images gathered so far.
+func extractFileData(input string) (string, []api.ImageData, error) {
+	var imgs []api.ImageData
+
+	for _, candidate := range extractFileNames(input) {
+		path := normalizeFilePath(candidate)
+
+		data, err := getImageData(path)
+		if err != nil && os.IsNotExist(err) {
+			// Not a real file; the regex match was a false positive.
+			continue
+		}
+		if err != nil {
+			fmt.Printf("Couldn't process image: %q\n", err)
+			return "", imgs, err
+		}
+
+		fmt.Printf("Added image '%s'\n", path)
+		// Strip the path as it was typed (still escaped) from the message text.
+		input = strings.ReplaceAll(input, candidate, "")
+		imgs = append(imgs, data)
+	}
+
+	return input, imgs, nil
+}
+
+// getImageData reads the file at filePath and returns its full contents.
+//
+// The content type is sniffed from the start of the file and must be one of
+// the allowed image types, and the file must not be larger than 100MB;
+// otherwise an error is returned. Errors from opening, reading, or
+// stat-ing the file are returned unchanged, so callers can test them with
+// os.IsNotExist.
+func getImageData(filePath string) ([]byte, error) {
+	file, err := os.Open(filePath)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	buf := make([]byte, 512)
+	n, err := file.Read(buf)
+	if err != nil {
+		return nil, err
+	}
+
+	// Sniff only the bytes actually read. Passing the whole buffer would pad
+	// short files with zero bytes, which skews detection (and the type named
+	// in the error below) for files smaller than the buffer.
+	contentType := http.DetectContentType(buf[:n])
+	// NOTE(review): http.DetectContentType does not appear to ever report
+	// "image/jpg" or "image/svg+xml"; confirm whether SVG input is expected
+	// to be accepted here.
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
+	if !slices.Contains(allowedTypes, contentType) {
+		return nil, fmt.Errorf("invalid image type: %s", contentType)
+	}
+
+	info, err := file.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	// Check if the file size exceeds 100MB
+	var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
+	if info.Size() > maxSize {
+		return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
+	}
+
+	// Rewind past the sniffed prefix and read the whole file in one go.
+	buf = make([]byte, info.Size())
+	_, err = file.Seek(0, 0)
+	if err != nil {
+		return nil, err
+	}
+
+	_, err = io.ReadFull(file, buf)
+	if err != nil {
+		return nil, err
+	}
+
+	return buf, nil
+}
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -0,0 +1,116 @@
+package cmd
+
+import (
+	"bytes"
+	"testing"
+	"text/template"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+// TestExtractFilenames checks that extractFileNames finds image paths written
+// in Unix and Windows styles (escaped spaces, unescaped spaces, drive
+// letters, a leading quote) and that the surrounding filler words and quote
+// characters are not included in any match.
+func TestExtractFilenames(t *testing.T) {
+	// Unix style paths
+	input := ` some preamble 
+ ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg`
+	res := extractFileNames(input)
+	assert.Len(t, res, 5)
+	assert.Contains(t, res[0], "one.png")
+	assert.Contains(t, res[1], "two.jpg")
+	assert.Contains(t, res[2], "three.jpeg")
+	assert.Contains(t, res[3], "four.png")
+	assert.Contains(t, res[4], "five.svg")
+	// The needle must be a string: a rune literal is not compared as text by
+	// testify, so the previous form could never fail.
+	assert.NotContains(t, res[4], `"`)
+	// Check every match for the filler word. Asserting on the slice itself
+	// only tests for an element equal to the needle, which is always absent.
+	for _, r := range res {
+		assert.NotContains(t, r, "inbetween")
+	}
+
+	// Windows style paths
+	input = ` some preamble
+ c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2 
+ /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
+./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6
+d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8 
+ d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending
+`
+	res = extractFileNames(input)
+	assert.Len(t, res, 10)
+	for _, r := range res {
+		assert.NotContains(t, r, "inbetween")
+	}
+	assert.Contains(t, res[0], "one.png")
+	assert.Contains(t, res[0], "c:")
+	assert.Contains(t, res[1], "two.jpg")
+	assert.Contains(t, res[1], "c:")
+	assert.Contains(t, res[2], "three.jpeg")
+	assert.Contains(t, res[3], "four.png")
+	assert.Contains(t, res[4], "five.svg")
+	assert.Contains(t, res[5], "six.png")
+	assert.Contains(t, res[6], "seven.svg")
+	assert.Contains(t, res[6], "d:")
+	assert.Contains(t, res[7], "eight.png")
+	assert.Contains(t, res[7], "c:")
+	assert.Contains(t, res[8], "nine.png")
+	assert.Contains(t, res[8], "d:")
+	assert.Contains(t, res[9], "ten.svg")
+	assert.Contains(t, res[9], "E:")
+}
+
+// TestModelfileBuilder checks the text produced by buildModelfile for a
+// session with a system message, template, options and messages: first with
+// no parent model (FROM names the model) and then with ParentModel set (FROM
+// names the parent).
+func TestModelfileBuilder(t *testing.T) {
+	opts := runOptions{
+		Model:    "hork",
+		System:   "You are part horse and part shark, but all hork. Do horklike things",
+		Template: "This is a template.",
+		Messages: []api.Message{
+			{Role: "user", Content: "Hey there hork!"},
+			{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
+		},
+		Options: map[string]interface{}{
+			"temperature":      0.9,
+			"seed":             42,
+			"penalize_newline": false,
+			"stop":             []string{"hi", "there"},
+		},
+	}
+
+	// render fills an expected-output template from the current opts.
+	render := func(text string) string {
+		tmpl, err := template.New("").Parse(text)
+		assert.Nil(t, err)
+
+		var buf bytes.Buffer
+		err = tmpl.Execute(&buf, opts)
+		assert.Nil(t, err)
+		return buf.String()
+	}
+
+	withoutParent := `FROM {{.Model}}
+SYSTEM """{{.System}}"""
+TEMPLATE """{{.Template}}"""
+PARAMETER penalize_newline false
+PARAMETER seed 42
+PARAMETER stop [hi there]
+PARAMETER temperature 0.9
+
+MESSAGE user """Hey there hork!"""
+MESSAGE assistant """Yes it is true, I am half horse, half shark."""
+`
+	mf := buildModelfile(opts)
+	assert.Equal(t, render(withoutParent), mf)
+
+	withParent := `FROM {{.ParentModel}}
+SYSTEM """{{.System}}"""
+TEMPLATE """{{.Template}}"""
+PARAMETER penalize_newline false
+PARAMETER seed 42
+PARAMETER stop [hi there]
+PARAMETER temperature 0.9
+
+MESSAGE user """Hey there hork!"""
+MESSAGE assistant """Yes it is true, I am half horse, half shark."""
+`
+	opts.ParentModel = "horseshark"
+	mf = buildModelfile(opts)
+	assert.Equal(t, render(withParent), mf)
+}
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,6 +1,25 @@
 # Documentation

- [Modelfile](./modelfile.md)
- [How to develop Ollama](./development.md)
- [API](./api.md)
- [Tutorials](./tutorials.md)
+To get started, see the project's **[quickstart](../README.md#quickstart)**.
+
+Ollama is a tool for running AI models on your hardware. Many users will choose to use the Command Line Interface (CLI) to work with Ollama. Learn more about all the commands in the CLI in the **[Main Readme](../README.md)**.
+
+Use the RESTful API from any language, including Python, JavaScript, TypeScript, Go, Rust, and many more. Learn more about using the API in the **[API Documentation](./api.md)**.
+
+Create new models or modify models already in the library using the Modelfile. Learn more about the Modelfile syntax in the **[Modelfile Documentation](./modelfile.md)**.
+
+Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.
+
+Installing on Linux in most cases is easy using the script on Ollama.ai. To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
+
+Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
+
+It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.
+
+If encountering a problem with Ollama, the best place to start is the logs. Find more information about them here in the **[Troubleshooting Guide](./troubleshooting.md)**.
+
+Finally, for all the questions that don't fit anywhere else, there is the **[FAQ](./faq.md)**.
+
+[Tutorials](./tutorials.md) apply the documentation to tasks.
+
+For working code examples of using Ollama, see [Examples](../examples).
--- a/docs/api.md
+++ b/docs/api.md
@@ -3,6 +3,7 @@
 ## Endpoints

 - [Generate a completion](#generate-a-completion)
+- [Generate a chat completion](#generate-a-chat-completion)
 - [Create a Model](#create-a-model)
 - [List Local Models](#list-local-models)
 - [Show Model Information](#show-model-information)
@@ -16,7 +17,7 @@

 ### Model names

-Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations

@@ -24,7 +25,7 @@ All durations are returned in nanoseconds.

 ### Streaming responses

-Certain endpoints stream responses as JSON objects delineated with the newline (`\n`) character.
+Certain endpoints stream responses as JSON objects and can optionally return non-streamed responses.

 ## Generate a completion

@@ -32,32 +33,35 @@ Certain endpoints stream responses as JSON objects delineated with the newline (
 POST /api/generate
 ```

-Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses. The final response object will include statistics and additional data from the request.
+Generate a response for a given prompt with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.

 ### Parameters

 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)

 Advanced parameters (optional):

 - `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system prompt to (overrides what is defined in the `Modelfile`)
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
+- `system`: system message to use (overrides what is defined in the `Modelfile`)
+- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `raw`: if `true` no formatting will be applied to the prompt and no context will be returned. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.
+- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.

-### JSON mode
+#### JSON mode

-Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as valid JSON. See the JSON mode [example](#request-json-mode) below.
+Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#generate-request-json-mode) below.

 > Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.

 ### Examples

-#### Request
+#### Generate request (Streaming)
+
+##### Request

 ```shell
 curl http://localhost:11434/api/generate -d '{
@@ -66,7 +70,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-#### Response
+##### Response

 A stream of JSON objects is returned:

@@ -83,8 +87,6 @@ The final response in the stream also includes additional data about the generat

 - `total_duration`: time spent generating the response
 - `load_duration`: time spent in nanoseconds loading the model
- `sample_count`: number of samples generated
- `sample_duration`: time spent generating samples
 - `prompt_eval_count`: number of tokens in the prompt
 - `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
 - `eval_count`: number of tokens the response
@@ -99,21 +101,23 @@ To calculate how fast the response is generated in tokens per second (token/s),
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "",
-  "context": [1, 2, 3],
  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 113,
-  "eval_duration": 1325948000
+  "context": [1, 2, 3],
+  "total_duration": 10706818083,
+  "load_duration": 6338219291,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 130079000,
+  "eval_count": 259,
+  "eval_duration": 4232710000
 }
 ```

 #### Request (No streaming)

+##### Request
+
+A response can be received in one reply when streaming is off.
+
 ```shell
 curl http://localhost:11434/api/generate -d '{
  "model": "llama2",
@@ -122,7 +126,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-#### Response
+##### Response

 If `stream` is set to `false`, the response will be a single JSON object:

@@ -131,51 +135,23 @@ If `stream` is set to `false`, the response will be a single JSON object:
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
+  "done": true,
  "context": [1, 2, 3],
-  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 13,
-  "eval_duration": 1325948000
-}
-```
-
-#### Request (Raw mode)
-
-In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting and context.
-
-```shell
-curl http://localhost:11434/api/generate -d '{
-  "model": "mistral",
-  "prompt": "[INST] why is the sky blue? [/INST]",
-  "raw": true,
-  "stream": false
-}'
-```
-
-#### Response
-
-```json
-{
-  "model": "mistral",
-  "created_at": "2023-11-03T15:36:02.583064Z",
-  "response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
-  "done": true,
-  "total_duration": 14648695333,
-  "load_duration": 3302671417,
-  "prompt_eval_count": 14,
-  "prompt_eval_duration": 286243000,
-  "eval_count": 129,
-  "eval_duration": 10931424000
+  "total_duration": 5043500667,
+  "load_duration": 5025959,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 325953000,
+  "eval_count": 290,
+  "eval_duration": 4709213000
 }
 ```

 #### Request (JSON mode)

+> When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.
+
+##### Request
+
 ```shell
 curl http://localhost:11434/api/generate -d '{
  "model": "llama2",
@@ -185,7 +161,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-#### Response
+##### Response

 ```json
 {
@@ -193,12 +169,13 @@ curl http://localhost:11434/api/generate -d '{
  "created_at": "2023-11-09T21:07:55.186497Z",
  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
  "done": true,
-  "total_duration": 4661289125,
-  "load_duration": 1714434500,
+  "context": [1, 2, 3],
+  "total_duration": 4648158584,
+  "load_duration": 4071084,
  "prompt_eval_count": 36,
-  "prompt_eval_duration": 264132000,
-  "eval_count": 75,
-  "eval_duration": 2112149000
+  "prompt_eval_duration": 439038000,
+  "eval_count": 180,
+  "eval_duration": 4196918000
 }
 ```

@@ -221,10 +198,77 @@ The value of `response` will be a string containing JSON similar to:
 }
 ```

-#### Request (With options)
+#### Request (with images)
+
+To submit images to multimodal models such as `llava` or `bakllava`, provide a list of base64-encoded `images`:
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llava",
+  "prompt":"What is in this picture?",
+  "stream": false,
+  "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jD
SsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKP
qsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llava",
+  "created_at": "2023-11-03T15:36:02.583064Z",
+  "response": "A happy cartoon character, which is cute and cheerful.",
+  "done": true,
+  "context": [1, 2, 3],
+  "total_duration": 2938432250,
+  "load_duration": 2559292,
+  "prompt_eval_count": 1,
+  "prompt_eval_duration": 2195557000,
+  "eval_count": 44,
+  "eval_duration": 736432000
+}
+```
+
+#### Request (Raw Mode)
+
+In some cases, you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable templating. Also note that raw mode will not return a context.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "mistral",
+  "prompt": "[INST] why is the sky blue? [/INST]",
+  "raw": true,
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "mistral",
+  "created_at": "2023-11-03T15:36:02.583064Z",
+  "response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
+  "done": true,
+  "total_duration": 8493852375,
+  "load_duration": 6589624375,
+  "prompt_eval_count": 14,
+  "prompt_eval_duration": 119039000,
+  "eval_count": 110,
+  "eval_duration": 1779061000
+}
+```
+
+#### Generate request (With options)

 If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override.

+##### Request
+
 ```shell
 curl http://localhost:11434/api/generate -d '{
  "model": "llama2",
@@ -249,14 +293,13 @@ curl http://localhost:11434/api/generate -d '{
    "penalize_newline": true,
    "stop": ["\n", "user:"],
    "numa": false,
-    "num_ctx": 4,
+    "num_ctx": 1024,
    "num_batch": 2,
    "num_gqa": 1,
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
    "f16_kv": true,
-    "logits_all": false,
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
@@ -264,27 +307,264 @@ curl http://localhost:11434/api/generate -d '{
    "rope_frequency_base": 1.1,
    "rope_frequency_scale": 0.8,
    "num_thread": 8
-    }
+  }
 }'
 ```

-#### Response
+##### Response

 ```json
 {
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
-  "context": [1, 2, 3],
  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 13,
-  "eval_duration": 1325948000
+  "context": [1, 2, 3],
+  "total_duration": 4935886791,
+  "load_duration": 534986708,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 107345000,
+  "eval_count": 237,
+  "eval_duration": 4289432000
+}
+```
+
+#### Load a model
+
+If an empty prompt is provided, the model will be loaded into memory.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama2"
+}'
+```
+
+##### Response
+
+A single JSON object is returned:
+
+```json
+{
+  "model": "llama2",
+  "created_at": "2023-12-18T19:52:07.071755Z",
+  "response": "",
+  "done": true
+}
+```
+
+## Generate a chat completion
+
+```shell
+POST /api/chat
+```
+
+Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. Streaming can be disabled using `"stream": false`. The final response object will include statistics and additional data from the request.
+
+### Parameters
+
+- `model`: (required) the [model name](#model-names)
+- `messages`: the messages of the chat; this can be used to keep a chat memory
+
+The `message` object has the following fields:
+
+- `role`: the role of the message, either `system`, `user` or `assistant`
+- `content`: the content of the message
+- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
+
+Advanced parameters (optional):
+
+- `format`: the format to return a response in. Currently the only accepted value is `json`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
+- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
+
+### Examples
+
+#### Chat Request (Streaming)
+
+##### Request
+
+Send a chat message with a streaming response.
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "why is the sky blue?"
+    }
+  ]
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{
+  "model": "llama2",
+  "created_at": "2023-08-04T08:52:19.385406455-07:00",
+  "message": {
+    "role": "assistant",
+    "content": "The",
+    "images": null
+  },
+  "done": false
+}
+```
+
+Final response:
+
+```json
+{
+  "model": "llama2",
+  "created_at": "2023-08-04T19:22:45.499127Z",
+  "done": true,
+  "total_duration": 4883583458,
+  "load_duration": 1334875,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 342546000,
+  "eval_count": 282,
+  "eval_duration": 4535599000
+}
+```
+
+#### Chat request (No streaming)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "why is the sky blue?"
+    }
+  ],
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "registry.ollama.ai/library/llama2:latest",
+  "created_at": "2023-12-12T14:13:43.416799Z",
+  "message": {
+    "role": "assistant",
+    "content": "Hello! How are you today?"
+  },
+  "done": true,
+  "total_duration": 5191566416,
+  "load_duration": 2154458,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 383809000,
+  "eval_count": 298,
+  "eval_duration": 4799921000
+}
+```
+
+#### Chat request (With History)
+
+Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "why is the sky blue?"
+    },
+    {
+      "role": "assistant",
+      "content": "due to rayleigh scattering."
+    },
+    {
+      "role": "user",
+      "content": "how is that different than mie scattering?"
+    }
+  ]
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{
+  "model": "llama2",
+  "created_at": "2023-08-04T08:52:19.385406455-07:00",
+  "message": {
+    "role": "assistant",
+    "content": "The"
+  },
+  "done": false
+}
+```
+
+Final response:
+
+```json
+{
+  "model": "llama2",
+  "created_at": "2023-08-04T19:22:45.499127Z",
+  "done": true,
+  "total_duration": 8113331500,
+  "load_duration": 6396458,
+  "prompt_eval_count": 61,
+  "prompt_eval_duration": 398801000,
+  "eval_count": 468,
+  "eval_duration": 7701267000
+}
+```
+
+#### Chat request (with images)
+
+##### Request
+
+Send a chat message that includes an image.
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llava",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is in this image?",
+      "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQn
S4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jr
UwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llava",
+  "created_at": "2023-12-13T22:42:50.203334Z",
+  "message": {
+    "role": "assistant",
+    "content": " The image features a cute, little pig with an angry facial expression. It's wearing a heart on its shirt and is waving in the air. This scene appears to be part of a drawing or sketching project.",
+    "images": null
+  },
+  "done": true,
+  "total_duration": 1668506709,
+  "load_duration": 1986209,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 359682000,
+  "eval_count": 83,
+  "eval_duration": 1303285000
 }
 ```

@@ -294,7 +574,7 @@ curl http://localhost:11434/api/generate -d '{
 POST /api/create
 ```

-Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `modelfile` to the content of the Modelfile rather than just set `path`. This is a requirement for remote create. Remote model creation should also create any file blobs, fields such as `FROM` and `ADAPTER`, explicitly with the server using [Create a Blob](#create-a-blob) and the value to the path indicated in the response.
+Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `modelfile` to the content of the Modelfile rather than just set `path`. This is a requirement for remote create. Remote model creation must also create any file blobs, fields such as `FROM` and `ADAPTER`, explicitly with the server using [Create a Blob](#create-a-blob) and the value to the path indicated in the response.

 ### Parameters

@@ -305,7 +585,11 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m

 ### Examples

-#### Request
+#### Create a new model
+
+Create a new model from a `Modelfile`.
+
+##### Request

 ```shell
 curl http://localhost:11434/api/create -d '{
@@ -314,14 +598,22 @@ curl http://localhost:11434/api/create -d '{
 }'
 ```

-#### Response
+##### Response

-A stream of JSON objects. When finished, `status` is `success`.
+A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`.

 ```json
-{
-  "status": "parsing modelfile"
-}
+{"status":"reading model metadata"}
+{"status":"creating system layer"}
+{"status":"using already created layer sha256:22f7f8ef5f4c791c1b03d7eb414399294764d7cc82c7e94aa81a1feb80a983a2"}
+{"status":"using already created layer sha256:8c17c2ebb0ea011be9981cc3922db8ca8fa61e828c5d3f44cb6ae342bf80460b"}
+{"status":"using already created layer sha256:7c23fb36d80141c4ab8cdbb61ee4790102ebd2bf7aeff414453177d4f2110e5d"}
+{"status":"using already created layer sha256:2e0493f67d0c8c9c68a8aeacdf6a38a2151cb3c4c1d42accf296e19810527988"}
+{"status":"using already created layer sha256:2759286baa875dc22de5394b4a925701b1896a7e3f8e53275c36f75a877a82c9"}
+{"status":"writing layer sha256:df30045fe90f0d750db82a058109cecd6d4de9c90a3d75b19c09e5f64580bb42"}
+{"status":"writing layer sha256:f18a68eb09bf925bb1b669490407c1b1251c5db98dc4d3d81f3088498ea55690"}
+{"status":"writing manifest"}
+{"status":"success"}
 ```

 ### Check if a Blob Exists
@@ -330,7 +622,7 @@ A stream of JSON objects. When finished, `status` is `success`.
 HEAD /api/blobs/:digest
 ```

-Check if a blob is known to the server.
+Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai.

 #### Query Parameters

@@ -354,7 +646,7 @@ Return 200 OK if the blob exists, 404 Not Found if it does not.
 POST /api/blobs/:digest
 ```

-Create a blob from a file. Returns the server file path.
+Create a blob from a file on the server. Returns the server file path.

 #### Query Parameters

@@ -370,7 +662,7 @@ curl -T model.bin -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf08

 ##### Response

-Return 201 Created if the blob was successfully created.
+Return 201 Created if the blob was successfully created, 400 Bad Request if the digest used is not expected.

 ## List Local Models

@@ -396,14 +688,30 @@ A single JSON object will be returned.
 {
  "models": [
    {
-      "name": "llama2",
-      "modified_at": "2023-08-02T17:02:23.713454393-07:00",
-      "size": 3791730596
+      "name": "codellama:13b",
+      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
+      "size": 7365960935,
+      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
+      "details": {
+        "format": "gguf",
+        "family": "llama",
+        "families": null,
+        "parameter_size": "13B",
+        "quantization_level": "Q4_0"
+      }
    },
    {
-      "name": "llama2:13b",
-      "modified_at": "2023-08-08T12:08:38.093596297-07:00",
-      "size": 7323310500
+      "name": "llama2:latest",
+      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
+      "size": 3825819519,
+      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
+      "details": {
+        "format": "gguf",
+        "family": "llama",
+        "families": null,
+        "parameter_size": "7B",
+        "quantization_level": "Q4_0"
+      }
    }
  ]
 }
@@ -415,7 +723,7 @@ A single JSON object will be returned.
 POST /api/show
 ```

-Show details about a model including modelfile, template, parameters, license, and system prompt.
+Show information about a model including details, modelfile, template, parameters, license, and system prompt.

 ### Parameters

@@ -435,10 +743,16 @@ curl http://localhost:11434/api/show -d '{

 ```json
 {
-  "license": "<contents of license block>",
-  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
-  "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
-  "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSSISTANT:\"",
+  "parameters": "num_ctx                        4096\nstop                           \u003c/s\u003e\nstop                           USER:\nstop                           ASSSISTANT:",
+  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: ",
+  "details": {
+    "format": "gguf",
+    "family": "llama",
+    "families": ["llama", "clip"],
+    "parameter_size": "7B",
+    "quantization_level": "Q4_0"
+  }
 }
 ```

@@ -463,7 +777,7 @@ curl http://localhost:11434/api/copy -d '{

 #### Response

-The only response is a 200 OK if successful.
+Returns a 200 OK if successful, or a 404 Not Found if the source model doesn't exist.

 ## Delete a Model

@@ -489,7 +803,7 @@ curl -X DELETE http://localhost:11434/api/delete -d '{

 #### Response

-If successful, the only response is a 200 OK.
+Returns a 200 OK if successful, 404 Not Found if the model to be deleted doesn't exist.

 ## Pull a Model

--- a/docs/development.md
+++ b/docs/development.md
@@ -1,20 +1,26 @@
 # Development

- Install cmake or (optionally, required tools for GPUs)
- run `go generate ./...`
- run `go build .`
-
 Install required tools:

 - cmake version 3.24 or higher
- go version 1.20 or higher
+- go version 1.21 or higher
 - gcc version 11.4.0 or higher

 ```bash
 brew install go cmake gcc
 ```

-Get the required libraries:
+Optionally enable debugging and more verbose logging:
+
+```bash
+# At build time
+export CGO_CFLAGS="-g"
+
+# At runtime
+export OLLAMA_DEBUG=1
+```
+
+Get the required libraries and build the native LLM code:

 ```bash
 go generate ./...
@@ -32,8 +38,100 @@ Now you can run `ollama`:
 ./ollama
 ```

-## Building on Linux with GPU support
+### Linux

- Install cmake and nvidia-cuda-toolkit
- run `go generate ./...`
- run `go build .`
+#### Linux CUDA (NVIDIA)
+
+*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+
+Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
+development and runtime packages. 
+
+Typically the build scripts will auto-detect CUDA, however, if your Linux distro
+or installation approach uses unusual paths, you can specify the location by
+specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
+libraries, and `CUDACXX` to the location of the nvcc compiler.
+
+Then generate dependencies:
+
+```
+go generate ./...
+```
+
+Then build the binary:
+
+```
+go build .
+```
+
+#### Linux ROCm (AMD)
+
+*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+
+Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
+
+Typically the build scripts will auto-detect ROCm, however, if your Linux distro
+or installation approach uses unusual paths, you can specify the location by
+specifying an environment variable `ROCM_PATH` to the location of the ROCm
+install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
+CLBlast install (typically `/usr/lib/cmake/CLBlast`).  You can also customize
+the AMD GPU targets by setting `AMDGPU_TARGETS` (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`).
+
+```
+go generate ./...
+```
+
+Then build the binary:
+
+```
+go build .
+```
+
+ROCm requires elevated privileges to access the GPU at runtime.  On most distros you can add your user account to the `render` group, or run as root.
+
+#### Advanced CPU Settings
+
+By default, running `go generate ./...` will compile a few different variations
+of the LLM library based on common CPU families and vector math capabilities,
+including a lowest-common-denominator which should run on almost any 64 bit CPU
+somewhat slowly.  At runtime, Ollama will auto-detect the optimal variation to
+load.  If you would like to build a CPU-based build customized for your
+processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
+like to use.  For example, to compile an optimized binary for an Intel i9-9880H,
+you might use:
+
+```
+OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
+go build .
+```
+
+#### Containerized Linux Build
+
+If you have Docker available, you can build Linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.  The resulting binary is placed in `./dist`.
+
+
+### Windows
+
+Note: The windows build for Ollama is still under development.
+
+Install required tools:
+
+- MSVC toolchain - C/C++ and cmake as minimal requirements
+- go version 1.21 or higher
+- MinGW (pick one variant) with GCC.
+  - <https://www.mingw-w64.org/>
+  - <https://www.msys2.org/>
+
+```powershell
+$env:CGO_ENABLED="1"
+
+go generate ./...
+
+go build .
+```
+
+#### Windows CUDA (NVIDIA)
+
+In addition to the common Windows development tools described above, install:
+
+- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -1,138 +1,91 @@
 # FAQ

+## How can I upgrade Ollama?
+
+To upgrade Ollama, run the installation process again. On the Mac, click the Ollama icon in the menubar and choose the restart option if an update is available.
+
 ## How can I view the logs?

-On macOS:
+Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.

-```
-cat ~/.ollama/logs/server.log
-```
+## How do I configure Ollama server?

-On Linux:
+Ollama server can be configured with environment variables.

-```
-journalctl -u ollama
-```
+### Setting environment variables on Mac

-If you're running `ollama serve` directly, the logs will be printed to the console.
+If Ollama is run as a macOS application, environment variables should be set using `launchctl`:
+
+1. For each environment variable, call `launchctl setenv`.
+
+    ```bash
+    launchctl setenv OLLAMA_HOST "0.0.0.0"
+    ```
+
+2. Restart the Ollama application.
+
+### Setting environment variables on Linux
+
+If Ollama is run as a systemd service, environment variables should be set using `systemctl`:
+
+1. Edit the systemd service by calling `systemctl edit ollama.service`. This will open an editor.
+
+2. For each environment variable, add a line `Environment` under section `[Service]`:
+
+    ```ini
+    [Service]
+    Environment="OLLAMA_HOST=0.0.0.0"
+    ```
+
+3. Save and exit.
+
+4. Reload `systemd` and restart Ollama:
+
+   ```bash
+   systemctl daemon-reload
+   systemctl restart ollama
+   ```

 ## How can I expose Ollama on my network?

-Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable.
+Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable.

-On macOS:
-
-```bash
-OLLAMA_HOST=0.0.0.0:11435 ollama serve
-```
-
-On Linux:
-
-Create a `systemd` drop-in directory and set `Environment=OLLAMA_HOST`
-
-```bash
-mkdir -p /etc/systemd/system/ollama.service.d
-echo '[Service]' >>/etc/systemd/system/ollama.service.d/environment.conf
-```
-
-```bash
-echo 'Environment="OLLAMA_HOST=0.0.0.0:11434"' >>/etc/systemd/system/ollama.service.d/environment.conf
-```
-
-Reload `systemd` and restart Ollama:
-
-```bash
-systemctl daemon-reload
-systemctl restart ollama
-```
+Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.

 ## How can I allow additional web origins to access Ollama?

-Ollama allows cross origin requests from `127.0.0.1` and `0.0.0.0` by default. Add additional origins with the `OLLAMA_ORIGINS` environment variable:
+Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.

-On macOS:
-
-```bash
-OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
-```
-
-On Linux:
-
-```bash
-echo 'Environment="OLLAMA_ORIGINS=http://129.168.1.1:*,https://example.com"' >>/etc/systemd/system/ollama.service.d/environment.conf
-```
-
-Reload `systemd` and restart Ollama:
-
-```bash
-systemctl daemon-reload
-systemctl restart ollama
-```
+Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.

 ## Where are models stored?

- macOS: Raw model data is stored under `~/.ollama/models`.
- Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`
+- macOS: `~/.ollama/models`.
+- Linux: `/usr/share/ollama/.ollama/models`

-Below the models directory you will find a structure similar to the following:
+### How do I set them to a different location?

-```shell
-.
-├── blobs
-└── manifests
-   └── registry.ollama.ai
-      ├── f0rodo
-      ├── library
-      ├── mattw
-      └── saikatkumardey
-```
+If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.

-There is a `manifests/registry.ollama.ai/namespace` path. In example above, the user has downloaded models from the official `library`, `f0rodo`, `mattw`, and `saikatkumardey` namespaces. Within each of those directories, you will find directories for each of the models downloaded. And in there you will find a file name representing each tag. Each tag file is the manifest for the model.  
-
-The manifest lists all the layers used in this model. You will see a `media type` for each layer, along with a digest. That digest corresponds with a file in the `models/blobs directory`.
-
-### How can I change where Ollama stores models?
-
-To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.
+Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.

 ## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?

-No. Anything you do with Ollama, such as generate a response from the model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.
+No, Ollama runs entirely locally, and conversation data will never leave your machine.

 ## How can I use Ollama in Visual Studio Code?

-There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. You can see the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.
+There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.

 ## How do I use Ollama behind a proxy?

-Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values.
-
-When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate.
-
-On macOS:
-
-```bash
-HTTPS_PROXY=http://proxy.example.com ollama serve
-```
-
-On Linux:
-
-```bash
-echo 'Environment="HTTPS_PROXY=https://proxy.example.com"' >>/etc/systemd/system/ollama.service.d/environment.conf
-```
-
-Reload `systemd` and restart Ollama:
-
-```bash
-systemctl daemon-reload
-systemctl restart ollama
-```
+Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variable, ensure it is set where `ollama serve` can access the value. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.

 ### How do I use Ollama behind a proxy in Docker?

 The Ollama Docker container image can be configured to use a proxy by passing `-e HTTPS_PROXY=https://proxy.example.com` when starting the container.

-Alternatively, Docker daemon can be configured to use a proxy. Instructions are available for Docker Desktop on [macOS](https://docs.docker.com/desktop/settings/mac/#proxies), [Windows](https://docs.docker.com/desktop/settings/windows/#proxies), and [Linux](https://docs.docker.com/desktop/settings/linux/#proxies), and Docker [daemon with systemd](https://docs.docker.com/config/daemon/systemd/#httphttps-proxy).
+Alternatively, the Docker daemon can be configured to use a proxy. Instructions are available for Docker Desktop on [macOS](https://docs.docker.com/desktop/settings/mac/#proxies), [Windows](https://docs.docker.com/desktop/settings/windows/#proxies), and [Linux](https://docs.docker.com/desktop/settings/linux/#proxies), and Docker [daemon with systemd](https://docs.docker.com/config/daemon/systemd/#httphttps-proxy).

 Ensure the certificate is installed as a system certificate when using HTTPS. This may require a new Docker image when using a self-signed certificate.

@@ -154,3 +107,11 @@ docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-
 The Ollama Docker container can be configured with GPU acceleration in Linux or Windows (with WSL2). This requires the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit). See [ollama/ollama](https://hub.docker.com/r/ollama/ollama) for more details.

 GPU acceleration is not available for Docker Desktop in macOS due to the lack of GPU passthrough and emulation.
+
+## Why is networking slow in WSL2 on Windows 10?
+
+This can impact both installing Ollama and downloading models.
+
+Open `Control Panel > Network and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernet (WSL)` adapter, right click and select `Properties`.
+Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these
+properties.
--- a/docs/import.md
+++ b/docs/import.md
@@ -43,7 +43,6 @@ Ollama supports a set of model architectures, with support for more coming soon:

 - Llama & Mistral
 - Falcon & RW
- GPT-NeoX
 - BigCode

 To view a model's architecture, check the `config.json` file in its HuggingFace repo. You should see an entry under `architectures` (e.g. `LlamaForCausalLM`).
@@ -73,7 +72,7 @@ docker run --rm -v .:/model ollama/quantize -q q4_0 /model
 This will output two files into the directory:

 - `f16.bin`: the model converted to GGUF
- `q4_0.bin` the model quantized to a 4-bit quantization (we will use this file to create the Ollama model)
+- `q4_0.bin`: the model quantized to 4 bits (Ollama will use this file to create the Ollama model)

 ### Step 3: Write a `Modelfile`

@@ -149,6 +148,7 @@ The quantization options are as follow (from highest highest to lowest levels of
 - `q5_K_M`
 - `q6_K`
 - `q8_0`
+- `f16`

 ## Manually converting & quantizing models

@@ -184,9 +184,6 @@ python convert.py <path to model directory>
 # FalconForCausalLM
 python convert-falcon-hf-to-gguf.py <path to model directory>

-# GPTNeoXForCausalLM
-python convert-gptneox-hf-to-gguf.py <path to model directory>
-
 # GPTBigCodeForCausalLM
 python convert-starcoder-hf-to-gguf.py <path to model directory>
 ```
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -109,8 +109,9 @@ Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr
 sudo rm $(which ollama)
 ```

-Remove the downloaded models and Ollama service user:
+Remove the downloaded models and Ollama service user and group:
 ```bash
 sudo rm -r /usr/share/ollama
 sudo userdel ollama
+sudo groupdel ollama
 ```
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -1,6 +1,6 @@
 # Ollama Model File

-> Note: this `Modelfile` syntax is in development
+> Note: `Modelfile` syntax is in development

 A model file is the blueprint to create and share models with Ollama.

@@ -30,14 +30,14 @@ The format of the `Modelfile`:
 INSTRUCTION arguments
 ```

-| Instruction                         | Description                                                   |
-| ----------------------------------- | ------------------------------------------------------------- |
-| [`FROM`](#from-required) (required) | Defines the base model to use.                                |
-| [`PARAMETER`](#parameter)           | Sets the parameters for how Ollama will run the model.        |
-| [`TEMPLATE`](#template)             | The full prompt template to be sent to the model.             |
-| [`SYSTEM`](#system)                 | Specifies the system prompt that will be set in the template. |
-| [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.           |
-| [`LICENSE`](#license)               | Specifies the legal license.                                  |
+| Instruction                         | Description                                                    |
+| ----------------------------------- | -------------------------------------------------------------- |
+| [`FROM`](#from-required) (required) | Defines the base model to use.                                 |
+| [`PARAMETER`](#parameter)           | Sets the parameters for how Ollama will run the model.         |
+| [`TEMPLATE`](#template)             | The full prompt template to be sent to the model.              |
+| [`SYSTEM`](#system)                 | Specifies the system message that will be set in the template. |
+| [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.            |
+| [`LICENSE`](#license)               | Specifies the legal license.                                   |

 ## Examples

@@ -52,7 +52,7 @@ PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
 PARAMETER num_ctx 4096

-# sets a custom system prompt to specify the behavior of the chat assistant
+# sets a custom system message to specify the behavior of the chat assistant
 SYSTEM You are Mario from super mario bros, acting as an assistant.
 ```

@@ -70,12 +70,12 @@ More examples are available in the [examples directory](../examples).
 There are two ways to view `Modelfile`s underlying the models in [ollama.ai/library][1]:

 - Option 1: view a details page from a model's tags page:
-   1. Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
-   2. Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
-   3. Scroll down to "Layers"
+  1.  Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
+  2.  Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
+  3.  Scroll down to "Layers"
      - Note: if the [`FROM` instruction](#from-required) is not present,
        it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` like so:
+- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:

  ```bash
  > ollama show --modelfile llama2:13b
@@ -152,15 +152,16 @@ PARAMETER <parameter> <parametervalue>

 ### TEMPLATE

-`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system prompt and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
+`TEMPLATE` specifies the full prompt template to be passed into the model. It may include (optionally) a system message and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.

 #### Template Variables

-| Variable        | Description                                                                                                  |
-| --------------- | ------------------------------------------------------------------------------------------------------------ |
-| `{{ .System }}` | The system prompt used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
-| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input.                 |
-| `{{ .First }}`  | A boolean value used to render specific template information for the first generation of a session.          |
+| Variable          | Description                                                                                                   |
+| ----------------- | ------------------------------------------------------------------------------------------------------------- |
+| `{{ .System }}`   | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
+| `{{ .Prompt }}`   | The incoming prompt, this is not specified in the model file and will be set based on input.                  |
+| `{{ .Response }}` | The response from the LLM, if not specified response is appended to the end of the template.                  |
+| `{{ .First }}`    | A boolean value used to render specific template information for the first generation of a session.           |

 ```modelfile
 TEMPLATE """
@@ -180,7 +181,7 @@ SYSTEM """<system message>"""

 ### SYSTEM

-The `SYSTEM` instruction specifies the system prompt to be used in the template, if applicable.
+The `SYSTEM` instruction specifies the system message to be used in the template, if applicable.

 ```modelfile
 SYSTEM """<system message>"""
@@ -206,7 +207,7 @@ LICENSE """

 ## Notes

- the **`Modelfile` is not case sensitive**. In the examples, we use uppercase for instructions to make it easier to distinguish it from arguments.
- Instructions can be in any order. In the examples, we start with FROM instruction to keep it easily readable.
+- The **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish them from arguments.
+- Instructions can be in any order. In the examples, the `FROM` instruction is first to keep it easily readable.

 [1]: https://ollama.ai/library
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -0,0 +1,53 @@
+# How to troubleshoot issues
+
+Sometimes Ollama may not perform as expected. One of the best ways to figure out what happened is to take a look at the logs. Find the logs on Mac by running the command:
+
+```shell
+cat ~/.ollama/logs/server.log
+```
+
+On Linux systems with systemd, the logs can be found with this command:
+
+```shell
+journalctl -u ollama
+```
+
+If manually running `ollama serve` in a terminal, the logs will be on that terminal.
+
+Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
+
+## LLM libraries
+
+Ollama includes multiple LLM libraries compiled for different GPUs and CPU
+vector features.  Ollama tries to pick the best one based on the capabilities of
+your system.  If this autodetection has problems, or you run into other problems
+(e.g. crashes in your GPU) you can work around this by forcing a specific LLM
+library.  `cpu_avx2` will perform the best, followed by `cpu_avx`, and the slowest
+but most compatible is `cpu`.  Rosetta emulation under macOS will work with the
+`cpu` library.
+
+In the server log, you will see a message that looks something like this (varies
+from release to release):
+
+```
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+```
+
+**Experimental LLM Library Override**
+
+You can set `OLLAMA_LLM_LIBRARY` to any of the available LLM libraries to bypass
+autodetection, so for example, if you have a CUDA card, but want to force the
+CPU LLM library with AVX2 vector support, use:
+
+```
+OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
+```
+
+You can see what features your CPU has with the following:
+```
+cat /proc/cpuinfo| grep flags  | head -1
+```
+
+## Known issues
+
+* N/A
--- a/docs/tutorials/fly-gpu.md
+++ b/docs/tutorials/fly-gpu.md
@@ -0,0 +1,83 @@
+# Running Ollama on Fly.io GPU Instances
+
+Ollama runs with little to no configuration on [Fly.io GPU instances](https://fly.io/docs/gpus/gpu-quickstart/). If you don't have access to GPUs yet, you'll need to [apply for access](https://fly.io/gpu/) on the waitlist. Once you're accepted, you'll get an email with instructions on how to get started.
+
+Create a new app with `fly apps create`:
+
+```bash
+fly apps create
+```
+
+Then create a `fly.toml` file in a new folder that looks like this:
+
+```toml
+app = "sparkling-violet-709"
+primary_region = "ord"
+vm.size = "a100-40gb" # see https://fly.io/docs/gpus/gpu-quickstart/ for more info
+
+[build]
+  image = "ollama/ollama"
+
+[http_service]
+  internal_port = 11434
+  force_https = false
+  auto_stop_machines = true
+  auto_start_machines = true
+  min_machines_running = 0
+  processes = ["app"]
+
+[mounts]
+  source = "models"
+  destination = "/root/.ollama"
+  initial_size = "100gb"
+```
+
+Then create a [new private IPv6 address](https://fly.io/docs/reference/private-networking/#flycast-private-load-balancing) for your app:
+
+```bash
+fly ips allocate-v6 --private
+```
+
+Then deploy your app:
+
+```bash
+fly deploy
+```
+
+And finally you can access it interactively with a new Fly.io Machine:
+
+```
+fly machine run -e OLLAMA_HOST=http://your-app-name.flycast --shell ollama/ollama
+```
+
+```bash
+$ ollama run openchat:7b-v3.5-fp16
+>>> How do I bake chocolate chip cookies?
+ To bake chocolate chip cookies, follow these steps:
+
+1. Preheat the oven to 375°F (190°C) and line a baking sheet with parchment paper or silicone baking mat.
+
+2. In a large bowl, mix together 1 cup of unsalted butter (softened), 3/4 cup granulated sugar, and 3/4
+cup packed brown sugar until light and fluffy.
+
+3. Add 2 large eggs, one at a time, to the butter mixture, beating well after each addition. Stir in 1
+teaspoon of pure vanilla extract.
+
+4. In a separate bowl, whisk together 2 cups all-purpose flour, 1/2 teaspoon baking soda, and 1/2 teaspoon
+salt. Gradually add the dry ingredients to the wet ingredients, stirring until just combined.
+
+5. Fold in 2 cups of chocolate chips (or chunks) into the dough.
+
+6. Drop rounded tablespoons of dough onto the prepared baking sheet, spacing them about 2 inches apart.
+
+7. Bake for 10-12 minutes, or until the edges are golden brown. The centers should still be slightly soft.
+
+8. Allow the cookies to cool on the baking sheet for a few minutes before transferring them to a wire rack
+to cool completely.
+
+Enjoy your homemade chocolate chip cookies!
+```
+
+When you set it up like this, it will automatically turn off when you're done using it. Then when you access it again, it will automatically turn back on. This is a great way to save money on GPU instances when you're not using them. If you want a persistent wake-on-use connection to your Ollama instance, you can set up a [connection to your Fly network using WireGuard](https://fly.io/docs/reference/private-networking/#discovering-apps-through-dns-on-a-wireguard-connection). Then you can access your Ollama instance at `http://your-app-name.flycast`.
+
+And that's it!
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@@ -42,12 +42,13 @@ text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
 all_splits = text_splitter.split_documents(data)
 ```

-It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. For now, we don't have embeddings built in to Ollama, though we will be adding that soon, so for now, we can use the GPT4All library for that. We will use ChromaDB in this example for a vector database. `pip install GPT4All chromadb`
+It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install chromadb`

 ```python
-from langchain.embeddings import GPT4AllEmbeddings
+from langchain.embeddings import OllamaEmbeddings
 from langchain.vectorstores import Chroma
-vectorstore = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings())
+oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="llama2")
+vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)
 ```

 Now let's ask a question from the document. **Who was Neleus, and who is in his family?** Neleus is a character in the Odyssey, and the answer can be found in our text.
--- a/examples/.gitignore
+++ b/examples/.gitignore
@@ -1,7 +1,10 @@
 node_modules
+bun.lockb
+.vscode
 # OSX
 .DS_STORE

+
 # Models
 models/

--- a/examples/golang-simplegenerate/main.go
+++ b/examples/golang-simplegenerate/main.go
@@ -18,6 +18,8 @@ func main() {
 		os.Exit(1)
 	}

+	defer resp.Body.Close()
+	
 	responseData, err := io.ReadAll(resp.Body)
 	if err != nil {
 		log.Fatal(err)
--- a/examples/kubernetes/gpu.yaml
+++ b/examples/kubernetes/gpu.yaml
@@ -25,9 +25,11 @@ spec:
        image: ollama/ollama:latest
        env:
        - name: PATH
-          value: /usr/local/nvidia/bin:/usr/local/nvidia/lib64:/usr/bin:/usr/sbin:/bin:/sbin
+          value: /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
        - name: LD_LIBRARY_PATH
-          value: /usr/local/nvidia/lib64
+          value: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
+        - name: NVIDIA_DRIVER_CAPABILITIES
+          value: compute,utility
        ports:
        - name: http
          containerPort: 11434
--- a/examples/langchain-python-rag-websummary/README.md
+++ b/examples/langchain-python-rag-websummary/README.md
@@ -1,15 +1,23 @@
 # LangChain Web Summarization

-This example summarizes a website
+This example summarizes the website, [https://ollama.ai/blog/run-llama2-uncensored-locally](https://ollama.ai/blog/run-llama2-uncensored-locally)

-## Setup
+## Running the Example

-```
-pip install -r requirements.txt
-```
+1. Ensure you have the `llama2` model installed:

-## Run
+   ```bash
+   ollama pull llama2
+   ```

-```
-python main.py
-```
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python main.py
+   ```
--- a/examples/langchain-python-rag-websummary/requirements.txt
+++ b/examples/langchain-python-rag-websummary/requirements.txt
@@ -1,2 +1 @@
 langchain==0.0.259
-bs4==0.0.1
--- a/examples/langchain-python-simple/README.md
+++ b/examples/langchain-python-simple/README.md
@@ -2,20 +2,23 @@

 This example is a basic "hello world" of using LangChain with Ollama.

-## Setup
+## Running the Example

-```
-pip install -r requirements.txt
-```
+1. Ensure you have the `llama2` model installed:

-## Run
+   ```bash
+   ollama pull llama2
+   ```

-```
-python main.py
-```
+2. Install the Python Requirements.

-Running this example will print the response for "hello":
+   ```bash
+   pip install -r requirements.txt
+   ```

-```
-Hello! It's nice to meet you. hopefully you are having a great day! Is there something I can help you with or would you like to chat?
-```
+3. Run the example:
+
+   ```bash
+   python main.py
+   ```
+  
--- a/examples/langchain-python-simple/main.py
+++ b/examples/langchain-python-simple/main.py
@@ -1,4 +1,6 @@
 from langchain.llms import Ollama
+
+input = input("What is your question?")
 llm = Ollama(model="llama2")
-res = llm.predict("hello")
+res = llm.predict(input)
 print (res)
--- a/examples/langchain-typescript-simple/README.md
+++ b/examples/langchain-typescript-simple/README.md
@@ -2,20 +2,22 @@

 This example is a basic "hello world" of using LangChain with Ollama using Node.js and Typescript.

-## Setup
+## Running the Example

-```shell
-npm install
-```
+1. Install the prerequisites:

-## Run
+   ```bash
+   npm install
+   ```

-```shell
-ts-node main.ts
-```
+2. Ensure the `mistral` model is available:

-Running this example will print the response for "hello":
+   ```bash
+   ollama pull mistral
+   ```

-```plaintext
-Hello! It's nice to meet you. hopefully you are having a great day! Is there something I can help you with or would you like to chat?
-```
+3. Run the example:
+
+   ```bash
+   npm start
+   ```
--- a/examples/langchain-typescript-simple/main.ts
+++ b/examples/langchain-typescript-simple/main.ts
@@ -1,15 +1,25 @@
-import { Ollama} from 'langchain/llms/ollama';
+import { Ollama } from 'langchain/llms/ollama';
+import * as readline from "readline";

 async function main() {
  const ollama = new Ollama({
    model: 'mistral'    
    // other parameters can be found at https://js.langchain.com/docs/api/llms_ollama/classes/Ollama
-  })
-  const stream = await ollama.stream("Hello");
+  });

-  for await (const chunk of stream) {
-    process.stdout.write(chunk);
-  }
+  const rl = readline.createInterface({
+    input: process.stdin,
+    output: process.stdout,
+  });
+
+  rl.question("What is your question: \n", async (user_input) => {
+    const stream = await ollama.stream(user_input);
+  
+    for await (const chunk of stream) {
+      process.stdout.write(chunk);
+    }
+    rl.close();
+  })
 }

 main();
--- a/examples/langchain-typescript-simple/package-lock.json
+++ b/examples/langchain-typescript-simple/package-lock.json
@@ -1,5 +1,5 @@
 {
-  "name": "with-langchain-typescript-simplegenerate",
+  "name": "langchain-typescript-simple",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
--- a/examples/langchain-typescript-simple/package.json
+++ b/examples/langchain-typescript-simple/package.json
@@ -1,8 +1,13 @@
 {
+  "scripts": {
+    "start": "tsx main.ts"
+  },
  "devDependencies": {
-    "typescript": "^5.2.2"
+    "tsx": "^4.6.2",
+    "typescript": "^5.3.3"
  },
  "dependencies": {
-    "langchain": "^0.0.165"
+    "langchain": "^0.0.165",
+    "readline": "^1.3.0"
  }
 }
--- a/examples/modelfile-10tweets/Modelfile
+++ b/examples/modelfile-10tweets/Modelfile
@@ -1,7 +0,0 @@
-# Modelfile for creating a list of ten tweets from a topic
-# Run `ollama create 10tweets -f ./Modelfile` and then `ollama run 10tweets` and enter a topic
-
-FROM llama2
-SYSTEM """
-You are a content marketer who needs to come up with 10 short but succinct tweets. The answer should be a list of ten tweets. Each tweet can have a maximum of 280 characters and should include hashtags. Each user input will be a subject and you should expand it in ten creative ways. Never stop after just one tweet. Always include ten. 
-"""
--- a/examples/modelfile-10tweets/README.md
+++ b/examples/modelfile-10tweets/README.md
@@ -1,23 +0,0 @@
-# Ten Tweets Modelfile
-
-This is a simple modelfile that generates ten tweets based off any topic.
-
-```bash
-ollama create tentweets
-
-ollama run tentweets
->>> underwater basketweaving
- Great! Here are ten creative tweets about underwater basketweaving:
-
-1. "Just discovered the ultimate stress-reliever: Underwater basketweaving! 🌊🧵 #UnderwaterBasketweaving #StressRelief"
-2. "Who needs meditation when you can do underwater basketweaving? 😴👀 #PeacefulDistraction #UnderwaterBasketweaving"
-3. "Just spent an hour in the pool and still managed to knot my basket. Goal: untangle it before next session. 💪🏽 #ChallengeAccepted #UnderwaterBasketweaving"
-4. "When life gives you lemons, make underwater basketweaving! 🍋🧵 #LemonadeLife #UnderwaterBasketweaving"
-5. "Just realized my underwater basketweaving skills could come in handy during a zombie apocalypse. 😂🧡 #SurvivalTips #UnderwaterBasketweaving"
-6. "I'm not lazy, I'm just conserving energy for my next underwater basketweaving session. 😴💤 #LazyDay #UnderwaterBasketweaving"
-7. "Just found my inner peace while doing underwater basketweaving. It's like meditation, but with knots! 🙏🧵 #Mindfulness #UnderwaterBasketweaving"
-8. "Why study for exams when you can do underwater basketweaving and forget all your worries? 😜🧵 #ProcrastinationStation #UnderwaterBasketweaving"
-9. "Just had to cut my underwater basketweaving session short due to a sudden urge to breathe. 🤯🌊 #AquaticAdventures #UnderwaterBasketweaving"
-10. "I'm not sure what's more impressive: my underwater basketweaving skills or the fact that I didn't drown trying to make this tweet. 😅🧵 #Accomplishment 
-#UnderwaterBasketweaving"
-```
--- a/examples/modelfile-devopsengineer/Modelfile
+++ b/examples/modelfile-devopsengineer/Modelfile
@@ -1,8 +0,0 @@
-# Modelfile for creating a devops engineer assistant
-# Run `ollama create devops-engineer -f ./Modelfile` and then `ollama run devops-engineer` and enter a topic
-
-FROM mistral
-PARAMETER temperature 1
-SYSTEM """
-You are a senior devops engineer, acting as an assistant. You offer help with cloud technologies like: Terraform, AWS, kubernetes, python. You answer with code examples when possible
-"""
--- a/examples/modelfile-devopsengineer/README.md
+++ b/examples/modelfile-devopsengineer/README.md
@@ -1,40 +0,0 @@
-# DevOps Engineer Modelfile
-
-This is a simple modelfile that takes will respond to any questions that a senior DevOps engineer should know how to solve.
-
-````text
-ollama create devopsengineer
-
-ollama run devopsengineer
->>> how can i proxy all connections from inside the office through a single server that caches large files. Provide a config file for nginx to achieve this?
-
-Here is an example configuration file for Nginx that can be used to proxy all connections from inside the office network to a centralized cache server:
-```perl
-http {
-    upstream cache_server {
-        server <cache_server_ip>;
-        server <cache_server_ip>;
-        server <cache_server_ip>;
-    }
-    
-    server {
-        listen 80;
-        server_name localhost;
-        
-        location / {
-            proxy_pass http://cache_server;
-            proxy_set_header Host $host;
-            proxy_set_header X-Real-IP $remote_addr;
-            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-        }
-    }
-}
-```
-In this configuration, we define an `upstream` block that lists the IP addresses of the cache servers. We then define a server block that listens on port 80 and routes all 
-traffic to the `/` location to the cache servers using the `proxy_pass` directive. The `proxy_set_header` directives are used to preserve the source IP address of the client
-request when forwarding it to the cache server.
-
-To use this configuration, you would need to replace the placeholder `<cache_server_ip>` with the actual IP addresses of your cache servers. You would also need to make sure
-that the cache servers are configured to accept incoming connections from the Nginx server and handle requests for files.
-
-````
--- a/examples/modelfile-midjourney/Modelfile
+++ b/examples/modelfile-midjourney/Modelfile
@@ -1,11 +0,0 @@
-# Modelfile for creating a Midjourney prompts from a topic
-# This prompt was adapted from the original at https://www.greataiprompts.com/guide/midjourney/best-chatgpt-prompt-for-midjourney/
-# Run `ollama create mj -f ./Modelfile` and then `ollama run mj` and enter a topic
-
-FROM zephyr
-PARAMETER temperature 0.8
-PARAMETER top_k 500
-PARAMETER top_p 0.9
-SYSTEM """
-Embrace your role as a creative illustrator. Based on a concept provided, you must produce a single paragraph with a multifaceted description of an image, ensuring significant details of the concept and more is represented in your instructions. You do not need to write complete sentences but rather short concepts with the following information: the level of detail that should be represented, an artistic style and maybe a specific name of a painter or illustrator, the ideal color pallete, lighting, mood, perspective, the setting, time of day, weather, the season, the time period, location, materials, the textures, patterns, lines, brushstrokes, techniques, the medium, the genre, the rendering style. Don't include everything and keep the description length under 250 words. 
-"""
--- a/examples/modelfile-midjourney/README.md
+++ b/examples/modelfile-midjourney/README.md
@@ -1,11 +0,0 @@
-# Midjourney Prompt Generator Modelfile
-
-This simple modelfile will help create a prompt to feed to Midjourney.
-
-```text
-ollama create midjourney
-
-ollama run midjourney
->>> a sports car in the mountains. 
-A sleek, high-performance automobile cuts through a serpentine mountain landscape. The concept is a classic illustration of speed and power, depicted in the style of pop art by Andy Warhol. The color palette is dominated by bold, primary hues of red, blue, and yellow, with striking accent colors of white, black, and metallic shades. The lighting is bright and focused, casting sharp shadows on the rugged terrain. A sense of excitement and anticipation permeates throughout the scene, as the car navigates a treacherous course through the winding road. The perspective is low, allowing for a full view of the vehicle's sleek lines and intricate details. The setting takes place in the afternoon during a sunny day in autumn, as evidenced by the vibrant foliage on the mountainside. The time period is modern, with nods to classic car design. The materials are primarily digital, allowing for smooth curves and sharp contrasts. The textures are sleek and polished, with meticulously detailed lines and brushstrokes that accentuate the car's aerodynamic design. The patterns consist of geometric shapes and bold stripes, adding to the car's dynamic appeal. The genre is modern realism, with a focus on precision and detail. The rendering style is highly technical, capturing the nuances and subtleties of the vehicle and its surroundings in breathtaking detail.
-```
--- a/examples/modelfile-recipemaker/Modelfile
+++ b/examples/modelfile-recipemaker/Modelfile
@@ -1,6 +0,0 @@
-# Modelfile for creating a recipe from a list of ingredients
-# Run `ollama create recipemaker -f ./Modelfile` and then `ollama run recipemaker` and feed it lists of ingredients to create recipes around.
-FROM nous-hermes
-SYSTEM """
-The instruction will be a list of ingredients. You should generate a recipe that can be made in less than an hour. You can also include ingredients that most people will find in their pantry every day. The recipe should be 4 people and you should include a description of what the meal will taste like
-"""
--- a/examples/modelfile-recipemaker/README.md
+++ b/examples/modelfile-recipemaker/README.md
@@ -1,20 +0,0 @@
-# Recipe Maker Modelfile 
-
-Simple modelfile to generate a recipe from a short list of ingredients.
-
-```
-ollama create recipemaker
-
-ollama run recipemaker
->>> chilli pepper, white chocolate, kale
- Ingredients:
- 1 small chili pepper
- 4 squares of white chocolate
- handful of kale leaves
-
-Instructions:
-1. In a blender or food processor, puree the chilies and white chocolate until smooth.
-2. Add the chopped kale leaves to the blender and pulse until well combined.
-3. Serve immediately as a dip for crackers or use it as an ingredient in your favorite recipe. The mixture of spicy chili pepper with sweet white chocolate and nutritious 
-kale will make your taste buds dance with delight!
-```
--- a/examples/modelfile-sentiments/Modelfile
+++ b/examples/modelfile-sentiments/Modelfile
@@ -1,28 +0,0 @@
-# Modelfile for creating a sentiment analyzer. 
-# Run `ollama create sentiments -f pathtofile` and then `ollama run sentiments` and enter a topic
-
-FROM orca
-TEMPLATE """
-{{- if .First }}
-### System:
-{{ .System }}
-{{- end }}
-### User: 
-I hate it when my phone dies
-### Response: 
-NEGATIVE
-### User: 
-He is awesome
-### Response: 
-POSITIVE
-### User: 
-This is the link to the article
-### Response: 
-NEUTRAL
-### User:
-{{ .Prompt }}
-
-### Response:
-"""
-
-SYSTEM """You are a sentiment analyzer. You will receive text and output only one word, either POSITIVE or NEGATIVE or NEUTRAL, depending on the sentiment of the text."""
--- a/examples/modelfile-sentiments/Readme.md
+++ b/examples/modelfile-sentiments/Readme.md
@@ -1,25 +0,0 @@
-# Sentiments Modelfile
-
-This is a simple sentiments analyzer using the Orca model. When you pull Orca from the registry, it has a Template already defined that looks like this:
-
-```Modelfile
-{{- if .First }}
-### System:
-{{ .System }}
-{{- end }}
-
-### User:
-{{ .Prompt }}
-
-### Response:
-```
-
-If we just wanted to have the text:
-
-```Plaintext
-You are a sentiment analyzer. You will receive text and output only one word, either POSITIVE or NEGATIVE or NEUTRAL, depending on the sentiment of the text.
-```
-
-then we could have put this in a SYSTEM block. But we want to provide examples which require updating the full Template. Any Modelfile you create will inherit all the settings from the source model. But in this example, we are overriding the Template.
-
-When providing examples for the input and output, you should include the way the model usually provides information. Since the Orca model expects a user prompt to appear after ### User: and the response is after ### Response, we should format our examples like that as well. If we were using the Llama 2 model, the format would be a bit different.
--- a/examples/modelfile-tweetwriter/Modelfile
+++ b/examples/modelfile-tweetwriter/Modelfile
@@ -1,7 +0,0 @@
-# Modelfile for creating a tweet from a topic
-# Run `ollama create tweetwriter -f ./Modelfile` and then `ollama run tweetwriter` and enter a topic
-
-FROM nous-hermes
-SYSTEM """
-You are a content marketer who needs to come up with a short but succinct tweet. Make sure to include the appropriate hashtags and links. Sometimes when appropriate, describe a meme that can be included as well. All answers should be in the form of a tweet which has a max size of 280 characters. Every instruction will be the topic to create a tweet about.
-"""
--- a/examples/modelfile-tweetwriter/readme.md
+++ b/examples/modelfile-tweetwriter/readme.md
@@ -0,0 +1,23 @@
+# Example Modelfile - Tweetwriter
+
+This simple example shows what you can do without any code, simply relying on a Modelfile. The file has two instructions:
+
+1. FROM - The FROM instruction defines the parent model to use for this one. If you choose a model from the library, you can enter just the model name. For all other models, you need to specify the namespace as well. You could also use a local file. Just include the relative path to the converted, quantized model weights file. To learn more about creating that file, see the `import.md` file in the docs folder of this repository.
+2. SYSTEM - This defines the system prompt for the model and overrides the system prompt from the parent model.
+
+## Running the Example
+
+1. Create the model:
+
+   ```bash
+   ollama create tweetwriter
+   ```
+
+2. Enter a topic to generate a tweet about.
+3. Show the Modelfile in the REPL.
+
+   ```bash
+   /show modelfile
+   ```
+
+   Notice that the FROM and SYSTEM match what was in the file. But there is also a TEMPLATE and PARAMETER. These are inherited from the parent model.
--- a/examples/python-dockerit/README.md
+++ b/examples/python-dockerit/README.md
@@ -1,15 +1,31 @@
 # DockerIt

-DockerIt is a tool to help you build and run your application in a Docker container. It consists of a model that defines the system prompt and model weights to use, along with a python script to then build the container and run the image automatically. 
+DockerIt is a tool to help you build and run your application in a Docker container. It consists of a model that defines the system prompt and model weights to use, along with a python script to then build the container and run the image automatically.
+
+## Running the Example
+
+1. Ensure you have the `mattw/dockerit` model installed:
+
+   ```bash
+   ollama pull mattw/dockerit
+   ```
+
+2. Make sure Docker is running on your machine.
+
+3. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+4. Run the example:
+
+   ```bash
+   python dockerit.py "simple postgres server with admin password set to 123"
+   ```
+
+5. Enter the name you would like to use for your container image.

 ## Caveats

-This is an simple example. It's assuming the Dockerfile content generated is going to work. In many cases, even with simple web servers, it fails when trying to copy files that don't exist. It's simply an example of what you could possibly do.
-
-## Example Usage
-
-```bash
-> python3 ./dockerit.py "simple postgres server with admin password set to 123"
-Enter the name of the image: matttest
-Container named happy_keller  started with id:  7c201bb6c30f02b356ddbc8e2a5af9d7d7d7b8c228519c9a501d15c0bd9d6b3e
-```
+This is a simple example. It's assuming the Dockerfile content generated is going to work. In many cases, even with simple web servers, it fails when trying to copy files that don't exist. It's simply an example of what you could possibly do.
--- a/examples/python-json-datagenerator/readme.md
+++ b/examples/python-json-datagenerator/readme.md
@@ -4,6 +4,32 @@

 There are two python scripts in this example. `randomaddresses.py` generates random addresses from different countries. `predefinedschema.py` sets a template for the model to fill in.

+## Running the Example
+
+1. Ensure you have the `llama2` model installed:
+
+   ```bash
+   ollama pull llama2
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the Random Addresses example:
+
+   ```bash
+   python randomaddresses.py
+   ```
+
+4. Run the Predefined Schema example:
+
+   ```bash
+   python predefinedschema.py
+   ```
+
 ## Review the Code

 Both programs are basically the same, with a different prompt for each, demonstrating two different ideas. The key part of getting JSON out of a model is to state in the prompt or system prompt that it should respond using JSON, and specifying the `format` as `json` in the data body.
--- a/examples/python-loganalysis/loganalysis.py
+++ b/examples/python-loganalysis/loganalysis.py
@@ -16,12 +16,12 @@ def find_errors_in_log_file():
  with open(log_file_path, 'r') as log_file:
    log_lines = log_file.readlines()

-error_logs = []
-    for i, line in enumerate(log_lines):
-        if "error" in line.lower():
-            start_index = max(0, i - prelines)
-            end_index = min(len(log_lines), i + postlines + 1)
-            error_logs.extend(log_lines[start_index:end_index])
+  error_logs = []
+  for i, line in enumerate(log_lines):
+      if "error" in line.lower():
+          start_index = max(0, i - prelines)
+          end_index = min(len(log_lines), i + postlines + 1)
+          error_logs.extend(log_lines[start_index:end_index])

  return error_logs

@@ -32,7 +32,6 @@ data = {
  "model": "mattw/loganalyzer"
 }

-
 response = requests.post("http://localhost:11434/api/generate", json=data, stream=True)
 for line in response.iter_lines():
  if line:
--- a/examples/python-loganalysis/readme.md
+++ b/examples/python-loganalysis/readme.md
@@ -2,12 +2,34 @@

 ![loganalyzer 2023-11-10 08_53_29](https://github.com/jmorganca/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)

-This example shows one possible way to create a log file analyzer. To use it, run:
+This example shows one possible way to create a log file analyzer. It uses the model **mattw/loganalyzer** which is based on **codebooga**, a 34b parameter model.
+
+To use it, run:

 `python loganalysis.py <logfile>`

 You can try this with the `logtest.logfile` file included in this directory.

+## Running the Example
+
+1. Ensure you have the `mattw/loganalyzer` model installed:
+
+   ```bash
+   ollama pull mattw/loganalyzer
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python loganalysis.py logtest.logfile
+   ```
+
 ## Review the code

 The first part of this example is a Modelfile that takes `codebooga` and applies a new System Prompt:
@@ -45,4 +67,4 @@ for line in response.iter_lines():

 There is a lot more that can be done here. This is a simple way to detect errors, looking for the word error. Perhaps it would be interesting to find anomalous activity in the logs. It could be interesting to create embeddings for each line and compare them, looking for similar lines. Or look into applying Levenshtein Distance algorithms to find similar lines to help identify the anomalous lines.

-Also try different models and different prompts to analyze the data. You could consider adding retrieval augmented generation (RAG) to this to help understand newer log formats.
+Try different models and different prompts to analyze the data. You could consider adding retrieval augmented generation (RAG) to this to help understand newer log formats.
--- a/examples/python-rag-newssummary/README.md
+++ b/examples/python-rag-newssummary/README.md
@@ -14,9 +14,22 @@ This example goes through a series of steps:

 This example lets you pick from a few different topic areas, then summarize the most recent x articles for that topic. It then creates chunks of sentences from each article and then generates embeddings for each of those chunks.

-You can run the example like this:
+## Running the Example

-```bash
-pip install -r requirements.txt
-python summ.py
-```
+1. Ensure you have the `mistral-openorca` model installed:
+
+   ```bash
+   ollama pull mistral-openorca
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python summ.py
+   ```
--- a/examples/python-simplechat/client.py
+++ b/examples/python-simplechat/client.py
@@ -0,0 +1,74 @@
+import json
+import requests
+
+# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
+model = "llama2"  # TODO: update this for whatever model you wish to use
+
+
+def chat(messages):
+    """Send the conversation to the chat endpoint and stream the reply.
+
+    Args:
+        messages: list of {"role": ..., "content": ...} dicts holding the
+            conversation so far.
+
+    Returns:
+        The assistant's reply as a message dict whose "content" is the full
+        text assembled from the streamed chunks.
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status.
+        Exception: if a streamed chunk carries an "error" field.
+    """
+    r = requests.post(
+        "http://localhost:11434/api/chat",
+        json={"model": model, "messages": messages, "stream": True},
+        # without stream=True requests buffers the whole body before returning,
+        # so nothing would be printed until generation had finished
+        stream=True,
+    )
+    r.raise_for_status()
+    # fallback role in case no chunk carries a message object
+    message = {"role": "assistant"}
+    output = ""
+
+    for line in r.iter_lines():
+        if not line:
+            # iter_lines can yield empty keep-alive lines, which are not JSON
+            continue
+        body = json.loads(line)
+        if "error" in body:
+            raise Exception(body["error"])
+        if body.get("done") is False:
+            # a chunk without a message object contributes no text
+            chunk = body.get("message") or {}
+            message.update(chunk)
+            content = chunk.get("content", "")
+            output += content
+            # the response streams one token at a time, print that as we receive it
+            print(content, end="", flush=True)
+
+        if body.get("done", False):
+            break
+
+    # the final JSON object doesn't carry the full text, so return what was collected
+    return {**message, "content": output}
+
+
+def main():
+    """Read prompts until an empty line, sending the whole history on every turn."""
+    messages = []
+
+    while True:
+        user_input = input("Enter a prompt: ")
+        if not user_input:
+            exit()
+        print()
+        messages.append({"role": "user", "content": user_input})
+        message = chat(messages)
+        messages.append(message)
+        print("\n\n")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/python-simplechat/readme.md
+++ b/examples/python-simplechat/readme.md
@@ -0,0 +1,44 @@
+# Simple Chat Example
+
+The **chat** endpoint is one of two ways to generate text from an LLM with Ollama, and was introduced in version 0.1.14. At a high level, you provide the endpoint an array of objects with a role and content specified. Then with each output and prompt, you add more of those role/content objects, which builds up the history.
+
+## Running the Example
+
+1. Ensure you have the `llama2` model installed:
+
+   ```bash
+   ollama pull llama2
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python client.py
+   ```
+
+## Review the Code
+
+You can see in the **chat** function that actually calling the endpoint is done simply with:
+
+```python
+r = requests.post(
+  "http://localhost:11434/api/chat",
+  json={"model": model, "messages": messages, "stream": True},
+)
+```
+
+With the **generate** endpoint, you need to provide a `prompt`. But with **chat**, you provide `messages`. And the resulting stream of responses includes a `message` object with a `content` field.
+
+The final JSON object doesn't provide the full content, so you will need to build the content yourself.
+
+In the **main** function, we collect `user_input` and add it as a message to our messages and that is passed to the chat function. When the LLM is done responding the output is added as another message.
+
+## Next Steps
+
+In this example, all generations are kept. You might want to experiment with summarizing everything older than 10 conversations to enable longer history with less context being used.
--- a/examples/python-simplechat/requirements.txt
+++ b/examples/python-simplechat/requirements.txt
@@ -0,0 +1 @@
+Requests==2.31.0
--- a/examples/python-simplegenerate/README.md
+++ b/examples/python-simplegenerate/README.md
@@ -0,0 +1,29 @@
+# Simple Generate Example
+
+This is a simple example using the **Generate** endpoint.
+
+## Running the Example
+
+1. Ensure you have the `stablelm-zephyr` model installed:
+
+   ```bash
+   ollama pull stablelm-zephyr
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python client.py
+   ```
+
+## Review the Code
+
+The **main** function simply asks for input, then passes that to the generate function. The output from generate is then passed back to generate on the next run.
+
+The **generate** function uses `requests.post` to call `/api/generate`, passing the model, prompt, and context. The `generate` endpoint returns a stream of JSON blobs that are then iterated through, looking for the response values. That is then printed out. The final JSON object includes the full context of the conversation so far, and that is the return value from the function.
--- a/examples/python-simplegenerate/client.py
+++ b/examples/python-simplegenerate/client.py
@@ -2,7 +2,7 @@ import json
 import requests

 # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = 'llama2' # TODO: update this for whatever model you wish to use
+model = 'stablelm-zephyr' # TODO: update this for whatever model you wish to use

 def generate(prompt, context):
    r = requests.post('http://localhost:11434/api/generate',
@@ -30,6 +30,8 @@ def main():
    context = [] # the context stores a conversation history, you can use this to make the model more context aware
    while True:
        user_input = input("Enter a prompt: ")
+        if not user_input:
+            exit()
        print()
        context = generate(user_input, context)
        print()
--- a/examples/python-simplegenerate/requirements.txt
+++ b/examples/python-simplegenerate/requirements.txt
@@ -0,0 +1 @@
+Requests==2.31.0
--- a/examples/typescript-mentors/README.md
+++ b/examples/typescript-mentors/README.md
@@ -4,18 +4,62 @@ This example demonstrates how one would create a set of 'mentors' you can have a

 ## Usage

-```bash
-ts-node ./character-generator.ts "Lorne Greene"
-```
+1. Add llama2 to have the mentors answer your questions:

-This will create `lornegreene/Modelfile`. Now you can create a model with this command:
+   ```bash
+   ollama pull llama2
+   ```

-```bash
-ollama create lornegreene -f lornegreene/Modelfile
-```
+2. Install prerequisites:

-If you want to add your own mentors, you will have to update the code to look at your namespace instead of **mattw**. Also set the list of mentors to include yours.
+   ```bash
+   npm install
+   ```

-```bash
-ts-node ./mentors.ts "What is a Jackalope?"
-```
+3. Ask a question:
+
+   ```bash
+   npm start "what is a jackalope"
+   ```
+
+You can also add your own character to be chosen at random when you ask a question.
+
+1. Make sure you have the right model installed:
+
+   ```bash
+   ollama pull stablebeluga2:70b-q4_K_M
+   ```
+  
+2. Create a new character:
+  
+   ```bash
+   npm run charactergen "Lorne Greene"
+   ```
+
+   You can choose any well-known person you like. This example will create `lornegreene/Modelfile`.
+
+3. Now you can create a model with this command:
+
+   ```bash
+   ollama create <YourNamespace>/lornegreene -f lornegreene/Modelfile
+   ```
+
+   `YourNamespace` is whatever name you set up when you signed up at [https://ollama.ai/signup](https://ollama.ai/signup).
+
+4. To add this to your mentors, you will have to update the code as follows. On line 8 of `mentors.ts`, add an object to the array, replacing `<YourNamespace>` with the namespace you used above.
+
+   ```typescript
+   {ns: "<YourNamespace>", char: "Lorne Greene"}
+   ```
+
+## Review the Code
+
+There are two scripts you can run in this example. The first is the main script to ask the mentors a question. The other one lets you generate a character to add to the mentors. Both scripts are mostly about adjusting the prompts at each inference stage.
+
+### mentors.ts
+
+In the **main** function, it starts by generating a list of mentors. This chooses 3 from a list of interesting characters. Then we ask for a question, and then things get interesting. We set the prompt for each of the 3 mentors a little differently. And the 2nd and 3rd mentors see what the previous folks said. The other functions in `mentors.ts` set the prompts for each mentor.
+
+### character-generator.ts
+
+**Character Generator** simply customizes the prompt to build a character profile for any famous person. And most of the script is just tweaking the prompt. This uses the 70b-parameter Stable Beluga 2 model. The 70b models tend to do better writing a bio about a character than smaller models, and Stable Beluga seemed to do better than Llama 2. Since this is used at development time for the characters, it doesn't affect the runtime of asking the mentors for their input.
--- a/examples/typescript-mentors/mentors.ts
+++ b/examples/typescript-mentors/mentors.ts
@@ -2,10 +2,11 @@ import { Ollama } from 'ollama-node';

 const mentorCount = 3;
 const ollama = new Ollama();
+type Mentor = { ns: string, char: string };

-function getMentors(): string[] {
-  const mentors = ['Gary Vaynerchuk', 'Kanye West', 'Martha Stewart', 'Neil deGrasse Tyson', 'Owen Wilson', 'Ronald Reagan', 'Donald Trump', 'Barack Obama', 'Jeff Bezos'];
-  const chosenMentors: string[] = [];
+function getMentors(): Mentor[] {
+  const mentors = [{ ns: 'mattw', char: 'Gary Vaynerchuk' }, { ns: 'mattw', char: 'Kanye West'}, {ns: 'mattw', char: 'Martha Stewart'}, {ns: 'mattw', char: 'Neil deGrasse Tyson'}, {ns: 'mattw', char: 'Owen Wilson'}, {ns: 'mattw', char: 'Ronald Reagan'}, {ns: 'mattw', char: 'Donald Trump'}, {ns: 'mattw', char: 'Barack Obama'}, {ns: 'mattw', char: 'Jeff Bezos'}];
+  const chosenMentors: Mentor[] = [];
  for (let i = 0; i < mentorCount; i++) {
    const mentor = mentors[Math.floor(Math.random() * mentors.length)];
    chosenMentors.push(mentor);
@@ -14,12 +15,12 @@ function getMentors(): string[] {
  return chosenMentors;
 }

-function getMentorFileName(mentor: string): string {
-  const model = mentor.toLowerCase().replace(/\s/g, '');
-  return `mattw/${model}`;
+function getMentorFileName(mentor: Mentor): string {
+  const model = mentor.char.toLowerCase().replace(/\s/g, '');
+  return `${mentor.ns}/${model}`;
 }

-async function getSystemPrompt(mentor: string, isLast: boolean, question: string): Promise<string> {
+async function getSystemPrompt(mentor: Mentor, isLast: boolean, question: string): Promise<string> {
  ollama.setModel(getMentorFileName(mentor));
  const info = await ollama.showModelInfo()
  let SystemPrompt = info.system || '';
@@ -43,8 +44,8 @@ async function main() {
    ollama.setModel(getMentorFileName(mentor));
    ollama.setSystemPrompt(SystemPrompt);
    let output = '';
-    process.stdout.write(`\n${mentor}: `);
-    for await (const chunk of ollama.streamingGenerate(theConversation + `Continue the conversation as if you were ${mentor} on the question "${question}".`)) {
+    process.stdout.write(`\n${mentor.char}: `);
+    for await (const chunk of ollama.streamingGenerate(theConversation + `Continue the conversation as if you were ${mentor.char} on the question "${question}".`)) {
      if (chunk.response) {
        output += chunk.response;
        process.stdout.write(chunk.response);
@@ -52,7 +53,7 @@ async function main() {
        process.stdout.write('\n');
      }
    }
-    theConversation += `${mentor}: ${output}\n\n`
+    theConversation += `${mentor.char}: ${output}\n\n`
  }
 }

--- a/examples/typescript-mentors/package.json
+++ b/examples/typescript-mentors/package.json
@@ -1,7 +1,15 @@
 {
+  "scripts": {
+    "charactergen": "tsx character-generator.ts", 
+    "start": "tsx mentors.ts"
+  },
  "dependencies": {
    "fs": "^0.0.1-security",
    "ollama-node": "^0.0.3",
    "path": "^0.12.7"
+  },
+  "devDependencies": {
+    "tsx": "^4.6.2",
+    "typescript": "^5.3.3"
  }
 }
--- a/examples/typescript-simplechat/client.ts
+++ b/examples/typescript-simplechat/client.ts
@@ -0,0 +1,119 @@
+import * as readline from "readline";
+
+const model = "llama2";
+type Message = {
+  role: "assistant" | "user" | "system";
+  content: string;
+}
+const messages: Message[] = [{
+  role: "system",
+  content: "You are a helpful AI agent."
+}]
+
+const rl = readline.createInterface({
+  input: process.stdin,
+  output: process.stdout
+})
+
+/**
+ * Sends the conversation to the chat endpoint and echoes the reply as it streams in.
+ *
+ * @param messages conversation so far, ending with the newest user message
+ * @returns the assistant's reply, assembled from the streamed chunks
+ * @throws if the request fails, the body cannot be read, or a chunk reports an error
+ */
+async function chat(messages: Message[]): Promise<Message> {
+  const body = {
+    model: model,
+    messages: messages
+  }
+
+  const response = await fetch("http://localhost:11434/api/chat", {
+    method: "POST",
+    body: JSON.stringify(body)
+  })
+  if (!response.ok) {
+    throw new Error(`Chat request failed (${response.status}): ${await response.text()}`)
+  }
+
+  const reader = response.body?.getReader()
+  if (!reader) {
+    throw new Error("Failed to read response body")
+  }
+
+  // One decoder for the whole stream, so a multi-byte character split across
+  // two network chunks is still decoded correctly.
+  const decoder = new TextDecoder()
+  let content = ""
+  let pending = ""
+
+  // The response is newline-delimited JSON: one object per line.
+  const handleLine = (line: string) => {
+    if (line.trim() === "") {
+      return
+    }
+    const json = JSON.parse(line)
+    if (json.error) {
+      throw new Error(json.error)
+    }
+    if (json.done === false) {
+      process.stdout.write(json.message.content);
+      content += json.message.content
+    }
+  }
+
+  while (true) {
+    const { done, value } = await reader.read()
+    if (done) {
+      break;
+    }
+    // A network chunk can hold several lines or stop mid-line, so only complete
+    // lines are parsed and the unfinished tail waits for the next chunk.
+    pending += decoder.decode(value, { stream: true })
+    const lines = pending.split("\n")
+    pending = lines.pop() ?? ""
+    lines.forEach(handleLine)
+  }
+  // Flush the decoder and parse whatever is left if the stream did not end with a newline.
+  handleLine(pending + decoder.decode())
+
+  return { role: "assistant", content: content };
+}
+
+/** Wraps rl.question in a promise that resolves with the line the user typed. */
+function ask(query: string): Promise<string> {
+  return new Promise<string>((resolve) => rl.question(query, resolve));
+}
+
+/**
+ * Keeps asking for questions and appending each exchange to `messages` until
+ * the user submits an empty line, then prints the collected history.
+ */
+async function askQuestion(): Promise<void> {
+  while (true) {
+    const user_input = await ask("\n\nAsk a question: (press enter alone to quit)\n\n");
+    if (user_input.trim() === "") {
+      rl.close();
+      console.log("Thankyou. Goodbye.\n")
+      console.log("=======\nHere is the message history that was used in this conversation.\n=======\n")
+      messages.forEach(message => {
+        console.log(message)
+      })
+      return;
+    }
+    console.log();
+    messages.push({ role: "user", content: user_input });
+    messages.push(await chat(messages));
+  }
+}
+
+async function main() {
+  await askQuestion();
+}
+
+// Report failures (for example, the server not running) instead of leaving an unhandled rejection.
+main().catch((err) => {
+  console.error(err);
+  rl.close();
+  process.exitCode = 1;
+});
--- a/examples/typescript-simplechat/package.json
+++ b/examples/typescript-simplechat/package.json
@@ -0,0 +1,12 @@
+{ 
+  "scripts": {
+    "start": "tsx client.ts"
+  }, 
+  "dependencies": {
+     "@types/node": "^20.10.4", 
+     "prompt-sync": "^4.2.0", 
+     "readline": "^1.3.0", 
+     "tsx": "^4.6.2", 
+     "typescript": "^5.3.3" 
+     } 
+    }
--- a/examples/typescript-simplechat/readme.md
+++ b/examples/typescript-simplechat/readme.md
@@ -0,0 +1,35 @@
+# Simple Chat Example
+
+The **chat** endpoint, available as of v0.1.14, is one of two ways to generate text from an LLM with Ollama. At a high level, you provide the endpoint an array of message objects with a role and content specified. Then with each output and prompt, you add more messages, which builds up the history.
+
+## Run the Example
+
+`npm start`
+
+## Review the Code
+
+You can see in the **chat** function that actually calling the endpoint is simply done with:
+
+```typescript
+const body = {
+  model: model,
+  messages: messages
+}
+
+const response = await fetch("http://localhost:11434/api/chat", {
+  method: "POST",
+  body: JSON.stringify(body)
+})
+```
+
+With the **generate** endpoint, you need to provide a `prompt`. But with **chat**, you provide `messages`. And the resulting stream of responses includes a `message` object with a `content` field.
+
+The final JSON object doesn't provide the full content, so you will need to build the content yourself. In this example, **chat** takes the full array of messages and outputs the resulting message from this call of the chat endpoint.
+
+In the **askQuestion** function, we collect `user_input` and add it as a message to our messages, and that is passed to the chat function. When the LLM is done responding, the output is added as another message to the messages array.
+
+At the end, you will see a printout of all the messages.
+
+## Next Steps
+
+In this example, all generations are kept. You might want to experiment with summarizing everything older than 10 conversations to enable longer history with less context being used.
--- a/go.mod
+++ b/go.mod
@@ -1,18 +1,22 @@
 module github.com/jmorganca/ollama

-go 1.20
+go 1.21

 require (
 	github.com/emirpasic/gods v1.18.1
 	github.com/gin-gonic/gin v1.9.1
-	github.com/mattn/go-runewidth v0.0.14
-	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
+	github.com/stretchr/testify v1.8.4
 	golang.org/x/sync v0.3.0
 )

-require github.com/rivo/uniseg v0.2.0 // indirect
+require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/mattn/go-runewidth v0.0.14 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/rivo/uniseg v0.2.0 // indirect
+)

 require (
 	github.com/bytedance/sonic v1.9.1 // indirect
@@ -41,7 +45,7 @@ require (
 	golang.org/x/crypto v0.14.0
 	golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
 	golang.org/x/net v0.17.0 // indirect
-	golang.org/x/sys v0.13.0 // indirect
+	golang.org/x/sys v0.13.0
 	golang.org/x/term v0.13.0
 	golang.org/x/text v0.13.0 // indirect
 	google.golang.org/protobuf v1.30.0 // indirect
--- a/go.sum
+++ b/go.sum
@@ -63,8 +63,6 @@ github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D
 github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
 github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
 github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
-github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
-github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -100,8 +98,9 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
-github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
 github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=
--- a/gpu/cpu_common.go
+++ b/gpu/cpu_common.go
@@ -0,0 +1,21 @@
+package gpu
+
+import (
+	"log/slog"
+
+	"golang.org/x/sys/cpu"
+)
+
+// GetCPUVariant reports the widest x86 vector extension flagged for the host
+// CPU: "avx2", "avx", or "" when neither flag is set.
+func GetCPUVariant() string {
+	switch {
+	case cpu.X86.HasAVX2:
+		slog.Info("CPU has AVX2")
+		return "avx2"
+	case cpu.X86.HasAVX:
+		slog.Info("CPU has AVX")
+		return "avx"
+	default:
+		// Lowest common denominator: no variant suffix
+		slog.Info("CPU does not have vector extensions")
+		return ""
+	}
+}
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -0,0 +1,301 @@
+//go:build linux || windows
+
+package gpu
+
+/*
+#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
+#cgo windows LDFLAGS: -lpthread
+
+#include "gpu_info.h"
+
+*/
+import "C"
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"sync"
+	"unsafe"
+)
+
+// handles groups the vendor management-library handles returned by the C
+// loaders. A nil field means that vendor's library was not loaded.
+type handles struct {
+	cuda *C.cuda_handle_t // set when LoadCUDAMgmt succeeds
+	rocm *C.rocm_handle_t // set when LoadROCMMgmt succeeds
+}
+
+var gpuMutex sync.Mutex
+var gpuHandles *handles = nil
+
+// With our current CUDA compile flags, 5.2 and older will not work properly
+const CudaComputeMajorMin = 6
+
+// Possible locations for the nvidia-ml library
+var CudaLinuxGlobs = []string{
+	"/usr/local/cuda/lib64/libnvidia-ml.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
+	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
+	"/usr/lib/wsl/lib/libnvidia-ml.so*",
+	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
+	"/opt/cuda/lib64/libnvidia-ml.so*",
+	"/usr/lib*/libnvidia-ml.so*",
+	"/usr/local/lib*/libnvidia-ml.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
+	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+
+	// TODO: are these stubs ever valid?
+	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
+}
+
+var CudaWindowsGlobs = []string{
+	"c:\\Windows\\System32\\nvml.dll",
+}
+
+var RocmLinuxGlobs = []string{
+	"/opt/rocm*/lib*/librocm_smi64.so*",
+}
+
+var RocmWindowsGlobs = []string{
+	"c:\\Windows\\System32\\rocm_smi64.dll",
+}
+
+// initGPUHandles populates gpuHandles by trying the CUDA management library
+// first and the ROCm one second; the first library that loads wins. On an
+// unsupported GOOS, or when nothing loads, both handles stay nil.
+//
+// Note: gpuMutex must already be held
+func initGPUHandles() {
+
+	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
+
+	gpuHandles = &handles{}
+	var (
+		cudaLibName, rocmLibName string
+		cudaGlobs, rocmGlobs     []string
+	)
+	// The glob tables are copied because FindGPULibs appends to the slice it is given.
+	switch runtime.GOOS {
+	case "windows":
+		cudaLibName = "nvml.dll"
+		cudaGlobs = append([]string{}, CudaWindowsGlobs...)
+		rocmLibName = "rocm_smi64.dll"
+		rocmGlobs = append([]string{}, RocmWindowsGlobs...)
+	case "linux":
+		cudaLibName = "libnvidia-ml.so"
+		cudaGlobs = append([]string{}, CudaLinuxGlobs...)
+		rocmLibName = "librocm_smi64.so"
+		rocmGlobs = append([]string{}, RocmLinuxGlobs...)
+	default:
+		return
+	}
+
+	slog.Info("Detecting GPU type")
+	if libs := FindGPULibs(cudaLibName, cudaGlobs); len(libs) > 0 {
+		if h := LoadCUDAMgmt(libs); h != nil {
+			slog.Info("Nvidia GPU detected")
+			gpuHandles.cuda = h
+			return
+		}
+	}
+
+	if libs := FindGPULibs(rocmLibName, rocmGlobs); len(libs) > 0 {
+		if h := LoadROCMMgmt(libs); h != nil {
+			slog.Info("Radeon GPU detected")
+			gpuHandles.rocm = h
+		}
+	}
+}
+
+// GetGPUInfo reports which compute library to use ("cuda", "rocm" or "cpu"),
+// an optional variant string, and the device count plus free/total memory
+// returned by the matching probe. The management library handles are
+// initialized on the first call, under gpuMutex.
+func GetGPUInfo() GpuInfo {
+	// TODO - consider exploring lspci (and equivalent on windows) to check for
+	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	if gpuHandles == nil {
+		initGPUHandles()
+	}
+
+	var memInfo C.mem_info_t
+	resp := GpuInfo{}
+	if gpuHandles.cuda != nil {
+		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+		if memInfo.err != nil {
+			// Error strings are allocated on the C side; log them, then free them here.
+			slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else {
+			// Verify minimum compute capability
+			var cc C.cuda_compute_capability_t
+			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
+			if cc.err != nil {
+				slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
+				C.free(unsafe.Pointer(cc.err))
+			} else if cc.major >= CudaComputeMajorMin {
+				slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				resp.Library = "cuda"
+			} else {
+				// Library stays empty, so the CPU fallback below takes over.
+				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+			}
+		}
+	} else if gpuHandles.rocm != nil {
+		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
+		if memInfo.err != nil {
+			slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else {
+			resp.Library = "rocm"
+			var version C.rocm_version_resp_t
+			C.rocm_get_version(*gpuHandles.rocm, &version)
+			// version.str holds the version on success and an error message otherwise.
+			verString := C.GoString(version.str)
+			if version.status == 0 {
+				resp.Variant = "v" + verString
+			} else {
+				slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
+			}
+			C.free(unsafe.Pointer(version.str))
+		}
+	}
+	// No usable GPU: overwrite memInfo with system RAM figures. cpu_check_ram
+	// resets memInfo.err, so a pointer freed above is never inspected again.
+	if resp.Library == "" {
+		C.cpu_check_ram(&memInfo)
+		resp.Library = "cpu"
+		resp.Variant = GetCPUVariant()
+	}
+	if memInfo.err != nil {
+		// The GPU paths only reach here with err == nil, so this is a CPU probe
+		// failure; the memory fields of resp are left at zero.
+		slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))
+		C.free(unsafe.Pointer(memInfo.err))
+		return resp
+	}
+
+	resp.DeviceCount = uint32(memInfo.count)
+	resp.FreeMemory = uint64(memInfo.free)
+	resp.TotalMemory = uint64(memInfo.total)
+	return resp
+}
+
+// getCPUMem returns the system's total and free RAM as reported by the C
+// helper cpu_check_ram, or an error carrying the helper's message.
+func getCPUMem() (memInfo, error) {
+	var ret memInfo
+	var info C.mem_info_t
+	C.cpu_check_ram(&info)
+	if info.err != nil {
+		// The message is allocated on the C side; free it once it has been copied.
+		defer C.free(unsafe.Pointer(info.err))
+		// Pass the message as an argument, not as the format string, so a
+		// literal '%' in an OS error text is not treated as a verb.
+		return ret, fmt.Errorf("%s", C.GoString(info.err))
+	}
+	ret.FreeMemory = uint64(info.free)
+	ret.TotalMemory = uint64(info.total)
+	return ret, nil
+}
+
+// CheckVRAM returns the free GPU memory, in bytes, left after reserving
+// headroom for unaccounted overhead. It returns an error when GetGPUInfo
+// reports no CUDA/ROCm device or no free memory, and 0 when the reserved
+// headroom is at least as large as the free memory.
+func CheckVRAM() (int64, error) {
+	gpuInfo := GetGPUInfo()
+	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
+		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		gpus := uint64(gpuInfo.DeviceCount)
+		if overhead < gpus*1024*1024*1024 {
+			overhead = gpus * 1024 * 1024 * 1024
+		}
+		// Both operands are unsigned: subtracting a larger overhead would wrap
+		// and come back as a negative int64, so clamp to zero instead.
+		if overhead >= gpuInfo.FreeMemory {
+			return 0, nil
+		}
+		return int64(gpuInfo.FreeMemory - overhead), nil
+	}
+
+	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
+}
+
+// FindGPULibs returns the de-duplicated, symlink-resolved paths of every file
+// matching baseLibName in the given glob patterns or in the directories of
+// PATH (windows) / LD_LIBRARY_PATH (linux). On any other GOOS it returns an
+// empty slice.
+func FindGPULibs(baseLibName string, patterns []string) []string {
+	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
+	var ldPaths []string
+	gpuLibPaths := []string{}
+	slog.Info(fmt.Sprintf("Searching for GPU management library %s", baseLibName))
+
+	switch runtime.GOOS {
+	case "windows":
+		ldPaths = strings.Split(os.Getenv("PATH"), ";")
+	case "linux":
+		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
+	default:
+		return gpuLibPaths
+	}
+	// Add whatever we find in the PATH/LD_LIBRARY_PATH to the search patterns
+	for _, ldPath := range ldPaths {
+		// An unset variable (or a stray separator) splits into "". filepath.Abs("")
+		// resolves to the working directory, which must not be searched implicitly.
+		if ldPath == "" {
+			continue
+		}
+		d, err := filepath.Abs(ldPath)
+		if err != nil {
+			continue
+		}
+		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
+	}
+	slog.Debug(fmt.Sprintf("gpu management search paths: %v", patterns))
+
+	// Upper bound on symlink hops so a link cycle cannot hang discovery
+	const maxLinkHops = 255
+	for _, pattern := range patterns {
+		// Ignore glob discovery errors
+		matches, _ := filepath.Glob(pattern)
+		for _, match := range matches {
+			// Resolve any links so we don't try the same lib multiple times
+			// and weed out any dups across globs
+			libPath := match
+			for hops := 0; hops < maxLinkHops; hops++ {
+				target, err := os.Readlink(libPath)
+				if err != nil {
+					// Not a link (or unreadable): libPath is final
+					break
+				}
+				if !filepath.IsAbs(target) {
+					// Relative targets are relative to the link's directory
+					target = filepath.Join(filepath.Dir(libPath), target)
+				}
+				libPath = target
+			}
+			seen := false
+			for _, known := range gpuLibPaths {
+				if known == libPath {
+					seen = true
+					break
+				}
+			}
+			if !seen {
+				gpuLibPaths = append(gpuLibPaths, libPath)
+			}
+		}
+	}
+	slog.Info(fmt.Sprintf("Discovered GPU libraries: %v", gpuLibPaths))
+	return gpuLibPaths
+}
+
+// LoadCUDAMgmt tries each candidate library path in order and returns the
+// handle from the first one cuda_init accepts, or nil if none load.
+func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
+	var resp C.cuda_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range cudaLibPaths {
+		lib := C.CString(libPath)
+		C.cuda_init(lib, &resp)
+		// cuda_init does not store the path pointer in resp, so release it
+		// right away instead of deferring every iteration's free to return.
+		C.free(unsafe.Pointer(lib))
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch
+		}
+	}
+	return nil
+}
+
+// LoadROCMMgmt tries each candidate library path in order and returns the
+// handle from the first one rocm_init accepts, or nil if none load.
+func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
+	var resp C.rocm_init_resp_t
+	resp.rh.verbose = getVerboseState()
+	for _, libPath := range rocmLibPaths {
+		lib := C.CString(libPath)
+		C.rocm_init(lib, &resp)
+		// rocm_init does not store the path pointer in resp, so release it
+		// right away instead of deferring every iteration's free to return.
+		C.free(unsafe.Pointer(lib))
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.rh
+		}
+	}
+	return nil
+}
+
+func getVerboseState() C.uint16_t {
+	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+		return C.uint16_t(1)
+	}
+	return C.uint16_t(0)
+}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -0,0 +1,54 @@
+//go:build darwin
+
+package gpu
+
+import "C"
+import (
+	"runtime"
+
+	"github.com/pbnjay/memory"
+)
+
+// CheckVRAM returns the amount of memory, in bytes, treated as available to
+// the GPU on macOS. On amd64 it returns 0; otherwise it returns 2/3 of total
+// system memory when that total is at most 36GiB, and 3/4 of it above that.
+func CheckVRAM() (int64, error) {
+	if runtime.GOARCH == "amd64" {
+		// gpu not supported, this may not be metal
+		return 0, nil
+	}
+
+	// on macOS, there's already buffer for available vram (see below) so just return the total
+	systemMemory := int64(memory.TotalMemory())
+
+	// macOS limits how much memory is available to the GPU based on the amount of system memory
+	// TODO: handle case where iogpu.wired_limit_mb is set to a higher value
+	if systemMemory <= 36*1024*1024*1024 {
+		systemMemory = systemMemory * 2 / 3
+	} else {
+		systemMemory = systemMemory * 3 / 4
+	}
+
+	return systemMemory, nil
+}
+
+// GetGPUInfo reports "metal" as the compute library, except on amd64 where
+// it reports "cpu" together with the detected CPU variant. The memory
+// figures come from getCPUMem.
+func GetGPUInfo() GpuInfo {
+	mem, _ := getCPUMem()
+	info := GpuInfo{
+		Library: "metal",
+		memInfo: mem,
+	}
+	if runtime.GOARCH == "amd64" {
+		info.Library = "cpu"
+		info.Variant = GetCPUVariant()
+	}
+	return info
+}
+
+func getCPUMem() (memInfo, error) {
+	return memInfo{
+		TotalMemory: 0,
+		FreeMemory:  0,
+		DeviceCount: 0,
+	}, nil
+}
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -0,0 +1,58 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_H__
+#define __GPU_INFO_H__
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef _WIN32
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() strdup(dlerror())
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#else
+#include <windows.h>
+#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
+#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
+#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
+#define LOAD_ERR() ({\
+  LPSTR messageBuffer = NULL; \
+  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
+                                 NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
+  char *resp = strdup(messageBuffer); \
+  LocalFree(messageBuffer); \
+  resp; \
+})
+
+#endif
+
+#define LOG(verbose, ...) \
+  do { \
+    if (verbose) { \
+      fprintf(stderr, __VA_ARGS__); \
+    } \
+  } while (0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Memory figures filled in by the cpu/cuda/rocm probe functions.
+typedef struct mem_info {
+  uint64_t total;      // Total memory summed across the counted devices
+  uint64_t free;       // Free memory summed across the counted devices
+  unsigned int count;  // Number of devices counted by the probe
+  char *err;  // If non-NULL, caller is responsible for freeing
+} mem_info_t;
+
+void cpu_check_ram(mem_info_t *resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#include "gpu_info_cuda.h"
+#include "gpu_info_rocm.h"
+
+#endif  // __GPU_INFO_H__
+#endif  // __APPLE__
--- a/gpu/gpu_info_cpu.c
+++ b/gpu/gpu_info_cpu.c
@@ -0,0 +1,45 @@
+#include "gpu_info.h"
+// Fallbacks for CPU mode
+
+#ifdef _WIN32
+#include <sysinfoapi.h>
+// Windows: fill resp with physical RAM totals from GlobalMemoryStatusEx.
+// On failure resp->err holds an allocated message and the totals are not set.
+void cpu_check_ram(mem_info_t *resp) {
+  MEMORYSTATUSEX status;
+  resp->err = NULL;
+  status.dwLength = sizeof(status);
+  if (GlobalMemoryStatusEx(&status) == 0) {
+    resp->err = LOAD_ERR();
+    return;
+  }
+  resp->count = 1;
+  resp->total = status.ullTotalPhys;
+  resp->free = status.ullAvailPhys;
+}
+
+#elif __linux__
+#include <errno.h>
+#include <string.h>
+#include <sys/sysinfo.h>
+// Linux: fill resp with RAM totals from sysinfo(2).
+// On failure resp->err holds an allocated copy of strerror(errno) and the
+// totals are not set.
+void cpu_check_ram(mem_info_t *resp) {
+  struct sysinfo info;
+  resp->err = NULL;
+  if (sysinfo(&info) != 0) {
+    resp->err = strdup(strerror(errno));
+  } else {
+    resp->count = 1;
+    // totalram/freeram are unsigned long counts of mem_unit-sized blocks;
+    // widen before multiplying so the byte total cannot overflow where
+    // unsigned long is 32 bits.
+    resp->total = (uint64_t)info.totalram * info.mem_unit;
+    resp->free = (uint64_t)info.freeram * info.mem_unit;
+  }
+  return;
+}
+
+#elif __APPLE__
+// TODO consider an Apple implementation that does something useful
+// mem_info_t cpu_check_ram() {
+//   mem_info_t resp = {0, 0, NULL};
+//   return resp;
+// }
+#else
+#error "Unsupported platform"
+#endif
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -0,0 +1,213 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include "gpu_info_cuda.h"
+
+#include <string.h>
+
+// Load the NVML library at cuda_lib_path, resolve the entry points listed
+// below into resp->ch, and call nvmlInit_v2.
+//
+// resp->ch.verbose is read for logging, so the caller sets it beforehand.
+// On success resp->err is NULL and resp->ch.handle is the loaded library.
+// On failure resp->err is an allocated message the caller must free, and
+// the library has been unloaded with resp->ch.handle reset to NULL.
+void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
+  nvmlReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
+      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
+      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
+      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
+      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
+      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
+      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
+      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             cuda_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    // Test the resolved symbol, not the table slot: l[i].p always points at
+    // a struct field and is never NULL.
+    if (!*l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      // Unload before clearing the handle, otherwise the library leaks
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.nvmlInit_v2)();
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    // The library is unloaded; do not call into it below
+    return;
+  }
+
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
+  } else {
+    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
+  }
+}
+
+// Sum total and free memory over every device NVML reports.
+//
+// On success resp->count, resp->total and resp->free are filled in and
+// resp->err is NULL. On failure resp->err is an allocated message the caller
+// must free; the totals may then be partial.
+void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  nvmlDevice_t device;
+  nvmlMemory_t memInfo = {0};
+  nvmlReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle isn't initialized");
+    return;
+  }
+
+  ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    if (h.verbose) {
+      nvmlBrandType_t brand = 0;
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
+      }
+    }
+
+    // nvmlMemory_t fields are unsigned long long; the value logged second is
+    // the free amount, which is what gets accumulated below
+    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %llu\n", i, memInfo.free);
+
+    resp->total += memInfo.total;
+    resp->free += memInfo.free;
+  }
+}
+
+// Report the lowest compute capability (major.minor) found across all
+// devices. resp->major/minor stay 0 when no device is enumerated. On failure
+// resp->err is an allocated message the caller must free.
+void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
+  enum { ERR_LEN = 256 };
+  char errbuf[ERR_LEN + 1];
+  nvmlDevice_t dev;
+  nvmlReturn_t rc;
+  unsigned int ndev;
+  int dev_major = 0;
+  int dev_minor = 0;
+  int idx;
+
+  resp->err = NULL;
+  resp->major = 0;
+  resp->minor = 0;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle not initialized");
+    return;
+  }
+
+  rc = (*h.nvmlDeviceGetCount_v2)(&ndev);
+  if (rc != NVML_SUCCESS) {
+    snprintf(errbuf, ERR_LEN, "unable to get device count: %d", rc);
+    resp->err = strdup(errbuf);
+    return;
+  }
+
+  for (idx = 0; idx < ndev; idx++) {
+    rc = (*h.nvmlDeviceGetHandleByIndex)(idx, &dev);
+    if (rc != NVML_SUCCESS) {
+      snprintf(errbuf, ERR_LEN, "unable to get device handle %d: %d", idx, rc);
+      resp->err = strdup(errbuf);
+      return;
+    }
+
+    rc = (*h.nvmlDeviceGetCudaComputeCapability)(dev, &dev_major, &dev_minor);
+    if (rc != NVML_SUCCESS) {
+      snprintf(errbuf, ERR_LEN, "device compute capability lookup failure %d: %d", idx, rc);
+      resp->err = strdup(errbuf);
+      return;
+    }
+
+    // Keep the lowest major.minor seen, since that limits compatibility.
+    // A recorded major of 0 means nothing has been recorded yet.
+    int unset_or_lower_major = resp->major == 0 || dev_major < resp->major;
+    int same_major_lower_minor =
+        resp->major == dev_major && dev_minor < resp->minor;
+    if (unset_or_lower_major || same_major_lower_minor) {
+      resp->major = dev_major;
+      resp->minor = dev_minor;
+    }
+  }
+}
+#endif  // __APPLE__
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -0,0 +1,56 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_CUDA_H__
+#define __GPU_INFO_CUDA_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum nvmlReturn_enum {
+  NVML_SUCCESS = 0,
+  // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t;  // Opaque is sufficient
+typedef struct nvmlMemory_st {
+  unsigned long long total;
+  unsigned long long free;
+  unsigned long long used;
+} nvmlMemory_t;
+
+typedef enum nvmlBrandType_enum
+{
+    NVML_BRAND_UNKNOWN          = 0,
+} nvmlBrandType_t;
+
+// Handle to a dynamically loaded NVML library plus the entry points that
+// cuda_init resolves from it by name.
+typedef struct cuda_handle {
+  void *handle;      // Library handle; NULL when the library is not loaded
+  uint16_t verbose;  // Non-zero enables LOG output in the cuda_* functions
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+  nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
+  nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
+} cuda_handle_t;
+
+typedef struct cuda_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  cuda_handle_t ch;
+} cuda_init_resp_t;
+
+typedef struct cuda_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} cuda_compute_capability_t;
+
+void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
+void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
+void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
+
+#endif  // __GPU_INFO_CUDA_H__
+#endif  // __APPLE__
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -0,0 +1,191 @@
+#ifndef __APPLE__
+
+#include "gpu_info_rocm.h"
+
+#include <string.h>
+
+// Load the ROCm SMI library at rocm_lib_path, resolve the entry points
+// listed below into resp->rh, and call rsmi_init.
+//
+// resp->rh.verbose is read for logging, so the caller sets it beforehand.
+// On success resp->err is NULL and resp->rh.handle is the loaded library.
+// On failure resp->err is an allocated message the caller must free, and
+// the library has been unloaded with resp->rh.handle reset to NULL.
+void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
+  rsmi_status_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"rsmi_init", (void *)&resp->rh.rsmi_init},
+      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
+      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
+      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
+      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
+      {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
+      {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
+      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
+      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
+      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
+      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
+      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
+      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
+      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
+      {NULL, NULL},
+  };
+
+  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
+  if (!resp->rh.handle) {
+    char *msg = LOAD_ERR();
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Radeon GPUs: %s\n",
+             rocm_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
+    // Test the resolved symbol, not the table slot: l[i].p always points at
+    // a struct field and is never NULL.
+    if (!*l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
+      // Unload before clearing the handle, otherwise the library leaks
+      UNLOAD_LIBRARY(resp->rh.handle);
+      resp->rh.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->rh.rsmi_init)(0);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->rh.handle);
+    resp->rh.handle = NULL;
+    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
+    resp->err = strdup(buf);
+  }
+
+  return;
+}
+
+// Sum total and free VRAM over every device ROCm SMI reports, where free is
+// computed as total minus used.
+//
+// On success resp->count, resp->total and resp->free are filled in and
+// resp->err is NULL. On failure resp->err is an allocated message the caller
+// must free; the totals may then be partial.
+void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  uint64_t totalMem = 0;
+  uint64_t usedMem = 0;
+  rsmi_status_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("rocm handle not initialized");
+    return;
+  }
+
+  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  LOG(h.verbose, "discovered %u ROCm GPU Devices\n", resp->count);
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    if (h.verbose) {
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
+      }
+    }
+
+    // Get total memory - used memory for available memory
+    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    // uint64_t is not "long" on every target; cast so the specifier and
+    // argument always agree
+    LOG(h.verbose, "[%d] ROCm totalMem %llu\n", i, (unsigned long long)totalMem);
+    LOG(h.verbose, "[%d] ROCm usedMem %llu\n", i, (unsigned long long)usedMem);
+    resp->total += totalMem;
+    resp->free += totalMem - usedMem;
+  }
+}
+
+// Look up the ROCm SMI library version.
+//
+// resp->str is always an allocated string the caller must free: the major
+// version number when resp->status is 0, or an error message when it is 1.
+void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
+  const int buflen = 256;
+  char buf[buflen + 1];
+  if (h.handle == NULL) {
+    resp->str = strdup("rocm handle not initialized");
+    resp->status = 1;
+    return;
+  }
+  rsmi_version_t ver;
+  rsmi_status_t ret;
+  ret = h.rsmi_version_get(&ver);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
+    resp->status = 1;
+  } else {
+    // ver.major is a uint32_t
+    snprintf(buf, buflen, "%u", ver.major);
+    resp->status = 0;
+  }
+  resp->str = strdup(buf);
+}
+
+#endif  // __APPLE__
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -0,0 +1,59 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_ROCM_H__
+#define __GPU_INFO_ROCM_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+// Local stand-in for the ROCm SMI status enum. Only the success value is
+// declared; the code in this package just compares results against it.
+typedef enum rsmi_status_return {
+  RSMI_STATUS_SUCCESS = 0,
+  // Other values omitted for now...
+} rsmi_status_t;
+
+// Memory pool selector passed to the rsmi_dev_memory_*_get calls declared in
+// rocm_handle_t below.
+typedef enum rsmi_memory_type {
+  RSMI_MEM_TYPE_VRAM = 0,
+  RSMI_MEM_TYPE_VIS_VRAM,
+  RSMI_MEM_TYPE_GTT,
+} rsmi_memory_type_t;
+
+// Version record filled in by rsmi_version_get.
+ typedef struct {
+     uint32_t major;     
+     uint32_t minor;     
+     uint32_t patch;     
+     const char *build;  
+ } rsmi_version_t;
+
+// Handle to a dynamically loaded ROCm SMI library.
+//
+// `handle` is the opaque library handle, `verbose` gates LOG output, and the
+// remaining members are function pointers named after the rsmi_* entry point
+// each one is expected to hold.
+typedef struct rocm_handle {
+  void *handle;
+  uint16_t verbose;
+
+  // Library lifecycle
+  rsmi_status_t (*rsmi_init)(uint64_t);
+  rsmi_status_t (*rsmi_shut_down)(void);
+
+  // Memory accounting, per device index and memory type
+  rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
+
+  // Library version and device enumeration
+  rsmi_status_t (*rsmi_version_get)(rsmi_version_t *version);
+  rsmi_status_t (*rsmi_num_monitor_devices)(uint32_t *);
+  rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
+
+  // Descriptive string lookups: device index, output buffer, buffer length
+  rsmi_status_t (*rsmi_dev_name_get)(uint32_t, char *, size_t);
+  rsmi_status_t (*rsmi_dev_brand_get)(uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_vendor_name_get)(uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_vram_vendor_get)(uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_serial_number_get)(uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_subsystem_name_get)(uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_vbios_version_get)(uint32_t, char *, uint32_t);
+} rocm_handle_t;
+
+// Result of rocm_init: either an error string or a usable handle.
+typedef struct rocm_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  rocm_handle_t rh;
+} rocm_init_resp_t;
+
+// Result of rocm_get_version. str is allocated with strdup() by
+// rocm_get_version, so the caller is responsible for freeing it.
+typedef struct rocm_version_resp {
+  rsmi_status_t status;
+  char *str; // Contains version or error string if status != 0 
+} rocm_version_resp_t;
+
+// Populates resp->rh for the library at rocm_lib_path; resp->err is set when
+// the handle is not usable (see rocm_init_resp_t).
+void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
+// Adds each device's total and (total - used) VRAM into resp, or sets
+// resp->err when a lookup fails.
+void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
+// Writes the library major version, or an error string, into resp.
+void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
+
+#endif  // __GPU_INFO_ROCM_H__
+#endif  // __APPLE__
--- a/gpu/gpu_test.go
+++ b/gpu/gpu_test.go
@@ -0,0 +1,41 @@
+package gpu
+
+import (
+	"runtime"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestBasicGetGPUInfo sanity checks the discovery result: the reported
+// library must be one of the known backends and, on Linux and Windows, the
+// memory sizes and device count must be non-zero.
+func TestBasicGetGPUInfo(t *testing.T) {
+	info := GetGPUInfo()
+	// Match against a list of names. Checking for a substring of a space
+	// separated string would also accept "" or partial values such as "cu".
+	assert.Contains(t, []string{"cuda", "rocm", "cpu", "metal"}, info.Library)
+
+	switch runtime.GOOS {
+	case "darwin":
+		// TODO - remove this once MacOS returns some size for CPU
+		return
+	case "linux", "windows":
+		assert.Greater(t, info.TotalMemory, uint64(0))
+		assert.Greater(t, info.FreeMemory, uint64(0))
+		assert.Greater(t, info.DeviceCount, uint32(0))
+	default:
+		return
+	}
+}
+
+// TestCPUMemInfo verifies that the CPU memory lookup succeeds and, on Linux
+// and Windows, reports non-zero total and free memory.
+func TestCPUMemInfo(t *testing.T) {
+	info, err := getCPUMem()
+	assert.NoError(t, err)
+
+	if runtime.GOOS == "darwin" {
+		t.Skip("CPU memory not populated on darwin")
+	}
+	if runtime.GOOS != "linux" && runtime.GOOS != "windows" {
+		// Nothing further to verify on other platforms
+		return
+	}
+	assert.Greater(t, info.TotalMemory, uint64(0))
+	assert.Greater(t, info.FreeMemory, uint64(0))
+}
+
+// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -0,0 +1,18 @@
+package gpu
+
+// memInfo holds memory totals and the number of devices they were gathered
+// from. Zero-valued fields are omitted when encoded as JSON.
+type memInfo struct {
+	TotalMemory uint64 `json:"total_memory,omitempty"`
+	FreeMemory  uint64 `json:"free_memory,omitempty"`
+	DeviceCount uint32 `json:"device_count,omitempty"`
+}
+
+// Beginning of an `ollama info` command
+// GpuInfo describes the compute library selected for this host together with
+// the embedded memory figures.
+type GpuInfo struct {
+	memInfo
+	// Library names the backend; the package tests expect one of
+	// "cuda", "rocm", "cpu" or "metal".
+	Library string `json:"library,omitempty"`
+
+	// Optional variant to select (e.g. versions, cpu feature flags)
+	Variant string `json:"variant,omitempty"`
+
+	// TODO add other useful attributes about the card here for discovery information
+}
--- a/llm/dyn_ext_server.c
+++ b/llm/dyn_ext_server.c
@@ -0,0 +1,145 @@
+#include "dyn_ext_server.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __linux__
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() strdup(dlerror())
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#elif _WIN32
+#include <windows.h>
+#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
+#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
+#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
+inline char *LOAD_ERR() {
+  LPSTR messageBuffer = NULL;
+  size_t size = FormatMessageA(
+      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+          FORMAT_MESSAGE_IGNORE_INSERTS,
+      NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+      (LPSTR)&messageBuffer, 0, NULL);
+  char *resp = strdup(messageBuffer);
+  LocalFree(messageBuffer);
+  return resp;
+}
+#else
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() strdup(dlerror())
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#endif
+
+// dyn_init loads the shared library at libPath and resolves every
+// llama_server_* entry point into the matching member of s.
+//
+// On failure err->id is set to -1 and err->msg (at most err->msg_len bytes)
+// describes the problem. If a symbol cannot be resolved the library is
+// unloaded again and s->handle is reset to NULL. err is left untouched on
+// success.
+void dyn_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err) {
+  int i = 0;
+  // Symbol name -> destination slot table, terminated by a NULL slot
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"llama_server_init", (void *)&s->llama_server_init},
+      {"llama_server_start", (void *)&s->llama_server_start},
+      {"llama_server_stop", (void *)&s->llama_server_stop},
+      {"llama_server_completion", (void *)&s->llama_server_completion},
+      {"llama_server_completion_next_result",
+       (void *)&s->llama_server_completion_next_result},
+      {"llama_server_completion_cancel",
+       (void *)&s->llama_server_completion_cancel},
+      {"llama_server_release_task_result",
+       (void *)&s->llama_server_release_task_result},
+      {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
+      {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
+      {"llama_server_embedding", (void *)&s->llama_server_embedding},
+      {"llama_server_release_json_resp",
+       (void *)&s->llama_server_release_json_resp},
+      {"", NULL},
+  };
+
+  printf("loading library %s\n", libPath);
+  s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
+  if (!s->handle) {
+    err->id = -1;
+    char *msg = LOAD_ERR();
+    snprintf(err->msg, err->msg_len,
+             "Unable to load dynamic server library: %s", msg);
+    free(msg);
+    return;
+  }
+
+  for (i = 0; l[i].p != NULL; i++) {
+    *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
+    // Test the resolved symbol, not the slot address (which is never NULL)
+    if (!*l[i].p) {
+      // Read the loader error before unloading, which may overwrite it
+      char *msg = LOAD_ERR();
+      UNLOAD_LIBRARY(s->handle);
+      s->handle = NULL;
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
+               l[i].s, msg);
+      free(msg);
+      return;
+    }
+  }
+}
+
+// The functions below are thin trampolines: each one forwards its arguments
+// to the function pointer of the same name stored in s (filled in by
+// dyn_init) and adds no behavior of its own.
+
+// Forwards to s.llama_server_init.
+inline void dyn_llama_server_init(struct dynamic_llama_server s,
+                                           ext_server_params_t *sparams,
+                                           ext_server_resp_t *err) {
+  s.llama_server_init(sparams, err);
+}
+
+// Forwards to s.llama_server_start.
+inline void dyn_llama_server_start(struct dynamic_llama_server s) {
+  s.llama_server_start();
+}
+
+// Forwards to s.llama_server_stop.
+inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
+  s.llama_server_stop();
+}
+
+// Forwards to s.llama_server_completion.
+inline void dyn_llama_server_completion(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 ext_server_resp_t *resp) {
+  s.llama_server_completion(json_req, resp);
+}
+
+// Forwards to s.llama_server_completion_next_result.
+inline void dyn_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result) {
+  s.llama_server_completion_next_result(task_id, result);
+}
+
+// Forwards to s.llama_server_completion_cancel.
+inline void dyn_llama_server_completion_cancel(
+    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
+  s.llama_server_completion_cancel(task_id, err);
+}
+// Forwards to s.llama_server_release_task_result.
+inline void dyn_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result) {
+  s.llama_server_release_task_result(result);
+}
+
+// Forwards to s.llama_server_tokenize.
+inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
+                                               const char *json_req,
+                                               char **json_resp,
+                                               ext_server_resp_t *err) {
+  s.llama_server_tokenize(json_req, json_resp, err);
+}
+
+// Forwards to s.llama_server_detokenize.
+inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 char **json_resp,
+                                                 ext_server_resp_t *err) {
+  s.llama_server_detokenize(json_req, json_resp, err);
+}
+
+// Forwards to s.llama_server_embedding.
+inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
+                                                const char *json_req,
+                                                char **json_resp,
+                                                ext_server_resp_t *err) {
+  s.llama_server_embedding(json_req, json_resp, err);
+}
+
+// Forwards to s.llama_server_release_json_resp.
+inline void dyn_llama_server_release_json_resp(
+    struct dynamic_llama_server s, char **json_resp) {
+  s.llama_server_release_json_resp(json_resp);
+}
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -0,0 +1,387 @@
+package llm
+
+/*
+#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
+#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
+#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
+#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
+#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
+#cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
+#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
+#cgo darwin LDFLAGS: -lc++ -framework Accelerate
+#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+#cgo linux CFLAGS: -D_GNU_SOURCE
+#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
+#cgo linux windows LDFLAGS: -lpthread
+
+#include <stdlib.h>
+#include "dyn_ext_server.h"
+
+*/
+import "C"
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+// dynExtServer is the LLM implementation returned by newDynExtServer. It
+// pairs the C function table filled in by dyn_init with the options the
+// server was created with.
+type dynExtServer struct {
+	s       C.struct_dynamic_llama_server
+	options api.Options
+}
+
+// Note: current implementation does not support concurrent instantiations
+var mutex sync.Mutex
+
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+	var resp C.ext_server_resp_t
+	resp.msg_len = len
+	bytes := make([]byte, len)
+	resp.msg = (*C.char)(C.CBytes(bytes))
+	return resp
+}
+
+// freeExtServerResp releases the message buffer of resp. A response with a
+// zero msg_len is treated as having no buffer to free.
+func freeExtServerResp(resp C.ext_server_resp_t) {
+	if resp.msg_len != 0 {
+		C.free(unsafe.Pointer(resp.msg))
+	}
+}
+
+// extServerResponseToErr converts the message carried by resp into a Go
+// error.
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+	// The message comes from native code and must not be used as the format
+	// string, otherwise any '%' in it would be interpreted as a verb.
+	return fmt.Errorf("%s", C.GoString(resp.msg))
+}
+
+// Note: current implementation does not support concurrent instantiations
+var llm *dynExtServer
+
+// newDynExtServer loads the llama server shared library at `library`,
+// initializes it for `model` with the given adapters, projectors and options,
+// and starts its main loop.
+//
+// The package mutex is acquired here. It is released before returning an
+// error, and otherwise stays held until Close is called on the returned
+// server.
+func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+	if locked := mutex.TryLock(); !locked {
+		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
+		mutex.Lock()
+	}
+	updatePath(filepath.Dir(library))
+
+	cLibrary := C.CString(library)
+	defer C.free(unsafe.Pointer(cLibrary))
+	loadResp := newExtServerResp(512)
+	defer freeExtServerResp(loadResp)
+
+	var srv C.struct_dynamic_llama_server
+	C.dyn_init(cLibrary, &srv, &loadResp)
+	if loadResp.id < 0 {
+		mutex.Unlock()
+		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(loadResp.msg))
+	}
+	llm = &dynExtServer{s: srv, options: opts}
+	slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
+
+	var sparams C.ext_server_params_t
+	sparams.model = C.CString(model)
+	defer C.free(unsafe.Pointer(sparams.model))
+
+	sparams.embedding = true
+	sparams.n_ctx = C.uint(opts.NumCtx)
+	sparams.n_batch = C.uint(opts.NumBatch)
+	sparams.n_threads = C.uint(opts.NumThread)
+	sparams.n_gpu_layers = C.int(opts.NumGPU)
+	sparams.main_gpu = C.int(opts.MainGPU)
+	sparams.n_parallel = 1 // TODO - wire up concurrency
+
+	// Always use the value encoded in the model
+	sparams.rope_freq_base = 0.0
+	sparams.rope_freq_scale = 0.0
+
+	sparams.memory_f16 = C.bool(opts.F16KV)
+	sparams.use_mlock = C.bool(opts.UseMLock)
+	sparams.use_mmap = C.bool(opts.UseMMap)
+	sparams.numa = C.bool(opts.UseNUMA)
+	sparams.verbose_logging = C.bool(os.Getenv("OLLAMA_DEBUG") != "")
+
+	// Build a singly linked list of adapters, appending each node at the tail
+	sparams.lora_adapters = nil
+	var tail *C.ext_server_lora_adapter_t
+	for _, adapter := range adapters {
+		node := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
+		defer C.free(unsafe.Pointer(node))
+		node.adapter = C.CString(adapter)
+		defer C.free(unsafe.Pointer(node.adapter))
+		node.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
+		node.next = nil
+		if tail == nil {
+			sparams.lora_adapters = node
+		} else {
+			tail.next = node
+		}
+		tail = node
+	}
+
+	sparams.mmproj = nil
+	if len(projectors) > 0 {
+		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
+		sparams.mmproj = C.CString(projectors[0])
+		defer C.free(unsafe.Pointer(sparams.mmproj))
+	}
+
+	slog.Info("Initializing llama server")
+	initResp := newExtServerResp(128)
+	defer freeExtServerResp(initResp)
+	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
+	if initResp.id < 0 {
+		mutex.Unlock()
+		err := extServerResponseToErr(initResp)
+		slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
+		return nil, err
+	}
+
+	slog.Info("Starting llama main loop")
+	C.dyn_llama_server_start(llm.s)
+	return llm, nil
+}
+
+// Predict submits a streaming completion request built from predict and
+// invokes fn for every non-empty chunk of content, then once more with
+// Done set (plus timing counters) when the server reports a stop.
+//
+// If the server answers with a "slot unavailable" error the request is
+// re-submitted with exponential backoff, up to maxRetries attempts. When ctx
+// is done the task is cancelled on the server and Predict returns nil (or the
+// cancellation error).
+func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var imageData []ImageData
+	if len(predict.Images) > 0 {
+		for cnt, i := range predict.Images {
+			imageData = append(imageData, ImageData{Data: i, ID: cnt})
+		}
+	}
+	slog.Info(fmt.Sprintf("loaded %d images", len(imageData)))
+
+	request := map[string]any{
+		"prompt":            predict.Prompt,
+		"stream":            true,
+		"n_predict":         predict.Options.NumPredict,
+		"n_keep":            predict.Options.NumKeep,
+		"temperature":       predict.Options.Temperature,
+		"top_k":             predict.Options.TopK,
+		"top_p":             predict.Options.TopP,
+		"tfs_z":             predict.Options.TFSZ,
+		"typical_p":         predict.Options.TypicalP,
+		"repeat_last_n":     predict.Options.RepeatLastN,
+		"repeat_penalty":    predict.Options.RepeatPenalty,
+		"presence_penalty":  predict.Options.PresencePenalty,
+		"frequency_penalty": predict.Options.FrequencyPenalty,
+		"mirostat":          predict.Options.Mirostat,
+		"mirostat_tau":      predict.Options.MirostatTau,
+		"mirostat_eta":      predict.Options.MirostatEta,
+		"penalize_nl":       predict.Options.PenalizeNewline,
+		"seed":              predict.Options.Seed,
+		"stop":              predict.Options.Stop,
+		"image_data":        imageData,
+		"cache_prompt":      true,
+	}
+
+	if predict.Format == "json" {
+		request["grammar"] = jsonGrammar
+	}
+
+	// The request is identical for every attempt, so encode it and copy it
+	// to C memory once instead of per retry. This also keeps the deferred
+	// free out of the loop, where one allocation per attempt would pile up
+	// until the function returns.
+	// Handling JSON marshaling with special characters unescaped.
+	buffer := &bytes.Buffer{}
+	enc := json.NewEncoder(buffer)
+	enc.SetEscapeHTML(false)
+
+	if err := enc.Encode(request); err != nil {
+		return fmt.Errorf("failed to marshal data: %w", err)
+	}
+
+	req := C.CString(buffer.String())
+	defer C.free(unsafe.Pointer(req))
+
+	retryDelay := 100 * time.Microsecond
+	for retries := 0; retries < maxRetries; retries++ {
+		if retries > 0 {
+			time.Sleep(retryDelay) // wait before retrying
+			retryDelay *= 2        // exponential backoff
+		}
+
+		C.dyn_llama_server_completion(llm.s, req, &resp)
+		if resp.id < 0 {
+			return extServerResponseToErr(resp)
+		}
+
+		retryNeeded := false
+	out:
+		for {
+			select {
+			case <-ctx.Done():
+				// This handles the request cancellation
+				C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
+				if resp.id < 0 {
+					return extServerResponseToErr(resp)
+				} else {
+					return nil
+				}
+			default:
+				var result C.ext_server_task_result_t
+				C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
+				// Copy the payload into Go memory before handing the C
+				// buffer back to the server library
+				json_resp := C.GoString(result.json_resp)
+				C.dyn_llama_server_release_task_result(llm.s, &result)
+
+				var p prediction
+				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
+					C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
+					if resp.id < 0 {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
+					} else {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
+					}
+				}
+
+				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
+					retryNeeded = true
+					// task will already be canceled
+					break out
+				}
+
+				if p.Content != "" {
+					fn(PredictResult{
+						Content: p.Content,
+					})
+				}
+
+				if p.Stop {
+					fn(PredictResult{
+						Done:               true,
+						PromptEvalCount:    p.Timings.PromptN,
+						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
+						EvalCount:          p.Timings.PredictedN,
+						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
+					})
+					return nil
+				}
+			}
+		}
+		if !retryNeeded {
+			return nil // success
+		}
+	}
+
+	// should never reach here ideally
+	return fmt.Errorf("max retries exceeded")
+}
+
+// Encode asks the server to tokenize prompt and returns the token ids from
+// its JSON response.
+func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
+	payload, err := json.Marshal(TokenizeRequest{Content: prompt})
+	if err != nil {
+		return nil, fmt.Errorf("marshaling encode data: %w", err)
+	}
+
+	cReq := C.CString(string(payload))
+	defer C.free(unsafe.Pointer(cReq))
+	status := newExtServerResp(128)
+	defer freeExtServerResp(status)
+
+	var cJSON *C.char
+	C.dyn_llama_server_tokenize(llm.s, cReq, &cJSON, &status)
+	if status.id < 0 {
+		return nil, extServerResponseToErr(status)
+	}
+	// The JSON buffer is owned by the server library and must be handed back
+	defer C.dyn_llama_server_release_json_resp(llm.s, &cJSON)
+
+	var encoded TokenizeResponse
+	if uerr := json.Unmarshal([]byte(C.GoString(cJSON)), &encoded); uerr != nil {
+		return nil, fmt.Errorf("unmarshal encode response: %w", uerr)
+	}
+	return encoded.Tokens, nil
+}
+
+// Decode asks the server to detokenize tokens and returns the resulting
+// text. An empty token list yields an empty string without calling the
+// server.
+func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
+	if len(tokens) == 0 {
+		return "", nil
+	}
+	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
+	if err != nil {
+		return "", fmt.Errorf("marshaling decode data: %w", err)
+	}
+
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
+	if resp.id < 0 {
+		return "", extServerResponseToErr(resp)
+	}
+	// The JSON buffer is owned by the server library and must be handed back
+	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
+
+	var decoded DetokenizeResponse
+	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
+		// Name the right operation; this message used to say "encode"
+		return "", fmt.Errorf("unmarshal decode response: %w", err2)
+	}
+
+	return decoded.Content, nil
+}
+
+// Embedding asks the server for the embedding of input and returns the
+// vector from its JSON response.
+func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
+	data, err := json.Marshal(TokenizeRequest{Content: input})
+	if err != nil {
+		return nil, fmt.Errorf("error marshaling embed data: %w", err)
+	}
+
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
+	}
+	// The JSON buffer is owned by the server library and must be handed back
+	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
+
+	var embedding EmbeddingResponse
+	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
+		// Name the right operation; this message used to say "tokenize"
+		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
+	}
+
+	return embedding.Embedding, nil
+}
+
+// Close stops the server and then releases the package mutex that
+// newDynExtServer left locked, allowing the next server to be created.
+func (llm *dynExtServer) Close() {
+	C.dyn_llama_server_stop(llm.s)
+	mutex.Unlock()
+}
+
+// updatePath puts dir at the front of PATH on Windows. It does nothing when
+// dir is already listed, and drops any other entries that live under dir's
+// parent directory. On other platforms it is a no-op.
+func updatePath(dir string) {
+	if runtime.GOOS == "windows" {
+		tmpDir := filepath.Dir(dir)
+		pathComponents := strings.Split(os.Getenv("PATH"), ";")
+		i := 0
+		for _, comp := range pathComponents {
+			if strings.EqualFold(comp, dir) {
+				return
+			}
+			// Remove any other prior paths to our temp dir
+			if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
+				pathComponents[i] = comp
+				i++
+			}
+		}
+		// Only the first i entries survived the in-place filter above; joining
+		// the whole slice would bring the removed entries back.
+		newPath := strings.Join(append([]string{dir}, pathComponents[:i]...), ";")
+		slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
+		os.Setenv("PATH", newPath)
+	}
+	// linux and darwin rely on rpath
+}
--- a/llm/dyn_ext_server.h
+++ b/llm/dyn_ext_server.h
@@ -0,0 +1,74 @@
+// Include guard: without it a second inclusion would redefine
+// struct dynamic_llama_server and fail to compile.
+#ifndef DYN_EXT_SERVER_H
+#define DYN_EXT_SERVER_H
+
+#include <stdlib.h>
+
+#include "ext_server.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Function table for a dynamically loaded llama server library. `handle` is
+// the library handle and every other member holds the entry point of the
+// same name, resolved by dyn_init.
+struct dynamic_llama_server {
+  void *handle;
+  void (*llama_server_init)(ext_server_params_t *sparams,
+                            ext_server_resp_t *err);
+  void (*llama_server_start)(void);
+  void (*llama_server_stop)(void);
+  void (*llama_server_completion)(const char *json_req,
+                                  ext_server_resp_t *resp);
+  void (*llama_server_completion_next_result)(const int task_id,
+                                              ext_server_task_result_t *result);
+  void (*llama_server_completion_cancel)(const int task_id,
+                                         ext_server_resp_t *err);
+  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
+  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
+                                  ext_server_resp_t *err);
+  void (*llama_server_embedding)(const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+  void (*llama_server_release_json_resp)(char **json_resp);
+};
+
+// Loads the library at libPath and fills in s. On failure err->id is set to
+// -1 and err->msg describes the problem.
+void dyn_init(const char *libPath, struct dynamic_llama_server *s,
+              ext_server_resp_t *err);
+
+// No good way to call C function pointers from Go so inline the indirection.
+// Each function below forwards to the member of s with the matching name.
+void dyn_llama_server_init(struct dynamic_llama_server s,
+                           ext_server_params_t *sparams,
+                           ext_server_resp_t *err);
+
+void dyn_llama_server_start(struct dynamic_llama_server s);
+
+void dyn_llama_server_stop(struct dynamic_llama_server s);
+
+void dyn_llama_server_completion(struct dynamic_llama_server s,
+                                 const char *json_req,
+                                 ext_server_resp_t *resp);
+
+void dyn_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result);
+
+void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
+                                        const int task_id,
+                                        ext_server_resp_t *err);
+
+void dyn_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result);
+
+void dyn_llama_server_tokenize(struct dynamic_llama_server s,
+                               const char *json_req, char **json_resp,
+                               ext_server_resp_t *err);
+
+void dyn_llama_server_detokenize(struct dynamic_llama_server s,
+                                 const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+
+void dyn_llama_server_embedding(struct dynamic_llama_server s,
+                                const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+
+void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
+                                        char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // DYN_EXT_SERVER_H
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Ollama specific CMakefile to include in llama.cpp/examples/server
+
+set(TARGET ext_server)
+# Verbose server logging is compiled in by default (see SERVER_VERBOSE below)
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+# Windows builds a shared library; every other platform builds a static one.
+# Source paths are relative to the directory this file is included from.
+if (WIN32)
+    add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
+else()
+    add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
+endif()
+target_include_directories(${TARGET} PRIVATE ../../common)
+target_include_directories(${TARGET} PRIVATE ../..)
+target_include_directories(${TARGET} PRIVATE ../../..)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+# Exposed publicly so consumers of the target also see LLAMA_SERVER_LIBRARY
+target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
+target_link_libraries(${TARGET} PRIVATE ggml llava common )
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+# SERVER_VERBOSE is defined as 1 or 0 depending on the option above
+target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
+install(TARGETS ext_server LIBRARY)
+
+# With CUDA available, add the toolkit headers; Windows also links nvml
+if (CUDAToolkit_FOUND)
+    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    if (WIN32)
+        target_link_libraries(${TARGET} PRIVATE nvml)
+    endif()
+endif()
--- a/llm/ext_server/README.md
+++ b/llm/ext_server/README.md
@@ -0,0 +1,18 @@
+# Extern C Server
+
+This directory contains a thin facade we layer on top of the Llama.cpp server to
+expose `extern C` interfaces to access the functionality through direct API
+calls in-process.  The llama.cpp code uses compile time macros to configure GPU
+type along with other settings.  During the `go generate ./...` execution, the
+build will generate one or more copies of the llama.cpp `extern C` server based
+on what GPU libraries are detected to support multiple GPU types as well as CPU
+only support. The Ollama go build then embeds these different servers to support
+different GPUs and settings at runtime.
+
+If you are making changes to the code in this directory, make sure to disable
+caching during your go build to ensure you pick up your changes.  A typical
+iteration cycle from the top of the source tree looks like:
+
+```
+go generate ./... && go build -a .
+```
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -0,0 +1,324 @@
+#include "ext_server.h"
+
+// Necessary evil since the server types are not defined in a header
+#include "server.cpp"
+
+// Low level API access to verify GPU access
+#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define cudaGetDevice hipGetDevice
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaGetErrorString hipGetErrorString
+#else
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+#endif // defined(GGML_USE_HIPBLAS)
+#endif // GGML_USE_CUBLAS
+
+// Expose the llama server as a callable extern "C" API
+// Single server instance: created by llama_server_init, deleted and reset
+// to NULL by llama_server_stop.
+llama_server_context *llama = NULL;
+// Loop condition for the main loop thread; cleared by llama_server_stop.
+std::atomic<bool> ext_server_running(false);
+// Thread running the update_slots() loop; joined by llama_server_stop.
+std::thread ext_server_thread;
+
+// Creates the global server context, translates sparams into gpt_params,
+// initializes the backend and loads the model.
+//
+// err->id is 0 on success. On failure err->id is -1, err->msg describes the
+// problem, and the partially constructed context is destroyed so that a
+// failed initialization does not leak it.
+void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
+  assert(err != NULL && sparams != NULL);
+  log_set_target(stderr);
+  if (!sparams->verbose_logging) {
+    log_disable();
+  }
+
+  LOG_TEE("system info: %s\n", llama_print_system_info());
+  err->id = 0;
+  err->msg[0] = '\0';
+  try {
+    llama = new llama_server_context;
+    gpt_params params;
+    params.n_ctx = sparams->n_ctx;
+    params.n_batch = sparams->n_batch;
+    // A thread count of 0 keeps the gpt_params default
+    if (sparams->n_threads > 0) {
+      params.n_threads = sparams->n_threads;
+    }
+    params.n_parallel = sparams->n_parallel;
+    params.rope_freq_base = sparams->rope_freq_base;
+    params.rope_freq_scale = sparams->rope_freq_scale;
+
+    if (sparams->memory_f16) {
+      params.cache_type_k = "f16";
+      params.cache_type_v = "f16";
+    } else {
+      params.cache_type_k = "f32";
+      params.cache_type_v = "f32";
+    }
+
+    params.n_gpu_layers = sparams->n_gpu_layers;
+    params.main_gpu = sparams->main_gpu;
+    params.use_mlock = sparams->use_mlock;
+    params.use_mmap = sparams->use_mmap;
+    params.numa = sparams->numa;
+    params.embedding = sparams->embedding;
+    if (sparams->model != NULL) {
+      params.model = sparams->model;
+    }
+
+    if (sparams->lora_adapters != NULL) {
+      for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
+          la = la->next) {
+        params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+      }
+
+      // mmap is switched off whenever lora adapters are supplied
+      params.use_mmap = false;
+    }
+
+    if (sparams->mmproj != NULL) {
+      params.mmproj = std::string(sparams->mmproj);
+    }
+
+#if defined(GGML_USE_CUBLAS)
+    // Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
+    LOG_TEE("Performing pre-initialization of GPU\n");
+    int id;
+    cudaError_t cudaErr = cudaGetDevice(&id);
+    if (cudaErr != cudaSuccess) {
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
+      delete llama;
+      llama = NULL;
+      return;
+    }
+#endif
+
+    llama_backend_init(params.numa);
+
+    // load the model
+    if (!llama->load_model(params)) {
+      // TODO - consider modifying the logging logic or patching load_model so
+      // we can capture more detailed error messages and pass them back to the
+      // caller for better UX
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "error loading model %s",
+               params.model.c_str());
+      delete llama;
+      llama = NULL;
+      return;
+    }
+
+    llama->initialize();
+  } catch (std::exception &e) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    delete llama;
+    llama = NULL;
+  } catch (...) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len,
+             "Unknown exception initializing llama server");
+    delete llama;
+    llama = NULL;
+  }
+}
+
+// Starts the background thread that drives llama->update_slots() until
+// ext_server_running is cleared or update_slots() reports an error. The
+// thread frees the llama backend when the loop ends.
+void llama_server_start() {
+  assert(llama != NULL);
+  // TODO mutex to protect thread creation
+  // Raise the running flag before the thread exists. If the thread set it
+  // itself, a stop request arriving right after start could be overwritten
+  // and the join in llama_server_stop would never return.
+  ext_server_running = true;
+  ext_server_thread = std::thread([&]() {
+    try {
+      LOG_TEE("llama server main loop starting\n");
+      ggml_time_init();
+      while (ext_server_running.load()) {
+        if (!llama->update_slots()) {
+          LOG_TEE(
+              "unexpected error in llama server update_slots - exiting main "
+              "loop\n");
+          break;
+        }
+      }
+    } catch (std::exception &e) {
+      LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
+    } catch (...) {
+      LOG_TEE("caught unknown exception in llama server main loop\n");
+    }
+    LOG_TEE("\nllama server shutting down\n");
+    llama_backend_free();
+  });
+}
+
+// Stops the main loop thread, waits for it to finish and destroys the
+// global server context. The order matters: the flag is cleared first, then
+// the loop is woken, and only after the join is the context deleted.
+void llama_server_stop() {
+  assert(llama != NULL);
+  // TODO - too verbose, remove once things are solid
+  LOG_TEE("requesting llama server shutdown\n");
+  ext_server_running = false;
+
+  // unblocks the update_slots() loop so it can clean up and exit
+  llama->request_cancel(0);
+
+  ext_server_thread.join();
+  delete llama;
+  llama = NULL;
+  LOG_TEE("llama server shutdown complete\n");
+}
+
+// Parses json_req and queues it as a completion request. On success
+// resp->id receives the id returned by request_completion; if an exception
+// is thrown resp->id stays -1 and resp->msg carries the reason.
+void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
+  assert(llama != NULL && json_req != NULL && resp != NULL);
+  // Assume failure until the request has been accepted
+  resp->msg[0] = '\0';
+  resp->id = -1;
+  try {
+    resp->id = llama->request_completion(json::parse(json_req), false, false, -1);
+  } catch (std::exception &ex) {
+    snprintf(resp->msg, resp->msg_len, "exception %s", ex.what());
+  } catch (...) {
+    snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
+  }
+}
+
+// Fetches the next result for task_id and copies it into resp. On normal
+// return resp->json_resp points at a newly allocated, null terminated JSON
+// string that the caller frees with llama_server_release_task_result.
+// Exceptions are reported by setting resp->error, resp->id = -1 and returning
+// an {"error": ...} JSON payload.
+void llama_server_completion_next_result(const int task_id,
+                                         ext_server_task_result_t *resp) {
+  assert(llama != NULL && resp != NULL);
+  resp->id = -1;
+  resp->stop = false;
+  resp->error = false;
+  resp->json_resp = NULL;
+  std::string result_json;
+  try {
+    task_result result = llama->next_result(task_id);
+    // replace (rather than throw on) byte sequences that are not valid UTF-8
+    result_json =
+        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+    resp->id = result.id;
+    resp->stop = result.stop;
+    resp->error = result.error;
+    // An error and a stop are handled the same way: cancel the task
+    if (result.error || result.stop) {
+      llama->request_cancel(task_id);
+    }
+  } catch (const std::exception &e) {
+    resp->error = true;
+    resp->id = -1;
+    // Serialize through the JSON library so quotes, backslashes or invalid
+    // UTF-8 in e.what() cannot produce a malformed document
+    json error_body;
+    error_body["error"] = std::string("exception ") + e.what();
+    result_json =
+        error_body.dump(-1, ' ', false, json::error_handler_t::replace);
+    LOG_TEE("llama server completion exception %s\n", e.what());
+  } catch (...) {
+    resp->error = true;
+    resp->id = -1;
+    result_json = "{\"error\":\"Unknown exception during completion\"}";
+    LOG_TEE("llama server completion unknown exception\n");
+  }
+  // Hand the caller its own copy, including the terminating null byte
+  const std::string::size_type size = result_json.size() + 1;
+  resp->json_resp = new char[size];
+  snprintf(resp->json_resp, size, "%s", result_json.c_str());
+}
+
+// Frees the json_resp buffer allocated by
+// llama_server_completion_next_result. Safe to call with a NULL result or a
+// NULL json_resp, and safe to call repeatedly on the same result.
+void llama_server_release_task_result(ext_server_task_result_t *result) {
+  if (result == NULL || result->json_resp == NULL) {
+    return;
+  }
+  delete[] result->json_resp;
+  // Clear the pointer so a second release is a no-op instead of a double free
+  result->json_resp = NULL;
+}
+
+// Asks the server to cancel task_id. err->id is 0 with an empty message on
+// success; if request_cancel throws, err->id is -1 and err->msg describes the
+// exception.
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
+  assert(llama != NULL && err != NULL);
+  err->msg[0] = '\0';
+  err->id = 0;
+  try {
+    llama->request_cancel(task_id);
+  } catch (const std::exception &ex) {
+    snprintf(err->msg, err->msg_len, "exception %s", ex.what());
+    err->id = -1;
+  } catch (...) {
+    snprintf(err->msg, err->msg_len,
+             "Unknown exception completion cancel in llama server");
+    err->id = -1;
+  }
+}
+
+// Tokenizes the "content" field of json_req (an absent field yields an empty
+// token list) and returns the formatted response as a newly allocated, null
+// terminated string in *json_resp. On an exception *json_resp stays NULL,
+// err->id is -1 and err->msg describes the failure.
+void llama_server_tokenize(const char *json_req, char **json_resp,
+                           ext_server_resp_t *err) {
+  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+  err->msg[0] = '\0';
+  err->id = 0;
+  *json_resp = NULL;
+  try {
+    const json request = json::parse(json_req);
+    std::vector<llama_token> token_ids;
+    if (request.count("content") > 0) {
+      token_ids = llama->tokenize(request["content"], false);
+    }
+    const json payload = format_tokenizer_response(token_ids);
+    std::string serialized = payload.dump();
+    // +1 leaves room for the terminating null byte written by snprintf
+    const std::string::size_type buf_len = serialized.size() + 1;
+    *json_resp = new char[buf_len];
+    snprintf(*json_resp, buf_len, "%s", serialized.c_str());
+  } catch (const std::exception &ex) {
+    snprintf(err->msg, err->msg_len, "exception %s", ex.what());
+    err->id = -1;
+  } catch (...) {
+    snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
+    err->id = -1;
+  }
+}
+
+// Frees a buffer returned through the json_resp out-parameter of
+// llama_server_tokenize, llama_server_detokenize or llama_server_embedding.
+// Safe to call with NULL, with a pointer to NULL, and repeatedly on the same
+// pointer.
+void llama_server_release_json_resp(char **json_resp) {
+  if (json_resp == NULL || *json_resp == NULL) {
+    return;
+  }
+  delete[] *json_resp;
+  // Clear the caller's pointer so a second release is a no-op instead of a
+  // double free
+  *json_resp = NULL;
+}
+
+// Converts the "tokens" array of json_req back to text (an absent field yields
+// empty content) and returns the formatted response as a newly allocated, null
+// terminated string in *json_resp. On an exception *json_resp stays NULL,
+// err->id is -1 and err->msg describes the failure.
+void llama_server_detokenize(const char *json_req, char **json_resp,
+                             ext_server_resp_t *err) {
+  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+  err->msg[0] = '\0';
+  err->id = 0;
+  *json_resp = NULL;
+  try {
+    const json request = json::parse(json_req);
+    std::string text;
+    if (request.count("tokens") > 0) {
+      const std::vector<llama_token> token_ids = request["tokens"];
+      text = tokens_to_str(llama->ctx, token_ids.cbegin(), token_ids.cend());
+    }
+    const json payload = format_detokenized_response(text);
+    std::string serialized = payload.dump();
+    // +1 leaves room for the terminating null byte written by snprintf
+    const std::string::size_type buf_len = serialized.size() + 1;
+    *json_resp = new char[buf_len];
+    snprintf(*json_resp, buf_len, "%s", serialized.c_str());
+  } catch (const std::exception &ex) {
+    snprintf(err->msg, err->msg_len, "exception %s", ex.what());
+    err->id = -1;
+  } catch (...) {
+    snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
+    err->id = -1;
+  }
+}
+
+// Submits the "content" field of json_req (empty string when absent) as an
+// embedding request with n_predict 0, takes the next result for that task and
+// returns its JSON as a newly allocated, null terminated string in *json_resp.
+// On an exception err->id is -1 and err->msg describes the failure.
+void llama_server_embedding(const char *json_req, char **json_resp,
+                            ext_server_resp_t *err) {
+  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+  err->msg[0] = '\0';
+  err->id = 0;
+  *json_resp = NULL;
+  try {
+    const json request = json::parse(json_req);
+    json prompt = request.count("content") != 0 ? request["content"] : json("");
+    const int task_id = llama->request_completion(
+        {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+    task_result outcome = llama->next_result(task_id);
+    std::string serialized = outcome.result_json.dump();
+    // +1 leaves room for the terminating null byte written by snprintf
+    const std::string::size_type buf_len = serialized.size() + 1;
+    *json_resp = new char[buf_len];
+    snprintf(*json_resp, buf_len, "%s", serialized.c_str());
+  } catch (const std::exception &ex) {
+    snprintf(err->msg, err->msg_len, "exception %s", ex.what());
+    err->id = -1;
+  } catch (...) {
+    snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
+    err->id = -1;
+  }
+}
--- a/llm/ext_server/ext_server.h
+++ b/llm/ext_server/ext_server.h
@@ -0,0 +1,95 @@
+#if defined(LLAMA_SERVER_LIBRARY)
+#ifndef LLAMA_SERVER_H
+#define LLAMA_SERVER_H
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int __main(int argc, char **argv);
+
+// This exposes extern C entrypoints into the llama_server
+// To enable the server compile with LLAMA_SERVER_LIBRARY
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef struct ext_server_resp {
+  int id;          // < 0 on error
+  size_t msg_len;  // caller must allocate msg and set msg_len
+  char *msg;
+} ext_server_resp_t;
+
+// Allocated and freed by caller
+typedef struct ext_server_lora_adapter {
+  char *adapter;
+  float scale;
+  struct ext_server_lora_adapter *next;
+} ext_server_lora_adapter_t;
+
+// Server start-up parameters passed to llama_server_init.
+// Allocated and freed by caller
+typedef struct ext_server_params {
+  char *model;            // presumably the path of the model to load - confirm
+  uint32_t n_ctx;         // token context window, 0 = from model
+  uint32_t n_batch;       // prompt processing maximum batch size
+  uint32_t n_threads;     // number of threads to use for generation
+  int32_t n_parallel;     // number of parallel sequences to decode
+  float rope_freq_base;   // RoPE base frequency, 0 = from model
+  float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+  bool memory_f16;        // use f16 instead of f32 for memory kv
+  int32_t n_gpu_layers;  // number of layers to store in VRAM (-1 - use default)
+  int32_t main_gpu;      // the GPU that is used for scratch and small tensors
+  bool use_mlock;        // force system to keep model in RAM
+  bool use_mmap;         // use mmap if possible
+  bool numa;             // attempt optimizations that help on some NUMA systems
+  bool embedding;        // get only sentence embedding
+  // head of a singly linked list (chained through ->next)
+  ext_server_lora_adapter_t *lora_adapters;
+  char *mmproj;  // NOTE(review): looks like a multimodal projector path - confirm
+  bool verbose_logging;  // Enable verbose logging of the server
+} ext_server_params_t;
+
+typedef struct ext_server_task_result {
+  int id;
+  bool stop;
+  bool error;
+  char *json_resp;  // null terminated, memory managed by ext_server
+} ext_server_task_result_t;
+
+// Initialize the server once per process
+// err->id = 0 for success and err->msg[0] = '\0'
+// err->id != 0 for failure, and err->msg contains error message
+void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
+
+// Run the main loop, called once per init
+void llama_server_start();
+// Stop the main loop and free up resources allocated in init and start.  Init
+// must be called again to reuse
+void llama_server_stop();
+
+// json_req null terminated string, memory managed by caller
+// resp->id >= 0 on success (task ID)
+// resp->id < 0 on error, and resp->msg contains error message
+void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
+
+// Caller must call llama_server_release_task_result to free resp->json_resp
+void llama_server_completion_next_result(const int task_id,
+                                         ext_server_task_result_t *result);
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
+void llama_server_release_task_result(ext_server_task_result_t *result);
+
+// Caller must call llama_server_release_json_resp to free json_resp when the
+// call succeeds (err.id == 0); on failure json_resp is left NULL
+void llama_server_tokenize(const char *json_req, char **json_resp,
+                           ext_server_resp_t *err);
+void llama_server_detokenize(const char *json_req, char **json_resp,
+                             ext_server_resp_t *err);
+void llama_server_embedding(const char *json_req, char **json_resp,
+                            ext_server_resp_t *err);
+void llama_server_release_json_resp(char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+#endif  // LLAMA_SERVER_LIBRARY
--- a/llm/falcon.go
+++ b/llm/falcon.go
@@ -1,20 +0,0 @@
-package llm
-
-const (
-	falconModelType7B   = 32
-	falconModelType40B  = 60
-	falconModelType180B = 80
-)
-
-func falconModelType(numLayer uint32) string {
-	switch numLayer {
-	case 32:
-		return "7B"
-	case 60:
-		return "40B"
-	case 80:
-		return "180B"
-	default:
-		return "unknown"
-	}
-}
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -0,0 +1,112 @@
+# common logic across linux and darwin
+
+# Reset the shared build variables (ARCH, LLAMACPP_DIR, CMAKE_DEFS,
+# CMAKE_TARGETS and the per-OS LIB_EXT / WHOLE_ARCHIVE / NO_WHOLE_ARCHIVE /
+# GCC_ARCH settings) to their defaults.
+init_vars() {
+    # Translate Go's GOARCH into the architecture name used by this build;
+    # any other value (including unset) falls back to the host architecture
+    case "${GOARCH}" in
+    "amd64")
+        ARCH="x86_64"
+        ;;
+    "arm64")
+        ARCH="arm64"
+        ;;
+    *)
+        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
+    esac
+
+    LLAMACPP_DIR=../llama.cpp
+    CMAKE_DEFS=""
+    CMAKE_TARGETS="--target ext_server"
+    # A "-g" anywhere in CGO_CFLAGS selects the debug-info build configuration
+    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
+    else
+        # TODO - add additional optimization flags...
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
+    fi
+    # Shared library suffix and the linker flags that wrap the static archive
+    # differ per operating system
+    case $(uname -s) in 
+    "Darwin")
+        LIB_EXT="dylib"
+        WHOLE_ARCHIVE="-Wl,-force_load"
+        NO_WHOLE_ARCHIVE=""
+        GCC_ARCH="-arch ${ARCH}"
+        ;;
+    "Linux")
+        LIB_EXT="so"
+        WHOLE_ARCHIVE="-Wl,--whole-archive"
+        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
+
+        # Cross compiling not supported on linux - Use docker
+        GCC_ARCH=""
+        ;;
+    *)
+        ;;
+    esac
+}
+
+# Initialize the git submodules and force-update the llama.cpp checkout.
+# Does nothing when OLLAMA_SKIP_PATCHING is set to a non-empty value.
+git_module_setup() {
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
+        echo "Skipping submodule initialization"
+        return
+    fi
+    # Make sure the tree is clean after the directory moves
+    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
+        echo "Cleaning up old submodule"
+        # Quoted so a path containing whitespace or glob characters is
+        # removed as a single argument
+        rm -rf "${LLAMACPP_DIR}"
+    fi
+    git submodule init
+    git submodule update --force "${LLAMACPP_DIR}"
+
+}
+
+# Prepare the llama.cpp checkout for the build: hook in the ext_server
+# CMakeLists.txt, apply every ../patches/*.diff, and rename main() in
+# server.cpp.
+apply_patches() {
+    # Wire up our CMakefile
+    # The "# ollama" marker on the appended line is what the grep looks for,
+    # so the include is only appended once
+    if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
+        echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
+    fi
+
+    # apply temporary patches until fix is upstream
+    # First pass: check out every file named on a "+++ " line of a patch
+    # (path taken after the first "/")
+    for patch in ../patches/*.diff; do
+        for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
+            (cd ${LLAMACPP_DIR}; git checkout ${file})
+        done
+    done
+    # Second pass: apply the patches
+    for patch in ../patches/*.diff; do
+        (cd ${LLAMACPP_DIR} && git apply ${patch})
+    done
+
+    # Avoid duplicate main symbols when we link into the cgo binary
+    sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
+        mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
+}
+
+# Configure and build llama.cpp in BUILD_DIR with CMAKE_DEFS / CMAKE_TARGETS,
+# then link the static archives into ${BUILD_DIR}/lib/libext_server.${LIB_EXT}.
+# Reads BUILD_DIR, CMAKE_DEFS, CMAKE_TARGETS, LIB_EXT, GCC_ARCH, WHOLE_ARCHIVE,
+# NO_WHOLE_ARCHIVE and EXTRA_LIBS from the calling script.
+build() {
+    # The flag variables are left unquoted so each flag becomes its own argument
+    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
+    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
+    mkdir -p ${BUILD_DIR}/lib/
+    g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
+        ${GCC_ARCH} \
+        ${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,-rpath,\$ORIGIN \
+        -lpthread -ldl -lm \
+        ${EXTRA_LIBS}
+}
+
+# gzip every library in ${BUILD_DIR}/lib in parallel and wait for all of the
+# compression jobs to finish.
+compress_libs() {
+    echo "Compressing payloads to reduce overall binary size..."
+    pids=""
+    # Drop stale archives from a previous run before compressing again
+    rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
+    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
+        # One background gzip per library; remember its pid for the wait below
+        gzip --best -f ${lib} &
+        pids+=" $!"
+    done
+    echo 
+    for pid in ${pids}; do
+        wait $pid
+    done
+    echo "Finished compression"
+}
+
+# Keep the local tree clean after we're done with the build
+# Restores CMakeLists.txt and server.cpp in examples/server from git, in a
+# subshell so the caller's working directory is unchanged.
+cleanup() {
+    (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
+}
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# This script is intended to run inside the go generate
+# working directory must be ./llm/generate/
+
+# TODO - add hardening to detect missing tools (cmake, etc.)
+
+set -ex
+set -o pipefail
+echo "Starting darwin generate script"
+source $(dirname $0)/gen_common.sh
+init_vars
+git_module_setup
+apply_patches
+
+# Code-sign the file given as $1 with the APPLE_IDENTITY certificate.
+# Does nothing when APPLE_IDENTITY is unset or empty.
+sign() {
+    if [ -n "$APPLE_IDENTITY" ]; then
+        # "$1" is quoted so a path containing spaces is passed as one argument
+        codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama "$1"
+    fi
+}
+
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
+
+case "${GOARCH}" in
+"amd64")
+    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
+
+    #
+    # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
+    #
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
+    echo "Building LCD CPU"
+    build
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
+    compress_libs
+
+    #
+    # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+    # Approximately 400% faster than LCD on same CPU
+    #
+    init_vars
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
+    echo "Building AVX CPU"
+    build
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
+    compress_libs
+
+    #
+    # ~2013 CPU Dynamic library
+    # Approximately 10% faster than AVX on same CPU
+    #
+    init_vars
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
+    echo "Building AVX2 CPU"
+    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
+    build
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
+    compress_libs
+    ;;
+"arm64")
+    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
+    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
+    build
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
+    compress_libs
+    ;;
+*)
+    echo "GOARCH must be set"
+    echo "this script is meant to be run from within go generate"
+    exit 1
+    ;;
+esac
+
+cleanup
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+# This script is intended to run inside the go generate
+# working directory must be llm/generate/
+
+# First we build one or more CPU based LLM libraries
+#
+# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
+# library dependencies
+#
+# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  The ROCM
+# libraries are quite large, and also dynamically load data files at runtime
+# which in turn are large, so we don't attempt to carry them as payload
+
+set -ex
+set -o pipefail
+
+# Print the list of AMD GPU targets to build for.
+# When AMDGPU_TARGETS is set it is printed unchanged; otherwise the default
+# list below is printed as one single-quoted, semicolon-separated string.
+# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
+amdGPUs() {
+    if [ -n "${AMDGPU_TARGETS}" ]; then
+        echo "${AMDGPU_TARGETS}"
+        return
+    fi
+    GPU_LIST=(
+        "gfx803"
+        "gfx900"
+        "gfx906:xnack-"
+        "gfx908:xnack-"
+        "gfx90a:xnack+"
+        "gfx90a:xnack-"
+        "gfx1010"
+        "gfx1012"
+        "gfx1030"
+        "gfx1100"
+        "gfx1101"
+        "gfx1102"
+    )
+    # IFS is changed inside a subshell so "${GPU_LIST[*]}" joins with ';'
+    # without affecting the rest of the script
+    (
+        IFS=$';'
+        echo "'${GPU_LIST[*]}'"
+    )
+}
+
+echo "Starting linux generate script"
+if [ -z "${CUDACXX}" ]; then
+    if [ -x /usr/local/cuda/bin/nvcc ]; then
+        export CUDACXX=/usr/local/cuda/bin/nvcc
+    else
+        # Try the default location in case it exists
+        export CUDACXX=$(command -v nvcc)
+    fi
+fi
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+source $(dirname $0)/gen_common.sh
+init_vars
+git_module_setup
+apply_patches
+
+if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
+    # Users building from source can tune the exact flags we pass to cmake for configuring
+    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
+    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
+        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
+        echo "Building custom CPU"
+        build
+        compress_libs
+    else
+        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
+        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
+        # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
+        # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
+        # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
+        # Note: the following seem to yield slower results than AVX2 - ymmv
+        # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
+        # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
+        # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
+
+        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
+            #
+            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
+            #
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
+            echo "Building LCD CPU"
+            build
+            compress_libs
+        fi
+
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+            #
+            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+            # Approximately 400% faster than LCD on same CPU
+            #
+            init_vars
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
+            echo "Building AVX CPU"
+            build
+            compress_libs
+        fi
+
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
+            #
+            # ~2013 CPU Dynamic library
+            # Approximately 10% faster than AVX on same CPU
+            #
+            init_vars
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
+            echo "Building AVX2 CPU"
+            build
+            compress_libs
+        fi
+    fi
+else
+    echo "Skipping CPU generation step as requested"
+fi
+
+# If needed, look for the default CUDA toolkit location
+if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
+    CUDA_LIB_DIR=/usr/local/cuda/lib64
+fi
+
+# If needed, look for CUDA on Arch Linux
+if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
+    CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
+fi
+
+# Build the CUDA variant when a CUDA library directory was found, and copy the
+# CUDA runtime libraries next to the built library so they ship as payloads.
+if [ -d "${CUDA_LIB_DIR}" ]; then
+    echo "CUDA libraries detected - building dynamic CUDA library"
+    init_vars
+    # Third dot-separated field of the first libcudart.so.* file name
+    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
+    if [ -n "${CUDA_MAJOR}" ]; then
+        CUDA_VARIANT=_v${CUDA_MAJOR}
+    fi
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
+    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    build
+
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
+    #
+    # TODO - in the future we may shift to packaging these separately and conditionally
+    #        downloading them in the install script.
+    DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
+    for lib in libcudart.so libcublas.so libcublasLt.so ; do
+        # Prefer the exact file name the built library is linked against
+        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
+        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
+            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
+        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
+            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
+        else
+            # The wildcard must stay outside the quotes: a quoted "*" is taken
+            # literally and cp would look for a file actually named "<lib>*"
+            cp -d "${CUDA_LIB_DIR}/${lib}"* "${BUILD_DIR}/lib/"
+        fi
+    done
+    compress_libs
+
+fi
+
+if [ -z "${ROCM_PATH}" ]; then
+    # Try the default location in case it exists
+    ROCM_PATH=/opt/rocm
+fi
+
+if [ -z "${CLBlast_DIR}" ]; then
+    # Try the default location in case it exists
+    if [ -d /usr/lib/cmake/CLBlast ]; then
+        export CLBlast_DIR=/usr/lib/cmake/CLBlast
+    fi
+fi
+
+if [ -d "${ROCM_PATH}" ]; then
+    echo "ROCm libraries detected - building dynamic ROCm library"
+    if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
+        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
+    fi
+    init_vars
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    build
+
+    # Note: the ROCM libs and runtime library files are too large to embed, so we depend on
+    #       them being present at runtime on the host
+    compress_libs
+fi
+
+cleanup
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -0,0 +1,170 @@
+#!powershell
+
+$ErrorActionPreference = "Stop"
+
+# Resets the script-scoped build settings (llama.cpp directory, cmake
+# definitions and targets, architecture, build configuration) to their
+# defaults and locates the CUDA directory and the optional gzip / dumpbin
+# tools.
+function init_vars {
+    $script:llamacppDir = "../llama.cpp"
+    $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off",  "-A","x64")
+    $script:cmakeTargets = @("ext_server")
+    $script:ARCH = "amd64" # arm not yet supported.
+    # -contains tests collection membership, so on a plain string it is only
+    # true when CGO_CFLAGS is exactly "-g". Use a substring match instead, the
+    # same test gen_common.sh performs with grep.
+    if ($env:CGO_CFLAGS -like "*-g*") {
+        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
+        $script:config = "RelWithDebInfo"
+    } else {
+        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
+        $script:config = "Release"
+    }
+    # Try to find the CUDA dir
+    # ($null goes on the left so the comparison is always a scalar test)
+    if ($null -eq $env:CUDA_LIB_DIR) {
+        $d=(get-command -ea 'silentlycontinue' nvcc).path
+        if ($null -ne $d) {
+            $script:CUDA_LIB_DIR=($d| split-path -parent)
+        }
+    } else {
+        $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
+    }
+    # Both are $null when the tool is not on the PATH; callers check for that
+    $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
+    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
+}
+
+# Initializes the git submodules and force-updates the llama.cpp checkout.
+# Exits the script with git's exit code if either command fails.
+function git_module_setup {
+    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
+    & git submodule init
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    & git submodule update --force "${script:llamacppDir}"
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+}
+
+# Prepares the llama.cpp checkout for the build: hooks in the ext_server
+# CMakeLists.txt, applies every ../patches/*.diff, and renames main() in
+# server.cpp.
+# NOTE(review): Set-Location is never undone here, so whenever a patch exists
+# the working directory is the llama.cpp checkout after this function returns -
+# confirm the relative paths used later still resolve as intended.
+function apply_patches {
+    # Wire up our CMakefile
+    if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
+        Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
+    }
+
+    # Apply temporary patches until fix is upstream
+    # The patch list is resolved before any Set-Location call below
+    $patches = Get-ChildItem "../patches/*.diff"
+    foreach ($patch in $patches) {
+        # Extract file paths from the patch file
+        # (the "+++ " lines, with everything up to the first "/" removed)
+        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
+            $parts = $_ -split ' '
+            ($parts[1] -split '/', 2)[1]
+        }
+
+        # Checkout each file
+        foreach ($file in $filePaths) {
+            Set-Location -Path ${script:llamacppDir}
+            git checkout $file
+        }
+    }
+
+    # Apply each patch
+    foreach ($patch in $patches) {
+        Set-Location -Path ${script:llamacppDir}
+        git apply $patch.FullName
+    }
+
+    # Avoid duplicate main symbols when we link into the cgo binary
+    $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
+    $content = $content -replace 'int main\(', 'int __main('
+    Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
+}
+
+# Configures llama.cpp into $script:buildDir with $script:cmakeDefs, then
+# builds every target in $script:cmakeTargets using $script:config.
+# Exits the script with cmake's exit code if either step fails.
+function build {
+    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
+    & cmake --version
+    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
+    # Each target name is expanded into its own "--target <name>" pair
+    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+}
+
+# Recreates ${script:buildDir}/lib and copies the freshly built ext_server.dll
+# and llama.dll into it.
+function install {
+    # -ea 0 suppresses errors, e.g. when the directory does not exist yet
+    rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
+    md "${script:buildDir}/lib" -ea 0 > $null
+    cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
+    cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
+
+    # Display the dll dependencies in the build log
+    if ($script:DUMPBIN -ne $null) {
+        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
+    }
+}
+
+# Compresses every dll in ${script:buildDir}/lib with gzip.
+# Leaves the files uncompressed when gzip was not found by init_vars.
+function compress_libs {
+    if ($script:GZIP -eq $null) {
+        write-host "gzip not installed, not compressing files"
+        return
+    }
+    write-host "Compressing dlls..."
+    $libs = dir "${script:buildDir}/lib/*.dll"
+    foreach ($file in $libs) {
+        & "$script:GZIP" --best -f $file
+    }
+}
+
+# Restores CMakeLists.txt and server.cpp in examples/server from git.
+# The working directory is left in examples/server afterwards.
+function cleanup {
+    Set-Location "${script:llamacppDir}/examples/server"
+    git checkout CMakeLists.txt server.cpp
+}
+
+init_vars
+git_module_setup
+apply_patches
+
+# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
+# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
+# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
+# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
+
+# Build the three CPU variants: lowest common denominator, AVX, and AVX2.
+$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
+
+$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
+write-host "Building LCD CPU"
+build
+install
+compress_libs
+
+# Reset cmakeDefs before each further variant, as gen_linux.sh and
+# gen_darwin.sh do. Without the reset the previous variant's flags stay in the
+# list after the new ones, and cmake uses the last -D given for an option, so
+# the "off" values from the earlier variant would win.
+init_vars
+$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
+write-host "Building AVX CPU"
+build
+install
+compress_libs
+
+init_vars
+$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
+write-host "Building AVX2 CPU"
+build
+install
+compress_libs
+
+if ($null -ne $script:CUDA_LIB_DIR) {
+    # Then build cuda as a dynamically loaded library
+    $nvcc = (get-command -ea 'silentlycontinue' nvcc)
+    if ($null -ne $nvcc) {
+        $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
+    }
+    if ($null -ne $script:CUDA_VERSION) {
+        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
+    }
+    init_vars
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
+    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
+    build
+    install
+    cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
+    cp "${script:CUDA_LIB_DIR}/cublas64_*.dll" "${script:buildDir}/lib"
+    cp "${script:CUDA_LIB_DIR}/cublasLt64_*.dll" "${script:buildDir}/lib"
+    compress_libs
+}
+# TODO - actually implement ROCm support on windows
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"
+
+rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
+md "${script:buildDir}/lib" -ea 0 > $null
+echo $null >> "${script:buildDir}/lib/.generated"
+
+cleanup
+write-host "`ngo generate completed"
--- a/llm/generate/generate_darwin.go
+++ b/llm/generate/generate_darwin.go
@@ -0,0 +1,3 @@
+package generate
+
+//go:generate sh ./gen_darwin.sh
--- a/llm/generate/generate_linux.go
+++ b/llm/generate/generate_linux.go
@@ -0,0 +1,3 @@
+package generate
+
+//go:generate bash ./gen_linux.sh
--- a/llm/generate/generate_windows.go
+++ b/llm/generate/generate_windows.go
@@ -0,0 +1,3 @@
+package generate
+
+//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -7,9 +7,10 @@ import (
 )

 type GGML struct {
-	magic uint32
 	container
 	model
+
+	Size int64
 }

 const (
@@ -77,70 +78,17 @@ type model interface {
 	ModelFamily() string
 	ModelType() string
 	FileType() string
-	NumLayers() int64
+	NumLayers() uint32
+	NumGQA() uint32
+	NumEmbed() uint32
+	NumHead() uint32
+	NumHeadKv() uint32
+	NumCtx() uint32
 }

 type container interface {
 	Name() string
-	Decode(io.Reader) (model, error)
-}
-
-type containerGGML struct{}
-
-func (c *containerGGML) Name() string {
-	return "ggml"
-}
-
-func (c *containerGGML) Decode(r io.Reader) (model, error) {
-	return nil, nil
-}
-
-type containerGGMF struct {
-	version uint32
-}
-
-func (c *containerGGMF) Name() string {
-	return "ggmf"
-}
-
-func (c *containerGGMF) Decode(r io.Reader) (model, error) {
-	var version uint32
-	binary.Read(r, binary.LittleEndian, &version)
-
-	switch version {
-	case 1:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-	return nil, nil
-}
-
-type containerGGJT struct {
-	version uint32
-}
-
-func (c *containerGGJT) Name() string {
-	return "ggjt"
-}
-
-func (c *containerGGJT) Decode(r io.Reader) (model, error) {
-	var version uint32
-	binary.Read(r, binary.LittleEndian, &version)
-
-	switch version {
-	case 1, 2, 3:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-
-	// different model types may have different layouts for hyperparameters
-	var llama llamaModel
-	binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
-	return &llama, nil
+	Decode(*readSeekOffset) (model, error)
 }

 type containerLORA struct {
@@ -151,9 +99,9 @@ func (c *containerLORA) Name() string {
 	return "ggla"
 }

-func (c *containerLORA) Decode(r io.Reader) (model, error) {
+func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
 	var version uint32
-	binary.Read(r, binary.LittleEndian, &version)
+	binary.Read(rso, binary.LittleEndian, &version)

 	switch version {
 	case 1:
@@ -162,6 +110,10 @@ func (c *containerLORA) Decode(r io.Reader) (model, error) {
 	}

 	c.version = version
+
+	// remaining file contents aren't decoded
+	rso.Seek(0, io.SeekEnd)
+
 	return nil, nil
 }

@@ -179,34 +131,60 @@ const (
 	FILE_MAGIC_GGUF_BE = 0x47475546
 )

-func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
-	var ggml GGML
-	binary.Read(r, binary.LittleEndian, &ggml.magic)
+var ErrUnsupportedFormat = errors.New("unsupported model format")

-	switch ggml.magic {
-	case FILE_MAGIC_GGML:
-		ggml.container = &containerGGML{}
-	case FILE_MAGIC_GGMF:
-		ggml.container = &containerGGMF{}
-	case FILE_MAGIC_GGJT:
-		ggml.container = &containerGGJT{}
+// DecodeGGML reads a little-endian uint32 magic from r, picks the matching
+// container, and decodes the model through an offset-tracking wrapper.
+//
+// It returns ErrUnsupportedFormat for the GGML, GGMF and GGJT magics, an
+// "invalid file magic" error for any unrecognised magic, and otherwise
+// whatever error the magic read or the container's Decode reports.
+func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
+	// every read and seek goes through ro so the final offset can be reported
+	ro := readSeekOffset{ReadSeeker: r}
+
+	var magic uint32
+	if err := binary.Read(&ro, binary.LittleEndian, &magic); err != nil {
+		return nil, err
+	}
+
+	var c container
+	switch magic {
+	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
+		return nil, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
-		ggml.container = &containerLORA{}
+		c = &containerLORA{}
 	case FILE_MAGIC_GGUF_LE:
-		ggml.container = &containerGGUF{bo: binary.LittleEndian}
+		c = &containerGGUF{bo: binary.LittleEndian}
 	case FILE_MAGIC_GGUF_BE:
-		ggml.container = &containerGGUF{bo: binary.BigEndian}
+		c = &containerGGUF{bo: binary.BigEndian}
 	default:
 		return nil, errors.New("invalid file magic")
 	}

-	model, err := ggml.Decode(r)
+	model, err := c.Decode(&ro)
 	if err != nil {
 		return nil, err
 	}

-	ggml.model = model
-
 	// final model type
-	return &ggml, nil
+	// Size is the offset the wrapper recorded after the container finished decoding
+	return &GGML{
+		container: c,
+		model:     model,
+		Size:      ro.offset,
+	}, nil
+}
+
+// readSeekOffset wraps an io.ReadSeeker and keeps a running offset that is
+// updated by every Read and Seek issued through the wrapper.
+type readSeekOffset struct {
+	io.ReadSeeker
+	offset int64
+}
+
+// Seek forwards to the wrapped seeker. On success the position it reports is
+// recorded and returned; on failure the recorded offset is left untouched and
+// 0 is returned with the error.
+func (rso *readSeekOffset) Seek(offset int64, whence int) (int64, error) {
+	pos, err := rso.ReadSeeker.Seek(offset, whence)
+	if err == nil {
+		rso.offset = pos
+		return pos, nil
+	}
+
+	return 0, err
+}
+
+// Read forwards to the wrapped reader and advances the recorded offset by the
+// number of bytes reported, even when the read also returns an error.
+func (rso *readSeekOffset) Read(p []byte) (int, error) {
+	count, err := rso.ReadSeeker.Read(p)
+	rso.offset += int64(count)
+	return count, err
 }
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -23,26 +23,24 @@ type containerGGUF struct {
 		NumTensor uint64
 		NumKV     uint64
 	}
-
-	parameters uint64
 }

 func (c *containerGGUF) Name() string {
 	return "gguf"
 }

-func (c *containerGGUF) Decode(r io.Reader) (model, error) {
-	binary.Read(r, c.bo, &c.Version)
+func (c *containerGGUF) Decode(rso *readSeekOffset) (model, error) {
+	binary.Read(rso, c.bo, &c.Version)

 	switch c.Version {
 	case 1:
-		binary.Read(r, c.bo, &c.V1)
+		binary.Read(rso, c.bo, &c.V1)
 	default:
-		binary.Read(r, c.bo, &c.V2)
+		binary.Read(rso, c.bo, &c.V2)
 	}

 	model := newGGUFModel(c)
-	if err := model.Decode(r); err != nil {
+	if err := model.Decode(rso); err != nil {
 		return nil, err
 	}

@@ -67,9 +65,76 @@ const (

 type kv map[string]any

+// tensor is one entry of the tensor table decoded by ggufModel.Decode: the
+// name, the numeric type id (kind), the offset value as read from the file,
+// and the shape. Dimensions the file does not list are left at 1 by Decode.
+type tensor struct {
+	name   string
+	kind   uint32
+	offset uint64
+
+	// shape is the number of elements in each dimension
+	shape [4]uint64
+}
+
+// blockSize returns the divisor size() applies for this tensor's kind:
+// 1 for kinds 0-1, 32 for kinds 2-9, and 256 for every kind from 10 upwards.
+func (t tensor) blockSize() uint64 {
+	if t.kind >= 10 {
+		return 256
+	}
+
+	if t.kind >= 2 {
+		return 32
+	}
+
+	return 1
+}
+
+func (t tensor) typeSize() uint64 {
+	blockSize := t.blockSize()
+
+	switch t.kind {
+	case 0: // FP32
+		return 4
+	case 1: // FP16
+		return 2
+	case 2: // Q4_0
+		return 2 + blockSize/2
+	case 3: // Q4_1
+		return 2 + 2 + blockSize/2
+	case 6: // Q5_0
+		return 2 + 4 + blockSize/2
+	case 7: // Q5_1
+		return 2 + 2 + 4 + blockSize/2
+	case 8: // Q8_0
+		return 2 + blockSize
+	case 9: // Q8_1
+		return 4 + 4 + blockSize
+	case 10: // Q2_K
+		return blockSize/16 + blockSize/4 + 2 + 2
+	case 11: // Q3_K
+		return blockSize/8 + blockSize/4 + 12 + 2
+	case 12: // Q4_K
+		return 2 + 2 + 12 + blockSize/2
+	case 13: // Q5_K
+		return 2 + 2 + 12 + blockSize/8 + blockSize/2
+	case 14: // Q6_K
+		return blockSize/2 + blockSize/4 + blockSize/16 + 2
+	default:
+		return 0
+	}
+}
+
+// parameters returns the element count of the tensor: the product of all
+// four shape entries.
+func (t tensor) parameters() uint64 {
+	count := uint64(1)
+	for _, dim := range t.shape {
+		count *= dim
+	}
+
+	return count
+}
+
+// size returns parameters() scaled by typeSize() and divided by blockSize(),
+// multiplying before dividing.
+func (t tensor) size() uint64 {
+	perBlock, elems := t.typeSize(), t.blockSize()
+	return t.parameters() * perBlock / elems
+}
+
+// ggufModel holds what Decode reads from a GGUF file: the key-value metadata
+// and the tensor table, alongside the embedded container header.
 type ggufModel struct {
 	*containerGGUF
+
 	kv
+	// tensors has one entry appended by Decode for each tensor it reads
+	tensors []tensor
+
+	// parameters is the sum of tensor.parameters() over the decoded tensors
+	parameters uint64
 }

 func newGGUFModel(container *containerGGUF) *ggufModel {
@@ -96,8 +161,7 @@ func (llm *ggufModel) NumKV() uint64 {
 }

 func (llm *ggufModel) ModelFamily() string {
-	t, ok := llm.kv["general.architecture"].(string)
-	if ok {
+	if t, ok := llm.kv["general.architecture"].(string); ok {
 		return t
 	}

@@ -109,82 +173,60 @@ func (llm *ggufModel) ModelType() string {
 		return format.HumanNumber(llm.parameters)
 	}

-	switch llm.ModelFamily() {
-	case "llama":
-		if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
-			heads, headsOK := llm.kv["llama.head_count"].(uint32)
-			headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
-			if headsOK && headsKVsOK && heads/headKVs == 8 {
-				return "70B"
-			}
-
-			return llamaModelType(blocks)
-		}
-	case "falcon":
-		if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
-			return falconModelType(blocks)
-		}
-	case "starcoder":
-		if blocks, ok := llm.kv["starcoder.block_count"].(uint32); ok {
-			return starCoderModelType(blocks)
-		}
-	}
-
 	return "unknown"
 }

+// FileType passes the uint32 "general.file_type" metadata value to fileType,
+// or returns "unknown" when the key is missing or is not a uint32.
 func (llm *ggufModel) FileType() string {
-	t, ok := llm.kv["general.file_type"].(uint32)
-	if ok {
+	if t, ok := llm.kv["general.file_type"].(uint32); ok {
 		return fileType(t)
 	}

 	return "unknown"
 }

-func (llm *ggufModel) Decode(r io.Reader) error {
+func (llm *ggufModel) Decode(rso *readSeekOffset) error {
 	// decode key-values
 	for i := 0; uint64(i) < llm.NumKV(); i++ {
-		k, err := llm.readString(r)
+		k, err := llm.readString(rso)
 		if err != nil {
 			return err
 		}

-		vtype := llm.readU32(r)
+		vtype := llm.readU32(rso)

 		var v any
 		switch vtype {
 		case ggufTypeUint8:
-			v = llm.readU8(r)
+			v = llm.readU8(rso)
 		case ggufTypeInt8:
-			v = llm.readI8(r)
+			v = llm.readI8(rso)
 		case ggufTypeUint16:
-			v = llm.readU16(r)
+			v = llm.readU16(rso)
 		case ggufTypeInt16:
-			v = llm.readI16(r)
+			v = llm.readI16(rso)
 		case ggufTypeUint32:
-			v = llm.readU32(r)
+			v = llm.readU32(rso)
 		case ggufTypeInt32:
-			v = llm.readI32(r)
+			v = llm.readI32(rso)
 		case ggufTypeUint64:
-			v = llm.readU64(r)
+			v = llm.readU64(rso)
 		case ggufTypeInt64:
-			v = llm.readI64(r)
+			v = llm.readI64(rso)
 		case ggufTypeFloat32:
-			v = llm.readF32(r)
+			v = llm.readF32(rso)
 		case ggufTypeFloat64:
-			v = llm.readF64(r)
+			v = llm.readF64(rso)
 		case ggufTypeBool:
-			v = llm.readBool(r)
+			v = llm.readBool(rso)
 		case ggufTypeString:
-			s, err := llm.readString(r)
+			s, err := llm.readString(rso)
 			if err != nil {
 				return err
 			}

 			v = s
 		case ggufTypeArray:
-			a, err := llm.readArray(r)
+			a, err := llm.readArray(rso)
 			if err != nil {
 				return err
 			}
@@ -199,34 +241,96 @@ func (llm *ggufModel) Decode(r io.Reader) error {

 	// decode tensors
 	for i := 0; uint64(i) < llm.NumTensor(); i++ {
-		if _, err := llm.readString(r); err != nil {
+		name, err := llm.readString(rso)
+		if err != nil {
 			return err
 		}

-		dimensions := llm.readU32(r)
+		// dims is the number of dimensions in the tensor
+		dims := llm.readU32(rso)

-		var elements uint64 = 1
-		for i := 0; uint32(i) < dimensions; i++ {
-			elements *= llm.readU64(r)
+		// dims comes from the file; reject values that would index past the
+		// fixed-size shape array instead of panicking
+		shape := [4]uint64{1, 1, 1, 1}
+		if dims > uint32(len(shape)) {
+			return fmt.Errorf("unsupported tensor dimensions: %d", dims)
+		}
+
+		for i := 0; uint32(i) < dims; i++ {
+			shape[i] = llm.readU64(rso)
 		}

-		llm.readU32(r) // type
-		llm.readU64(r) // offset
+		tensor := tensor{
+			name:   name,
+			kind:   llm.readU32(rso),
+			offset: llm.readU64(rso),
+			shape:  shape,
+		}

-		llm.parameters += elements
+		llm.tensors = append(llm.tensors, tensor)
+		llm.parameters += tensor.parameters()
+	}
+
+	// fall back to 32 when the key is absent, not a uint32, or zero (a zero
+	// alignment would divide by zero in the modulo below)
+	alignment, ok := llm.kv["general.alignment"].(uint32)
+	if !ok || alignment == 0 {
+		alignment = 32
+	}
+
+	align := int64(alignment)
+
+	// skip the padding up to the next aligned offset; the outer modulo makes
+	// the padding zero when the offset is already aligned
+	if _, err := rso.Seek((align-rso.offset%align)%align, io.SeekCurrent); err != nil {
+		return err
+	}
+
+	// advance past each tensor's data, rounded up to the alignment
+	// NOTE(review): the mask rounding assumes alignment is a power of two
+	for _, tensor := range llm.tensors {
+		padded := (int64(tensor.size()) + align - 1) & ^(align - 1)
+		if _, err := rso.Seek(padded, io.SeekCurrent); err != nil {
+			return err
+		}
 	}

 	return nil
 }

-func (llm *ggufModel) NumLayers() int64 {
+// NumLayers returns the "<architecture>.block_count" metadata value, or 0
+// when the key is missing or is not stored as a uint32.
+func (llm *ggufModel) NumLayers() uint32 {
 	value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
 	if !exists {
 		return 0
 	}

-	v := value.(uint32)
-	return int64(v)
+	// checked assertion: metadata of another type must not panic
+	blocks, ok := value.(uint32)
+	if !ok {
+		return 0
+	}
+
+	return blocks
+}
+
+// NumHead returns the "<architecture>.attention.head_count" metadata value,
+// or 0 when the key is missing or is not stored as a uint32.
+func (llm *ggufModel) NumHead() uint32 {
+	// a missing key yields a nil value, which fails the checked assertion too
+	if heads, ok := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())].(uint32); ok {
+		return heads
+	}
+
+	return 0
+}
+
+// NumEmbed returns the "<architecture>.embedding_length" metadata value, or
+// 0 when the key is missing or is not stored as a uint32.
+func (llm *ggufModel) NumEmbed() uint32 {
+	// a missing key yields a nil value, which fails the checked assertion too
+	if length, ok := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())].(uint32); ok {
+		return length
+	}
+
+	return 0
+}
+
+// NumHeadKv returns the "<architecture>.attention.head_count_kv" metadata
+// value, or 0 when the key is missing or is not stored as a uint32.
+func (llm *ggufModel) NumHeadKv() uint32 {
+	// a missing key yields a nil value, which fails the checked assertion too
+	if heads, ok := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())].(uint32); ok {
+		return heads
+	}
+
+	return 0
+}
+
+// NumCtx returns the "<architecture>.context_length" metadata value, or 0
+// when the key is missing or is not stored as a uint32.
+func (llm *ggufModel) NumCtx() uint32 {
+	// a missing key yields a nil value, which fails the checked assertion too
+	if length, ok := llm.kv[fmt.Sprintf("%s.context_length", llm.ModelFamily())].(uint32); ok {
+		return length
+	}
+
+	return 0
+}
+
+// NumGQA returns NumHead() divided by NumHeadKv(), or 0 when NumHeadKv()
+// is 0 so the division is never attempted with a zero divisor.
+func (llm *ggufModel) NumGQA() uint32 {
+	if kvHeads := llm.NumHeadKv(); kvHeads != 0 {
+		return llm.NumHead() / kvHeads
+	}
+
+	return 0
 }

 func (llm ggufModel) readU8(r io.Reader) uint8 {
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/llama.cpp/generate_darwin_amd64.go
+++ b/llm/llama.cpp/generate_darwin_amd64.go
@@ -1,18 +0,0 @@
-package llm
-
-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
--- a/llm/llama.cpp/generate_darwin_arm64.go
+++ b/llm/llama.cpp/generate_darwin_arm64.go
@@ -1,18 +0,0 @@
-package llm
-
-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/metal --target server --config Release
-//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build gguf/build/metal --target server --config Release
-//go:generate mv gguf/build/metal/bin/server gguf/build/metal/bin/ollama-runner
--- a/llm/llama.cpp/generate_linux.go
+++ b/llm/llama.cpp/generate_linux.go
@@ -1,26 +0,0 @@
-package llm
-
-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
-//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
-
-//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cuda --target server --config Release
-//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
-//go:generate cmake --build gguf/build/cuda --target server --config Release
-//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
--- a/llm/llama.cpp/generate_windows.go
+++ b/llm/llama.cpp/generate_windows.go
@@ -1,24 +0,0 @@
-package llm
-
-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate cmd /c move gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe
-
-//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cuda --target server --config Release
-//go:generate cmd /c move ggml\build\cuda\bin\Release\server.exe ggml\build\cuda\bin\Release\ollama-runner.exe
-
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cuda --target server --config Release
-//go:generate cmd /c move gguf\build\cuda\bin\Release\server.exe gguf\build\cuda\bin\Release\ollama-runner.exe
--- a/llm/llama.cpp/ggml
+++ b/llm/llama.cpp/ggml
--- a/Show More
+++ b/Show More