add faq for what is context

Signed-off-by: Matt Williams <m@technovangelist.com>
Update readmes, requirements, packagejsons, etc for all examples (#1452 )
2023-12-22 09:20:23 -08:00 · 2023-12-22 09:10:41 -08:00 · 2023-12-22 09:10:01 -08:00 · 2023-12-22 08:57:17 -08:00 · 2023-12-22 08:47:18 -08:00 · 2023-12-22 08:16:31 -08:00
107 changed files with 4612 additions and 1886 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -2,8 +2,7 @@
 ollama
 app
 dist
-scripts
-llm/llama.cpp/ggml
 llm/llama.cpp/gguf
 .env
 .cache
+test_data
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ ollama
 ggml-metal.metal
 .cache
 *.exe
-.idea
+.idea
+test_data
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,8 +1,3 @@
-[submodule "llm/llama.cpp/ggml"]
-    path = llm/llama.cpp/ggml
-    url = https://github.com/ggerganov/llama.cpp.git
-    ignore = dirty
-    shallow = true
 [submodule "llm/llama.cpp/gguf"]
    path = llm/llama.cpp/gguf
    url = https://github.com/ggerganov/llama.cpp.git
--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -1,23 +1,65 @@
-# centos7 amd64 dependencies
-FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
-RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
-    yum update -y && \
-    yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
-RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+# Ubuntu 20.04 amd64 dependencies
+FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
+ARG CUDA_VERSION=11.3.1-1
+ARG CMAKE_VERSION=3.22.1
+# ROCm only supports amd64
+ARG ROCM_VERSION=6.0
+ARG CLBLAST_VER=1.6.1

-# centos8 arm64 dependencies
-FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
-RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
-RUN yum install -y git cmake
+# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
+RUN apt-get update && \
+    apt-get install -y wget gnupg && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
+    mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
+    echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
+    wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    mkdir --parents --mode=0755 /etc/apt/keyrings && \
+    wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
+    echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
+
+# CLBlast
+RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
+    cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install
+
+ENV ROCM_PATH=/opt/rocm
+
+# Ubuntu 20.04 arm64 dependencies
+FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64
+ARG CUDA_VERSION=11.3.1-1
+ARG CMAKE_VERSION=3.27.6
+RUN apt-get update && \
+    apt-get install -y wget gnupg && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \
+    mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa//3bf863cc.pub && \
+    echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \
+    wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    apt-get update && \
+    apt-cache madison cuda && \
+    DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} 

 FROM base-${TARGETARCH}
 ARG TARGETARCH
 ARG GOFLAGS="'-ldflags -w -s'"
+ARG CGO_CFLAGS
+ARG GOLANG_VERSION=1.21.3
+
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10

 # install go
-ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
-RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
+ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go${GOLANG_VERSION}.tar.gz

 # build the final binary
 WORKDIR /go/src/github.com/jmorganca/ollama
@@ -26,6 +68,7 @@ COPY . .
 ENV GOOS=linux
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
+ENV CGO_CFLAGS=${CGO_CFLAGS}

 RUN /usr/local/go/bin/go generate ./... && \
    /usr/local/go/bin/go build .
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ Get up and running with large language models locally.

 ### Windows

-Coming soon!
+Coming soon! For now, you can install Ollama on Windows via WSL2.

 ### Linux & WSL2

@@ -47,18 +47,20 @@ Here are some example open-source models that can be downloaded:

 | Model              | Parameters | Size  | Download                       |
 | ------------------ | ---------- | ----- | ------------------------------ |
+| Llama 2            | 7B         | 3.8GB | `ollama run llama2`            |
+| Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
+| Phi-2              | 2.7B       | 1.7GB | `ollama run phi`               |
 | Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
 | Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
-| Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
-| Llama 2            | 7B         | 3.8GB | `ollama run llama2`            |
 | Code Llama         | 7B         | 3.8GB | `ollama run codellama`         |
 | Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored` |
 | Llama 2 13B        | 13B        | 7.3GB | `ollama run llama2:13b`        |
 | Llama 2 70B        | 70B        | 39GB  | `ollama run llama2:70b`        |
 | Orca Mini          | 3B         | 1.9GB | `ollama run orca-mini`         |
 | Vicuna             | 7B         | 3.8GB | `ollama run vicuna`            |
+| LLaVA              | 7B         | 4.5GB | `ollama run llava`             |

-> Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.
+> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.

 ## Customize your own model

@@ -104,7 +106,7 @@ FROM llama2
 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1

-# set the system prompt
+# set the system message
 SYSTEM """
 You are Mario from Super Mario Bros. Answer as Mario, the assistant, only.
 """
@@ -127,6 +129,10 @@ For more examples, see the [examples](examples) directory. For more information

 `ollama create` is used to create a model from a Modelfile.

+```
+ollama create mymodel -f ./Modelfile
+```
+
 ### Pull a model

 ```
@@ -158,6 +164,13 @@ For multiline input, you can wrap text with `"""`:
 I'm a basic program that prints the famous "Hello, world!" message to the console.
 ```

+### Multimodal models
+
+```
+>>> What's in this image? /Users/jmorgan/Desktop/smile.png
+The image features a yellow smiley face, which is likely the central focus of the picture.
+```
+
 ### Pass in prompt as arguments

 ```
@@ -183,13 +196,19 @@ Install `cmake` and `go`:
 brew install cmake go
 ```

-Then generate dependencies and build:
-
+Then generate dependencies:
 ```
 go generate ./...
+```
+Then build the binary:
+```
 go build .
 ```

+More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md).
+
+
+### Running local builds
 Next, start the server:

 ```
@@ -205,7 +224,8 @@ Finally, in a separate shell, run a model:
 ## REST API

 Ollama has a REST API for running and managing models.
-For example, to generate text from a model:
+
+### Generate a response

 ```
 curl http://localhost:11434/api/generate -d '{
@@ -214,7 +234,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-Or send a chat message (coming in 0.1.14):
+### Chat with a model

 ```
 curl http://localhost:11434/api/chat -d '{
@@ -230,7 +250,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ## Community Integrations

 ### Web & Desktop
-
+- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
@@ -252,6 +272,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ogpt.nvim](https://github.com/huynle/ogpt.nvim)
 - [gptel Emacs client](https://github.com/karthink/gptel)
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)
+- [cmdh](https://github.com/pgibler/cmdh)
+
+### Database
+
+- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md)

 ### Package managers

@@ -270,10 +295,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
 - [Ollama for Dart](https://github.com/breitburg/dart-ollama)
 - [Ollama for Laravel](https://github.com/cloudstudio/ollama-laravel)
+- [LangChainDart](https://github.com/davidmigloz/langchain_dart)

 ### Mobile

- [Maid](https://github.com/danemadsen/Maid) (Mobile Artificial Intelligence Distribution)
+- [Enchanted](https://github.com/AugustDev/enchanted)
+- [Maid](https://github.com/danemadsen/Maid)

 ### Extensions & Plugins

--- a/api/types.go
+++ b/api/types.go
@@ -31,15 +31,18 @@ func (e StatusError) Error() string {
 	}
 }

+type ImageData []byte
+
 type GenerateRequest struct {
-	Model    string `json:"model"`
-	Prompt   string `json:"prompt"`
-	System   string `json:"system"`
-	Template string `json:"template"`
-	Context  []int  `json:"context,omitempty"`
-	Stream   *bool  `json:"stream,omitempty"`
-	Raw      bool   `json:"raw,omitempty"`
-	Format   string `json:"format"`
+	Model    string      `json:"model"`
+	Prompt   string      `json:"prompt"`
+	System   string      `json:"system"`
+	Template string      `json:"template"`
+	Context  []int       `json:"context,omitempty"`
+	Stream   *bool       `json:"stream,omitempty"`
+	Raw      bool        `json:"raw,omitempty"`
+	Format   string      `json:"format"`
+	Images   []ImageData `json:"images,omitempty"`

 	Options map[string]interface{} `json:"options"`
 }
@@ -54,14 +57,15 @@ type ChatRequest struct {
 }

 type Message struct {
-	Role    string `json:"role"` // one of ["system", "user", "assistant"]
-	Content string `json:"content"`
+	Role    string      `json:"role"` // one of ["system", "user", "assistant"]
+	Content string      `json:"content"`
+	Images  []ImageData `json:"images,omitempty"`
 }

 type ChatResponse struct {
 	Model     string    `json:"model"`
 	CreatedAt time.Time `json:"created_at"`
-	Message   *Message  `json:"message,omitempty"`
+	Message   Message   `json:"message"`

 	Done bool `json:"done"`

@@ -148,11 +152,12 @@ type ShowRequest struct {
 }

 type ShowResponse struct {
-	License    string `json:"license,omitempty"`
-	Modelfile  string `json:"modelfile,omitempty"`
-	Parameters string `json:"parameters,omitempty"`
-	Template   string `json:"template,omitempty"`
-	System     string `json:"system,omitempty"`
+	License    string       `json:"license,omitempty"`
+	Modelfile  string       `json:"modelfile,omitempty"`
+	Parameters string       `json:"parameters,omitempty"`
+	Template   string       `json:"template,omitempty"`
+	System     string       `json:"system,omitempty"`
+	Details    ModelDetails `json:"details,omitempty"`
 }

 type CopyRequest struct {
@@ -188,10 +193,11 @@ type ListResponse struct {
 }

 type ModelResponse struct {
-	Name       string    `json:"name"`
-	ModifiedAt time.Time `json:"modified_at"`
-	Size       int64     `json:"size"`
-	Digest     string    `json:"digest"`
+	Name       string       `json:"name"`
+	ModifiedAt time.Time    `json:"modified_at"`
+	Size       int64        `json:"size"`
+	Digest     string       `json:"digest"`
+	Details    ModelDetails `json:"details,omitempty"`
 }

 type TokenResponse struct {
@@ -209,6 +215,14 @@ type GenerateResponse struct {
 	Metrics
 }

+type ModelDetails struct {
+	Format            string   `json:"format"`
+	Family            string   `json:"family"`
+	Families          []string `json:"families"`
+	ParameterSize     string   `json:"parameter_size"`
+	QuantizationLevel string   `json:"quantization_level"`
+}
+
 func (m *Metrics) Summary() {
 	if m.TotalDuration > 0 {
 		fmt.Fprintf(os.Stderr, "total duration:       %v\n", m.TotalDuration)
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -17,6 +17,7 @@ import (
 	"os/exec"
 	"os/signal"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strings"
 	"syscall"
@@ -25,6 +26,7 @@ import (
 	"github.com/olekukonko/tablewriter"
 	"github.com/spf13/cobra"
 	"golang.org/x/crypto/ssh"
+	"golang.org/x/exp/slices"
 	"golang.org/x/term"

 	"github.com/jmorganca/ollama/api"
@@ -36,6 +38,8 @@ import (
 	"github.com/jmorganca/ollama/version"
 )

+type ImageData []byte
+
 func CreateHandler(cmd *cobra.Command, args []string) error {
 	filename, _ := cmd.Flags().GetString("file")
 	filename, err := filepath.Abs(filename)
@@ -418,6 +422,7 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 		Model:    args[0],
 		WordWrap: os.Getenv("TERM") == "xterm-256color",
 		Options:  map[string]interface{}{},
+		Images:   []ImageData{},
 	}

 	format, err := cmd.Flags().GetString("format")
@@ -427,7 +432,6 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 	opts.Format = format

 	prompts := args[1:]
-
 	// prepend stdin to the prompt if provided
 	if !term.IsTerminal(int(os.Stdin.Fd())) {
 		in, err := io.ReadAll(os.Stdin)
@@ -466,6 +470,7 @@ type generateOptions struct {
 	Format   string
 	System   string
 	Template string
+	Images   []ImageData
 	Options  map[string]interface{}
 }

@@ -551,6 +556,10 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 		return nil
 	}

+	images := make([]api.ImageData, 0)
+	for _, i := range opts.Images {
+		images = append(images, api.ImageData(i))
+	}
 	request := api.GenerateRequest{
 		Model:    opts.Model,
 		Prompt:   opts.Prompt,
@@ -559,13 +568,34 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 		System:   opts.System,
 		Template: opts.Template,
 		Options:  opts.Options,
+		Images:   images,
 	}

 	if err := client.Generate(ctx, &request, fn); err != nil {
-		if errors.Is(err, context.Canceled) {
+		switch {
+		case errors.Is(err, context.Canceled):
 			return nil
+		case strings.Contains(err.Error(), "unsupported model format"):
+			// pull and retry to see if the model has been updated
+			parts := strings.Split(opts.Model, string(os.PathSeparator))
+			if len(parts) == 1 {
+				// this is a library model, log some info
+				fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
+			}
+			if err := PullHandler(cmd, []string{opts.Model}); err != nil {
+				fmt.Printf("Error: %s\n", err)
+				return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
+			}
+			// retry
+			if err := client.Generate(ctx, &request, fn); err != nil {
+				if errors.Is(err, context.Canceled) {
+					return nil
+				}
+				return err
+			}
+		default:
+			return err
 		}
-		return err
 	}
 	if opts.Prompt != "" {
 		fmt.Println()
@@ -585,7 +615,9 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 		latest.Summary()
 	}

-	cmd.SetContext(context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context))
+	ctx = context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context)
+	cmd.SetContext(ctx)
+
 	return nil
 }

@@ -598,11 +630,31 @@ const (
 	MultilineTemplate
 )

+func modelIsMultiModal(cmd *cobra.Command, name string) bool {
+	// get model details
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		fmt.Println("error: couldn't connect to ollama server")
+		return false
+	}
+
+	req := api.ShowRequest{Name: name}
+	resp, err := client.Show(cmd.Context(), &req)
+	if err != nil {
+		return false
+	}
+
+	return slices.Contains(resp.Details.Families, "clip")
+}
+
 func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
+	multiModal := modelIsMultiModal(cmd, opts.Model)
+
 	// load the model
 	loadOpts := generateOptions{
 		Model:  opts.Model,
 		Prompt: "",
+		Images: []ImageData{},
 	}
 	if err := generate(cmd, loadOpts); err != nil {
 		return err
@@ -622,7 +674,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 	usageSet := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
 		fmt.Fprintln(os.Stderr, "  /set parameter ...     Set a parameter")
-		fmt.Fprintln(os.Stderr, "  /set system <string>   Set system prompt")
+		fmt.Fprintln(os.Stderr, "  /set system <string>   Set system message")
 		fmt.Fprintln(os.Stderr, "  /set template <string> Set prompt template")
 		fmt.Fprintln(os.Stderr, "  /set history           Enable history")
 		fmt.Fprintln(os.Stderr, "  /set nohistory         Disable history")
@@ -640,7 +692,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 		fmt.Fprintln(os.Stderr, "  /show license      Show model license")
 		fmt.Fprintln(os.Stderr, "  /show modelfile    Show Modelfile for this model")
 		fmt.Fprintln(os.Stderr, "  /show parameters   Show parameters for this model")
-		fmt.Fprintln(os.Stderr, "  /show system       Show system prompt")
+		fmt.Fprintln(os.Stderr, "  /show system       Show system message")
 		fmt.Fprintln(os.Stderr, "  /show template     Show prompt template")
 		fmt.Fprintln(os.Stderr, "")
 	}
@@ -701,9 +753,10 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 			// if the prompt so far starts with """ then we're in multiline mode
 			// and we need to keep reading until we find a line that ends with """
 			cut, found := strings.CutSuffix(line, `"""`)
-			prompt += cut + "\n"
+			prompt += cut

 			if !found {
+				prompt += "\n"
 				continue
 			}

@@ -714,11 +767,11 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 			case MultilineSystem:
 				opts.System = prompt
 				prompt = ""
-				fmt.Println("Set system template.")
+				fmt.Println("Set system message.")
 			case MultilineTemplate:
 				opts.Template = prompt
 				prompt = ""
-				fmt.Println("Set model template.")
+				fmt.Println("Set prompt template.")
 			}
 			multiline = MultilineNone
 		case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
@@ -789,17 +842,18 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 					line = strings.TrimPrefix(line, `"""`)
 					if strings.HasPrefix(args[2], `"""`) {
 						cut, found := strings.CutSuffix(line, `"""`)
-						prompt += cut + "\n"
+						prompt += cut
 						if found {
-							opts.System = prompt
 							if args[1] == "system" {
-								fmt.Println("Set system template.")
+								opts.System = prompt
+								fmt.Println("Set system message.")
 							} else {
+								opts.Template = prompt
 								fmt.Println("Set prompt template.")
 							}
 							prompt = ""
 						} else {
-							prompt = `"""` + prompt
+							prompt = `"""` + prompt + "\n"
 							if args[1] == "system" {
 								multiline = MultilineSystem
 							} else {
@@ -809,7 +863,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 						}
 					} else {
 						opts.System = line
-						fmt.Println("Set system template.")
+						fmt.Println("Set system message.")
 					}
 				default:
 					fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
@@ -861,7 +915,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 					case resp.System != "":
 						fmt.Println(resp.System + "\n")
 					default:
-						fmt.Print("No system prompt was specified for this model.\n\n")
+						fmt.Print("No system message was specified for this model.\n\n")
 					}
 				case "template":
 					switch {
@@ -894,14 +948,50 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 			return nil
 		case strings.HasPrefix(line, "/"):
 			args := strings.Fields(line)
-			fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
-			continue
+			isFile := false
+
+			if multiModal {
+				for _, f := range extractFileNames(line) {
+					if strings.HasPrefix(f, args[0]) {
+						isFile = true
+						break
+					}
+				}
+			}
+
+			if isFile {
+				prompt += line
+			} else {
+				fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
+				continue
+			}
 		default:
 			prompt += line
 		}

 		if len(prompt) > 0 && multiline == MultilineNone {
 			opts.Prompt = prompt
+			if multiModal {
+				newPrompt, images, err := extractFileData(prompt)
+				if err != nil {
+					return err
+				}
+				opts.Prompt = newPrompt
+
+				// reset the context if we find another image
+				if len(images) > 0 {
+					opts.Images = images
+					ctx := cmd.Context()
+					ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
+					cmd.SetContext(ctx)
+				}
+				if len(opts.Images) == 0 {
+					fmt.Println("This model requires you to add a jpeg, png, or svg image.")
+					fmt.Println()
+					prompt = ""
+					continue
+				}
+			}
 			if err := generate(cmd, opts); err != nil {
 				return err
 			}
@@ -911,6 +1001,61 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 	}
 }

+func normalizeFilePath(fp string) string {
+	// Define a map of escaped characters and their replacements
+	replacements := map[string]string{
+		"\\ ":  " ",  // Escaped space
+		"\\(":  "(",  // Escaped left parenthesis
+		"\\)":  ")",  // Escaped right parenthesis
+		"\\[":  "[",  // Escaped left square bracket
+		"\\]":  "]",  // Escaped right square bracket
+		"\\{":  "{",  // Escaped left curly brace
+		"\\}":  "}",  // Escaped right curly brace
+		"\\$":  "$",  // Escaped dollar sign
+		"\\&":  "&",  // Escaped ampersand
+		"\\;":  ";",  // Escaped semicolon
+		"\\'":  "'",  // Escaped single quote
+		"\\\\": "\\", // Escaped backslash
+		"\\*":  "*",  // Escaped asterisk
+		"\\?":  "?",  // Escaped question mark
+	}
+
+	for escaped, actual := range replacements {
+		fp = strings.ReplaceAll(fp, escaped, actual)
+	}
+	return fp
+}
+
+func extractFileNames(input string) []string {
+	// Regex to match file paths starting with / or ./ and include escaped spaces (\ or %20)
+	// and followed by more characters and a file extension
+	regexPattern := `(?:\./|/)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
+	re := regexp.MustCompile(regexPattern)
+
+	return re.FindAllString(input, -1)
+}
+
+func extractFileData(input string) (string, []ImageData, error) {
+	filePaths := extractFileNames(input)
+	var imgs []ImageData
+
+	for _, fp := range filePaths {
+		nfp := normalizeFilePath(fp)
+		data, err := getImageData(nfp)
+		if err != nil {
+			if os.IsNotExist(err) {
+				continue
+			}
+			fmt.Printf("Couldn't process image: %q\n", err)
+			return "", imgs, err
+		}
+		fmt.Printf("Added image '%s'\n", nfp)
+		input = strings.ReplaceAll(input, fp, "")
+		imgs = append(imgs, data)
+	}
+	return input, imgs, nil
+}
+
 func RunServer(cmd *cobra.Command, _ []string) error {
 	host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
 	if err != nil {
@@ -929,12 +1074,51 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 		return err
 	}

-	var origins []string
-	if o := os.Getenv("OLLAMA_ORIGINS"); o != "" {
-		origins = strings.Split(o, ",")
+	return server.Serve(ln)
+}
+
+func getImageData(filePath string) ([]byte, error) {
+	file, err := os.Open(filePath)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	buf := make([]byte, 512)
+	_, err = file.Read(buf)
+	if err != nil {
+		return nil, err
 	}

-	return server.Serve(ln, origins)
+	contentType := http.DetectContentType(buf)
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
+	if !slices.Contains(allowedTypes, contentType) {
+		return nil, fmt.Errorf("invalid image type: %s", contentType)
+	}
+
+	info, err := file.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	// Check if the file size exceeds 100MB
+	var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
+	if info.Size() > maxSize {
+		return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
+	}
+
+	buf = make([]byte, info.Size())
+	_, err = file.Seek(0, 0)
+	if err != nil {
+		return nil, err
+	}
+
+	_, err = io.ReadFull(file, buf)
+	if err != nil {
+		return nil, err
+	}
+
+	return buf, nil
 }

 func initializeKeypair() error {
@@ -1103,7 +1287,7 @@ func NewCLI() *cobra.Command {
 	showCmd.Flags().Bool("modelfile", false, "Show Modelfile of a model")
 	showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
 	showCmd.Flags().Bool("template", false, "Show template of a model")
-	showCmd.Flags().Bool("system", false, "Show system prompt of a model")
+	showCmd.Flags().Bool("system", false, "Show system message of a model")

 	runCmd := &cobra.Command{
 		Use:     "run MODEL [PROMPT]",
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,6 +1,25 @@
 # Documentation

- [Modelfile](./modelfile.md)
- [How to develop Ollama](./development.md)
- [API](./api.md)
- [Tutorials](./tutorials.md)
+To get started, see the project's **[quickstart](../README.md#quickstart)**.
+
+Ollama is a tool for running AI models on your hardware. Many users will choose to use the Command Line Interface (CLI) to work with Ollama. Learn more about all the commands in the CLI in the **[Main Readme](../README.md)**.
+
+Use the RESTful API using any language, including Python, JavaScript, Typescript, Go, Rust, and many more. Learn more about using the API in the **[API Documentation](./api.md)**.
+
+Create new models or modify models already in the library using the Modelfile. Learn more about the Modelfile syntax in the **[Modelfile Documentation](./modelfile.md)**.
+
+Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.
+
+Installing on Linux in most cases is easy using the script on Ollama.ai. To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
+
+Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](./docker.md)**.
+
+It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.
+
+If encountering a problem with Ollama, the best place to start is the logs. Find more information about them here in the **[Troubleshooting Guide](./troubleshooting.md)**.
+
+Finally, for all the questions that don't fit anywhere else, there is the **[FAQ](./faq.md)**.
+
+[Tutorials](./tutorials.md) apply the documentation to tasks.
+
+For working code examples of using Ollama, see [Examples](../examples).
--- a/docs/api.md
+++ b/docs/api.md
@@ -3,6 +3,7 @@
 ## Endpoints

 - [Generate a completion](#generate-a-completion)
+- [Generate a chat completion](#generate-a-chat-completion)
 - [Create a Model](#create-a-model)
 - [List Local Models](#list-local-models)
 - [Show Model Information](#show-model-information)
@@ -16,7 +17,7 @@

 ### Model names

-Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations

@@ -24,7 +25,8 @@ All durations are returned in nanoseconds.

 ### Streaming responses

-Certain endpoints stream responses as JSON objects.
+Certain endpoints stream responses as JSON objects and can optionally return non-streamed responses.
+

 ## Generate a completion

@@ -38,26 +40,29 @@ Generate a response for a given prompt with a provided model. This is a streamin

 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)

 Advanced parameters (optional):

 - `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system prompt to (overrides what is defined in the `Modelfile`)
+- `system`: system message to use (overrides what is defined in the `Modelfile`)
 - `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.

-### JSON mode
+#### JSON mode

-Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as valid JSON. See the JSON mode [example](#request-json-mode) below.
+Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#generate-request-json-mode) below.

 > Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts of whitespace.

 ### Examples

-#### Request
+#### Generate request (Streaming)
+
+##### Request

 ```shell
 curl http://localhost:11434/api/generate -d '{
@@ -66,7 +71,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-#### Response
+##### Response

 A stream of JSON objects is returned:

@@ -83,8 +88,6 @@ The final response in the stream also includes additional data about the generat

 - `total_duration`: time spent generating the response
 - `load_duration`: time spent in nanoseconds loading the model
- `sample_count`: number of samples generated
- `sample_duration`: time spent generating samples
 - `prompt_eval_count`: number of tokens in the prompt
 - `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
 - `eval_count`: number of tokens the response
@@ -99,22 +102,22 @@ To calculate how fast the response is generated in tokens per second (token/s),
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "",
-  "context": [1, 2, 3],
  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 113,
-  "eval_duration": 1325948000
+  "context": [1, 2, 3],
+  "total_duration":10706818083,
+  "load_duration":6338219291,
+  "prompt_eval_count":26,
+  "prompt_eval_duration":130079000,
+  "eval_count":259,
+  "eval_duration":4232710000
 }
 ```

 #### Request (No streaming)

-A response can be recieved in one reply when streaming is off.
+##### Request
+
+A response can be received in one reply when streaming is off.

 ```shell
 curl http://localhost:11434/api/generate -d '{
@@ -124,7 +127,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-#### Response
+##### Response

 If `stream` is set to `false`, the response will be a single JSON object:

@@ -133,52 +136,23 @@ If `stream` is set to `false`, the response will be a single JSON object:
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
-  "context": [1, 2, 3],
  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 13,
-  "eval_duration": 1325948000
-}
-```
-
-#### Request (Raw Mode)
-
-In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting.
-
-```shell
-curl http://localhost:11434/api/generate -d '{
-  "model": "mistral",
-  "prompt": "[INST] why is the sky blue? [/INST]",
-  "raw": true,
-  "stream": false
-}'
-```
-
-#### Response
-
-```json
-{
-  "model": "mistral",
-  "created_at": "2023-11-03T15:36:02.583064Z",
-  "response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
  "context": [1, 2, 3],
-  "done": true,
-  "total_duration": 14648695333,
-  "load_duration": 3302671417,
-  "prompt_eval_count": 14,
-  "prompt_eval_duration": 286243000,
-  "eval_count": 129,
-  "eval_duration": 10931424000
+  "total_duration": 5043500667,
+  "load_duration": 5025959,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 325953000,
+  "eval_count": 290,
+  "eval_duration": 4709213000
 }
 ```

 #### Request (JSON mode)

+> When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.
+
+##### Request
+
 ```shell
 curl http://localhost:11434/api/generate -d '{
  "model": "llama2",
@@ -188,7 +162,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-#### Response
+##### Response

 ```json
 {
@@ -196,12 +170,13 @@ curl http://localhost:11434/api/generate -d '{
  "created_at": "2023-11-09T21:07:55.186497Z",
  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
  "done": true,
-  "total_duration": 4661289125,
-  "load_duration": 1714434500,
+  "context": [1, 2, 3], 
+  "total_duration": 4648158584,
+  "load_duration": 4071084,
  "prompt_eval_count": 36,
-  "prompt_eval_duration": 264132000,
-  "eval_count": 75,
-  "eval_duration": 2112149000
+  "prompt_eval_duration": 439038000,
+  "eval_count": 180,
+  "eval_duration": 4196918000
 }
 ```

@@ -224,10 +199,76 @@ The value of `response` will be a string containing JSON similar to:
 }
 ```

-#### Request (With options)
+#### Request (with images)
+
+To submit images to multimodal models such as `llava` or `bakllava`, provide a list of base64-encoded `images`:
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llava",
+  "prompt":"What is in this picture?",
+  "stream": false,
+  "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jD
SsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKP
qsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llava",
+  "created_at": "2023-11-03T15:36:02.583064Z",
+  "response": "A happy cartoon character, which is cute and cheerful.",
+  "done": true,
+  "context": [1, 2, 3],
+  "total_duration": 2938432250,
+  "load_duration": 2559292,
+  "prompt_eval_count": 1,
+  "prompt_eval_duration": 2195557000,
+  "eval_count": 44,
+  "eval_duration": 736432000
+}
+```
+
+#### Request (Raw Mode)
+
+In some cases, you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable templating. Also note that raw mode will not return a context.
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "mistral",
+  "prompt": "[INST] why is the sky blue? [/INST]",
+  "raw": true,
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "mistral",
+  "created_at": "2023-11-03T15:36:02.583064Z",
+  "response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
+  "done": true,
+  "total_duration": 8493852375,
+  "load_duration": 6589624375,
+  "prompt_eval_count": 14,
+  "prompt_eval_duration": 119039000,
+  "eval_count": 110,
+  "eval_duration": 1779061000
+}
+```
+
+#### Generate request (With options)

 If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override.

+##### Request
+
 ```shell
 curl http://localhost:11434/api/generate -d '{
  "model": "llama2",
@@ -259,7 +300,6 @@ curl http://localhost:11434/api/generate -d '{
    "main_gpu": 0,
    "low_vram": false,
    "f16_kv": true,
-    "logits_all": false,
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
@@ -271,7 +311,7 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-#### Response
+##### Response

 ```json
 {
@@ -279,30 +319,60 @@ curl http://localhost:11434/api/generate -d '{
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 13,
-  "eval_duration": 1325948000
+  "context": [1, 2, 3], 
+  "total_duration": 4935886791,
+  "load_duration": 534986708,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 107345000,
+  "eval_count": 237,
+  "eval_duration": 4289432000
 }
 ```

-## Send Chat Messages (coming in 0.1.14)
+#### Load a model
+
+If an empty prompt is provided, the model will be loaded into memory.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama2"
+}'
+```
+
+##### Response
+
+A single JSON object is returned:
+
+```json
+{
+  "model":"llama2",
+  "created_at":"2023-12-18T19:52:07.071755Z",
+  "response":"",
+  "done":true
+}
+```
+
+## Generate a chat completion

 ```shell
 POST /api/chat
 ```

-Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
+Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. Streaming can be disabled using `"stream": false`. The final response object will include statistics and additional data from the request.

 ### Parameters

 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory

+The `message` object has the following fields:
+
+- `role`: the role of the message, either `system`, `user` or `assistant`
+- `content`: the content of the message
+- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
+
 Advanced parameters (optional):

 - `format`: the format to return a response in. Currently the only accepted value is `json`
@@ -312,7 +382,9 @@ Advanced parameters (optional):

 ### Examples

-#### Request
+#### Chat request (Streaming)
+
+##### Request

 Send a chat message with a streaming response.

@@ -328,7 +400,7 @@ curl http://localhost:11434/api/chat -d '{
 }'
 ```

-#### Response
+##### Response

 A stream of JSON objects is returned:

@@ -338,7 +410,8 @@ A stream of JSON objects is returned:
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assisant",
-    "content": "The"
+    "content": "The", 
+    "images": null
  },
  "done": false
 }
@@ -351,20 +424,57 @@ Final response:
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 113,
-  "eval_duration": 1325948000
+  "total_duration":4883583458,
+  "load_duration":1334875,
+  "prompt_eval_count":26,
+  "prompt_eval_duration":342546000,
+  "eval_count":282,
+  "eval_duration":4535599000
 }
 ```

-#### Request (With History)
+#### Chat request (No streaming)

-Send a chat message with a conversation history.
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "why is the sky blue?"
+    }
+  ], 
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "registry.ollama.ai/library/llama2:latest",
+  "created_at": "2023-12-12T14:13:43.416799Z",
+  "message": {
+    "role": "assistant",
+    "content": "Hello! How are you today?"
+  },
+  "done": true,
+  "total_duration": 5191566416,
+  "load_duration": 2154458,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 383809000,
+  "eval_count": 298,
+  "eval_duration": 4799921000
+}
+```
+
+#### Chat request (With History)
+
+Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
+
+##### Request

 ```shell
 curl http://localhost:11434/api/chat -d '{
@@ -386,7 +496,7 @@ curl http://localhost:11434/api/chat -d '{
 }'
 ```

-#### Response
+##### Response

 A stream of JSON objects is returned:

@@ -409,14 +519,52 @@ Final response:
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 113,
-  "eval_duration": 1325948000
+  "total_duration":8113331500,
+  "load_duration":6396458,
+  "prompt_eval_count":61,
+  "prompt_eval_duration":398801000,
+  "eval_count":468,
+  "eval_duration":7701267000
+}
+```
+
+#### Chat request (with images)
+
+##### Request
+
+Send a chat message with an image attached. Images are passed as a list of base64-encoded strings in the message's `images` field.
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llava",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is in this image?",
+      "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQn
S4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jr
UwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llava",
+  "created_at": "2023-12-13T22:42:50.203334Z",
+  "message": {
+    "role": "assistant",
+    "content": " The image features a cute, little pig with an angry facial expression. It's wearing a heart on its shirt and is waving in the air. This scene appears to be part of a drawing or sketching project.",
+    "images": null
+  },
+  "done": true,
+  "total_duration":1668506709,
+  "load_duration":1986209,
+  "prompt_eval_count":26,
+  "prompt_eval_duration":359682000,
+  "eval_count":83,
+  "eval_duration":1303285000
 }
 ```

@@ -426,7 +574,7 @@ Final response:
 POST /api/create
 ```

-Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `modelfile` to the content of the Modelfile rather than just set `path`. This is a requirement for remote create. Remote model creation should also create any file blobs, fields such as `FROM` and `ADAPTER`, explicitly with the server using [Create a Blob](#create-a-blob) and the value to the path indicated in the response.
+Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `modelfile` to the content of the Modelfile rather than just set `path`. This is a requirement for remote create. Remote model creation must also create any file blobs, fields such as `FROM` and `ADAPTER`, explicitly with the server using [Create a Blob](#create-a-blob) and the value to the path indicated in the response. 

 ### Parameters

@@ -437,7 +585,11 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m

 ### Examples

-#### Request
+#### Create a new model
+
+Create a new model from a `Modelfile`.
+
+##### Request

 ```shell
 curl http://localhost:11434/api/create -d '{
@@ -446,14 +598,22 @@ curl http://localhost:11434/api/create -d '{
 }'
 ```

-#### Response
+##### Response

-A stream of JSON objects. When finished, `status` is `success`.
+A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`.

 ```json
-{
-  "status": "parsing modelfile"
-}
+{"status":"reading model metadata"}
+{"status":"creating system layer"}
+{"status":"using already created layer sha256:22f7f8ef5f4c791c1b03d7eb414399294764d7cc82c7e94aa81a1feb80a983a2"}
+{"status":"using already created layer sha256:8c17c2ebb0ea011be9981cc3922db8ca8fa61e828c5d3f44cb6ae342bf80460b"}
+{"status":"using already created layer sha256:7c23fb36d80141c4ab8cdbb61ee4790102ebd2bf7aeff414453177d4f2110e5d"}
+{"status":"using already created layer sha256:2e0493f67d0c8c9c68a8aeacdf6a38a2151cb3c4c1d42accf296e19810527988"}
+{"status":"using already created layer sha256:2759286baa875dc22de5394b4a925701b1896a7e3f8e53275c36f75a877a82c9"}
+{"status":"writing layer sha256:df30045fe90f0d750db82a058109cecd6d4de9c90a3d75b19c09e5f64580bb42"}
+{"status":"writing layer sha256:f18a68eb09bf925bb1b669490407c1b1251c5db98dc4d3d81f3088498ea55690"}
+{"status":"writing manifest"}
+{"status":"success"}
 ```

 ### Check if a Blob Exists
@@ -462,7 +622,8 @@ A stream of JSON objects. When finished, `status` is `success`.
 HEAD /api/blobs/:digest
 ```

-Check if a blob is known to the server.
+Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai.
+

 #### Query Parameters

@@ -486,7 +647,7 @@ Return 200 OK if the blob exists, 404 Not Found if it does not.
 POST /api/blobs/:digest
 ```

-Create a blob from a file. Returns the server file path.
+Create a blob from a file on the server. Returns the server file path.

 #### Query Parameters

@@ -502,7 +663,7 @@ curl -T model.bin -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf08

 ##### Response

-Return 201 Created if the blob was successfully created.
+Return 201 Created if the blob was successfully created, 400 Bad Request if the digest used is not expected.

 ## List Local Models

@@ -528,14 +689,30 @@ A single JSON object will be returned.
 {
  "models": [
    {
-      "name": "llama2",
-      "modified_at": "2023-08-02T17:02:23.713454393-07:00",
-      "size": 3791730596
+      "name": "codellama:13b",
+      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
+      "size": 7365960935,
+      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
+      "details": {
+        "format": "gguf",
+        "family": "llama",
+        "families": null,
+        "parameter_size": "13B",
+        "quantization_level": "Q4_0"
+      }
    },
    {
-      "name": "llama2:13b",
-      "modified_at": "2023-08-08T12:08:38.093596297-07:00",
-      "size": 7323310500
+      "name": "llama2:latest",
+      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
+      "size": 3825819519,
+      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
+      "details": {
+        "format": "gguf",
+        "family": "llama",
+        "families": null,
+        "parameter_size": "7B",
+        "quantization_level": "Q4_0"
+      }
    }
  ]
 }
@@ -547,7 +724,7 @@ A single JSON object will be returned.
 POST /api/show
 ```

-Show details about a model including modelfile, template, parameters, license, and system prompt.
+Show information about a model including details, modelfile, template, parameters, license, and system prompt.

 ### Parameters

@@ -567,10 +744,16 @@ curl http://localhost:11434/api/show -d '{

 ```json
 {
-  "license": "<contents of license block>",
-  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
-  "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
-  "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSSISTANT:\"",
+  "parameters": "num_ctx                        4096\nstop                           \u003c/s\u003e\nstop                           USER:\nstop                           ASSSISTANT:",
+  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: ",
+  "details": {
+    "format": "gguf",
+    "family": "llama",
+    "families": ["llama", "clip"],
+    "parameter_size": "7B",
+    "quantization_level": "Q4_0"
+  }
 }
 ```

@@ -595,7 +778,7 @@ curl http://localhost:11434/api/copy -d '{

 #### Response

-The only response is a 200 OK if successful.
+Returns a 200 OK if successful, or a 404 Not Found if the source model doesn't exist.

 ## Delete a Model

@@ -621,7 +804,7 @@ curl -X DELETE http://localhost:11434/api/delete -d '{

 #### Response

-If successful, the only response is a 200 OK.
+Returns a 200 OK if successful, 404 Not Found if the model to be deleted doesn't exist.

 ## Pull a Model

--- a/docs/development.md
+++ b/docs/development.md
@@ -14,7 +14,13 @@ Install required tools:
 brew install go cmake gcc
 ```

-Get the required libraries:
+Optionally enable debugging and more verbose logging:
+
+```bash
+export CGO_CFLAGS="-g"
+```
+
+Get the required libraries and build the native LLM code:

 ```bash
 go generate ./...
@@ -34,6 +40,35 @@ Now you can run `ollama`:

 ## Building on Linux with GPU support

- Install cmake and nvidia-cuda-toolkit
- run `go generate ./...`
- run `go build .`
+
+### Linux/Windows CUDA (NVIDIA)
+*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+
+Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
+Then generate dependencies:
+```
+go generate ./...
+```
+Then build the binary:
+```
+go build .
+```
+
+### Linux ROCm (AMD)
+*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+
+Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
+Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:
+```
+CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
+```
+Then build the binary:
+```
+go build .
+```
+
+ROCm requires elevated privileges to access the GPU at runtime.  On most distros you can add your user account to the `render` group, or run as root.
+
+## Containerized Build
+
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -1,138 +1,90 @@
 # FAQ

+## How can I upgrade Ollama?
+
+To upgrade Ollama, run the installation process again. On the Mac, click the Ollama icon in the menubar and choose the restart option if an update is available.
+
 ## How can I view the logs?

-On macOS:
+Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.

-```
-cat ~/.ollama/logs/server.log
-```
+## How do I use Ollama server environment variables on Mac?

-On Linux:
+On macOS, Ollama runs in the background and is managed by the menubar app. If adding environment variables, Ollama will need to be run manually.

-```
-journalctl -u ollama
-```
+1. Click the menubar icon for Ollama and choose **Quit Ollama**.
+2. Open a new terminal window and run the following command (this example uses `OLLAMA_HOST` with an IP address of `123.1.1.1`):

-If you're running `ollama serve` directly, the logs will be printed to the console.
+   ```bash
+   OLLAMA_HOST=123.1.1.1 ollama serve
+   ```
+
+## How do I use Ollama server environment variables on Linux?
+
+If Ollama is installed with the install script, a systemd service was created, running as the Ollama user. To add an environment variable, such as OLLAMA_HOST, follow these steps:
+
+1. Create a `systemd` drop-in directory and add a config file. This is only needed once.
+
+   ```bash
+   mkdir -p /etc/systemd/system/ollama.service.d
+   echo '[Service]' >>/etc/systemd/system/ollama.service.d/environment.conf
+   ```
+
+2. For each environment variable, add it to the config file:
+
+   ```bash
+   echo 'Environment="OLLAMA_HOST=0.0.0.0:11434"' >>/etc/systemd/system/ollama.service.d/environment.conf
+   ```
+
+3. Reload `systemd` and restart Ollama:
+
+   ```bash
+   systemctl daemon-reload
+   systemctl restart ollama
+   ```

 ## How can I expose Ollama on my network?

-Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable.
-
-On macOS:
-
-```bash
-OLLAMA_HOST=0.0.0.0:11434 ollama serve
-```
-
-On Linux:
-
-Create a `systemd` drop-in directory and set `Environment=OLLAMA_HOST`
-
-```bash
-mkdir -p /etc/systemd/system/ollama.service.d
-echo '[Service]' >>/etc/systemd/system/ollama.service.d/environment.conf
-```
-
-```bash
-echo 'Environment="OLLAMA_HOST=0.0.0.0:11434"' >>/etc/systemd/system/ollama.service.d/environment.conf
-```
-
-Reload `systemd` and restart Ollama:
-
-```bash
-systemctl daemon-reload
-systemctl restart ollama
-```
+Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable. Refer to the section above for how to use environment variables on your platform.

 ## How can I allow additional web origins to access Ollama?

-Ollama allows cross origin requests from `127.0.0.1` and `0.0.0.0` by default. Add additional origins with the `OLLAMA_ORIGINS` environment variable:
+Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Add additional origins with the `OLLAMA_ORIGINS` environment variable. For example, to add all ports on 192.168.1.1 and https://example.com, use:

-On macOS:
-
-```bash
-OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
+```shell
+OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com
 ```

-On Linux:
-
-```bash
-echo 'Environment="OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com"' >>/etc/systemd/system/ollama.service.d/environment.conf
-```
-
-Reload `systemd` and restart Ollama:
-
-```bash
-systemctl daemon-reload
-systemctl restart ollama
-```
+Refer to the section above for how to use environment variables on your platform.

 ## Where are models stored?

- macOS: Raw model data is stored under `~/.ollama/models`.
- Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`
+- macOS: `~/.ollama/models`.
+- Linux: `/usr/share/ollama/.ollama/models`

-Below the models directory you will find a structure similar to the following:
+See [the CLI Documentation](./cli.md) for more on this.

-```shell
-.
-├── blobs
-└── manifests
-   └── registry.ollama.ai
-      ├── f0rodo
-      ├── library
-      ├── mattw
-      └── saikatkumardey
-```
+## How do I set the models directory to a different location?

-There is a `manifests/registry.ollama.ai/namespace` path. In example above, the user has downloaded models from the official `library`, `f0rodo`, `mattw`, and `saikatkumardey` namespaces. Within each of those directories, you will find directories for each of the models downloaded. And in there you will find a file name representing each tag. Each tag file is the manifest for the model.  
-
-The manifest lists all the layers used in this model. You will see a `media type` for each layer, along with a digest. That digest corresponds with a file in the `models/blobs directory`.
-
-### How can I change where Ollama stores models?
-
-To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.
+If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory. Refer to the section above for how to use environment variables on your platform.

 ## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?

-No. Anything you do with Ollama, such as generate a response from the model, stays with you. We don't collect any data about how you use the model. You are always in control of your own data.
+No, Ollama runs entirely locally, and conversation data will never leave your machine.

 ## How can I use Ollama in Visual Studio Code?

-There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. You can see the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.
+There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.

 ## How do I use Ollama behind a proxy?

-Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values.
-
-When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate.
-
-On macOS:
-
-```bash
-HTTPS_PROXY=http://proxy.example.com ollama serve
-```
-
-On Linux:
-
-```bash
-echo 'Environment="HTTPS_PROXY=https://proxy.example.com"' >>/etc/systemd/system/ollama.service.d/environment.conf
-```
-
-Reload `systemd` and restart Ollama:
-
-```bash
-systemctl daemon-reload
-systemctl restart ollama
-```
+Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variable, ensure it is set where `ollama serve` can access the value. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.

 ### How do I use Ollama behind a proxy in Docker?

 The Ollama Docker container image can be configured to use a proxy by passing `-e HTTPS_PROXY=https://proxy.example.com` when starting the container.

-Alternatively, Docker daemon can be configured to use a proxy. Instructions are available for Docker Desktop on [macOS](https://docs.docker.com/desktop/settings/mac/#proxies), [Windows](https://docs.docker.com/desktop/settings/windows/#proxies), and [Linux](https://docs.docker.com/desktop/settings/linux/#proxies), and Docker [daemon with systemd](https://docs.docker.com/config/daemon/systemd/#httphttps-proxy).
+Alternatively, the Docker daemon can be configured to use a proxy. Instructions are available for Docker Desktop on [macOS](https://docs.docker.com/desktop/settings/mac/#proxies), [Windows](https://docs.docker.com/desktop/settings/windows/#proxies), and [Linux](https://docs.docker.com/desktop/settings/linux/#proxies), and Docker [daemon with systemd](https://docs.docker.com/config/daemon/systemd/#httphttps-proxy).

 Ensure the certificate is installed as a system certificate when using HTTPS. This may require a new Docker image when using a self-signed certificate.

@@ -154,3 +106,15 @@ docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-
 The Ollama Docker container can be configured with GPU acceleration in Linux or Windows (with WSL2). This requires the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit). See [ollama/ollama](https://hub.docker.com/r/ollama/ollama) for more details.

 GPU acceleration is not available for Docker Desktop in macOS due to the lack of GPU passthrough and emulation.
+
+## Why is networking slow in WSL2 on Windows 10?
+
+This can impact both installing Ollama and downloading models.
+
+Open `Control Panel > Network and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernet (WSL)` adapter, right click and select `Properties`.
+Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these
+properties.
+
+## What is context, can I increase it, and why doesn't every model support a huge context?
+
+Context refers to the size of the input that can be sent to a model and get sensible output back. Many models have a context size of 2048 tokens. It's sometimes possible to give a model more, but the answers start to degrade. Newer models have been able to increase that context size using different methods. This increase in context size results in a corresponding increase in memory required, sometimes by orders of magnitude.
--- a/docs/import.md
+++ b/docs/import.md
@@ -72,7 +72,7 @@ docker run --rm -v .:/model ollama/quantize -q q4_0 /model
 This will output two files into the directory:

 - `f16.bin`: the model converted to GGUF
- `q4_0.bin` the model quantized to a 4-bit quantization (we will use this file to create the Ollama model)
+- `q4_0.bin`: the model quantized to a 4-bit quantization (Ollama will use this file to create the Ollama model)

 ### Step 3: Write a `Modelfile`

--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -1,6 +1,6 @@
 # Ollama Model File

-> Note: this `Modelfile` syntax is in development
+> Note: `Modelfile` syntax is in development

 A model file is the blueprint to create and share models with Ollama.

@@ -30,14 +30,14 @@ The format of the `Modelfile`:
 INSTRUCTION arguments
 ```

-| Instruction                         | Description                                                   |
-| ----------------------------------- | ------------------------------------------------------------- |
-| [`FROM`](#from-required) (required) | Defines the base model to use.                                |
-| [`PARAMETER`](#parameter)           | Sets the parameters for how Ollama will run the model.        |
-| [`TEMPLATE`](#template)             | The full prompt template to be sent to the model.             |
-| [`SYSTEM`](#system)                 | Specifies the system prompt that will be set in the template. |
-| [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.           |
-| [`LICENSE`](#license)               | Specifies the legal license.                                  |
+| Instruction                         | Description                                                    |
+| ----------------------------------- | -------------------------------------------------------------- |
+| [`FROM`](#from-required) (required) | Defines the base model to use.                                 |
+| [`PARAMETER`](#parameter)           | Sets the parameters for how Ollama will run the model.         |
+| [`TEMPLATE`](#template)             | The full prompt template to be sent to the model.              |
+| [`SYSTEM`](#system)                 | Specifies the system message that will be set in the template. |
+| [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.            |
+| [`LICENSE`](#license)               | Specifies the legal license.                                   |

 ## Examples

@@ -52,7 +52,7 @@ PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
 PARAMETER num_ctx 4096

-# sets a custom system prompt to specify the behavior of the chat assistant
+# sets a custom system message to specify the behavior of the chat assistant
 SYSTEM You are Mario from super mario bros, acting as an assistant.
 ```

@@ -70,12 +70,12 @@ More examples are available in the [examples directory](../examples).
 There are two ways to view `Modelfile`s underlying the models in [ollama.ai/library][1]:

 - Option 1: view a details page from a model's tags page:
-   1. Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
-   2. Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
-   3. Scroll down to "Layers"
+  1.  Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
+  2.  Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
+  3.  Scroll down to "Layers"
      - Note: if the [`FROM` instruction](#from-required) is not present,
        it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` like so:
+- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:

  ```bash
  > ollama show --modelfile llama2:13b
@@ -152,15 +152,15 @@ PARAMETER <parameter> <parametervalue>

 ### TEMPLATE

-`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system prompt and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
+`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system message and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.

 #### Template Variables

-| Variable        | Description                                                                                                  |
-| --------------- | ------------------------------------------------------------------------------------------------------------ |
-| `{{ .System }}` | The system prompt used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
-| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input.                 |
-| `{{ .First }}`  | A boolean value used to render specific template information for the first generation of a session.          |
+| Variable        | Description                                                                                                   |
+| --------------- | ------------------------------------------------------------------------------------------------------------- |
+| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
+| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input.                  |
+| `{{ .First }}`  | A boolean value used to render specific template information for the first generation of a session.           |

 ```modelfile
 TEMPLATE """
@@ -180,7 +180,7 @@ SYSTEM """<system message>"""

 ### SYSTEM

-The `SYSTEM` instruction specifies the system prompt to be used in the template, if applicable.
+The `SYSTEM` instruction specifies the system message to be used in the template, if applicable.

 ```modelfile
 SYSTEM """<system message>"""
@@ -206,7 +206,7 @@ LICENSE """

 ## Notes

- the **`Modelfile` is not case sensitive**. In the examples, we use uppercase for instructions to make it easier to distinguish it from arguments.
- Instructions can be in any order. In the examples, we start with FROM instruction to keep it easily readable.
+- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish them from arguments.
+- Instructions can be in any order. In the examples, the `FROM` instruction is first to keep it easily readable.

 [1]: https://ollama.ai/library
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -0,0 +1,22 @@
+# How to troubleshoot issues
+
+Sometimes Ollama may not perform as expected. One of the best ways to figure out what happened is to take a look at the logs. Find the logs on Mac by running the command:
+
+```shell
+cat ~/.ollama/logs/server.log
+```
+
+On Linux systems with systemd, the logs can be found with this command:
+
+```shell
+journalctl -u ollama
+```
+
+If manually running `ollama serve` in a terminal, the logs will be on that terminal.
+
+Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
+
+## Known issues
+
+
+* `signal: illegal instruction (core dumped)`: Ollama requires AVX support from the CPU. This was introduced in 2011 and CPUs started offering it in 2012. CPUs from before that and some lower end CPUs after that may not have AVX support and thus are not supported by Ollama. Some users have had luck with building Ollama on their machines disabling the need for AVX.
--- a/examples/.gitignore
+++ b/examples/.gitignore
@@ -1,7 +1,10 @@
 node_modules
+bun.lockb
+.vscode
 # OSX
 .DS_STORE

+
 # Models
 models/

--- a/examples/langchain-python-rag-websummary/README.md
+++ b/examples/langchain-python-rag-websummary/README.md
@@ -1,15 +1,23 @@
 # LangChain Web Summarization

-This example summarizes a website
+This example summarizes the website, [https://ollama.ai/blog/run-llama2-uncensored-locally](https://ollama.ai/blog/run-llama2-uncensored-locally)

-## Setup
+## Running the Example

-```
-pip install -r requirements.txt
-```
+1. Ensure you have the `llama2` model installed:

-## Run
+   ```bash
+   ollama pull llama2
+   ```

-```
-python main.py
-```
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python main.py
+   ```
--- a/examples/langchain-python-rag-websummary/requirements.txt
+++ b/examples/langchain-python-rag-websummary/requirements.txt
@@ -1,2 +1 @@
 langchain==0.0.259
-bs4==0.0.1
--- a/examples/langchain-python-simple/README.md
+++ b/examples/langchain-python-simple/README.md
@@ -2,20 +2,23 @@

 This example is a basic "hello world" of using LangChain with Ollama.

-## Setup
+## Running the Example

-```
-pip install -r requirements.txt
-```
+1. Ensure you have the `llama2` model installed:

-## Run
+   ```bash
+   ollama pull llama2
+   ```

-```
-python main.py
-```
+2. Install the Python Requirements.

-Running this example will print the response for "hello":
+   ```bash
+   pip install -r requirements.txt
+   ```

-```
-Hello! It's nice to meet you. hopefully you are having a great day! Is there something I can help you with or would you like to chat?
-```
+3. Run the example:
+
+   ```bash
+   python main.py
+   ```
+  
--- a/examples/langchain-python-simple/main.py
+++ b/examples/langchain-python-simple/main.py
@@ -1,4 +1,6 @@
 from langchain.llms import Ollama
+
+input = input("What is your question?")
 llm = Ollama(model="llama2")
-res = llm.predict("hello")
+res = llm.predict(input)
 print (res)
--- a/examples/langchain-typescript-simple/README.md
+++ b/examples/langchain-typescript-simple/README.md
@@ -2,20 +2,22 @@

 This example is a basic "hello world" of using LangChain with Ollama using Node.js and Typescript.

-## Setup
+## Running the Example

-```shell
-npm install
-```
+1. Install the prerequisites:

-## Run
+   ```bash
+   npm install
+   ```

-```shell
-ts-node main.ts
-```
+2. Ensure the `mistral` model is available:

-Running this example will print the response for "hello":
+   ```bash
+   ollama pull mistral
+   ```

-```plaintext
-Hello! It's nice to meet you. hopefully you are having a great day! Is there something I can help you with or would you like to chat?
-```
+3. Run the example:
+
+   ```bash
+   npm start
+   ```
--- a/examples/langchain-typescript-simple/main.ts
+++ b/examples/langchain-typescript-simple/main.ts
@@ -1,15 +1,25 @@
-import { Ollama} from 'langchain/llms/ollama';
+import { Ollama } from 'langchain/llms/ollama';
+import * as readline from "readline";

 async function main() {
  const ollama = new Ollama({
    model: 'mistral'    
    // other parameters can be found at https://js.langchain.com/docs/api/llms_ollama/classes/Ollama
-  })
-  const stream = await ollama.stream("Hello");
+  });

-  for await (const chunk of stream) {
-    process.stdout.write(chunk);
-  }
+  const rl = readline.createInterface({
+    input: process.stdin,
+    output: process.stdout,
+  });
+
+  rl.question("What is your question: \n", async (user_input) => {
+    const stream = await ollama.stream(user_input);
+  
+    for await (const chunk of stream) {
+      process.stdout.write(chunk);
+    }
+    rl.close();
+  })
 }

 main();
--- a/examples/langchain-typescript-simple/package-lock.json
+++ b/examples/langchain-typescript-simple/package-lock.json
@@ -1,5 +1,5 @@
 {
-  "name": "with-langchain-typescript-simplegenerate",
+  "name": "langchain-typescript-simple",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
--- a/examples/langchain-typescript-simple/package.json
+++ b/examples/langchain-typescript-simple/package.json
@@ -1,8 +1,13 @@
 {
+  "scripts": {
+    "start": "tsx main.ts"
+  },
  "devDependencies": {
-    "typescript": "^5.2.2"
+    "tsx": "^4.6.2",
+    "typescript": "^5.3.3"
  },
  "dependencies": {
-    "langchain": "^0.0.165"
+    "langchain": "^0.0.165",
+    "readline": "^1.3.0"
  }
 }
--- a/examples/modelfile-10tweets/Modelfile
+++ b/examples/modelfile-10tweets/Modelfile
@@ -1,7 +0,0 @@
-# Modelfile for creating a list of ten tweets from a topic
-# Run `ollama create 10tweets -f ./Modelfile` and then `ollama run 10tweets` and enter a topic
-
-FROM llama2
-SYSTEM """
-You are a content marketer who needs to come up with 10 short but succinct tweets. The answer should be a list of ten tweets. Each tweet can have a maximum of 280 characters and should include hashtags. Each user input will be a subject and you should expand it in ten creative ways. Never stop after just one tweet. Always include ten. 
-"""
--- a/examples/modelfile-10tweets/README.md
+++ b/examples/modelfile-10tweets/README.md
@@ -1,23 +0,0 @@
-# Ten Tweets Modelfile
-
-This is a simple modelfile that generates ten tweets based off any topic.
-
-```bash
-ollama create tentweets
-
-ollama run tentweets
->>> underwater basketweaving
- Great! Here are ten creative tweets about underwater basketweaving:
-
-1. "Just discovered the ultimate stress-reliever: Underwater basketweaving! 🌊🧵 #UnderwaterBasketweaving #StressRelief"
-2. "Who needs meditation when you can do underwater basketweaving? 😴👀 #PeacefulDistraction #UnderwaterBasketweaving"
-3. "Just spent an hour in the pool and still managed to knot my basket. Goal: untangle it before next session. 💪🏽 #ChallengeAccepted #UnderwaterBasketweaving"
-4. "When life gives you lemons, make underwater basketweaving! 🍋🧵 #LemonadeLife #UnderwaterBasketweaving"
-5. "Just realized my underwater basketweaving skills could come in handy during a zombie apocalypse. 😂🧡 #SurvivalTips #UnderwaterBasketweaving"
-6. "I'm not lazy, I'm just conserving energy for my next underwater basketweaving session. 😴💤 #LazyDay #UnderwaterBasketweaving"
-7. "Just found my inner peace while doing underwater basketweaving. It's like meditation, but with knots! 🙏🧵 #Mindfulness #UnderwaterBasketweaving"
-8. "Why study for exams when you can do underwater basketweaving and forget all your worries? 😜🧵 #ProcrastinationStation #UnderwaterBasketweaving"
-9. "Just had to cut my underwater basketweaving session short due to a sudden urge to breathe. 🤯🌊 #AquaticAdventures #UnderwaterBasketweaving"
-10. "I'm not sure what's more impressive: my underwater basketweaving skills or the fact that I didn't drown trying to make this tweet. 😅🧵 #Accomplishment 
-#UnderwaterBasketweaving"
-```
--- a/examples/modelfile-devopsengineer/Modelfile
+++ b/examples/modelfile-devopsengineer/Modelfile
@@ -1,8 +0,0 @@
-# Modelfile for creating a devops engineer assistant
-# Run `ollama create devops-engineer -f ./Modelfile` and then `ollama run devops-engineer` and enter a topic
-
-FROM mistral
-PARAMETER temperature 1
-SYSTEM """
-You are a senior devops engineer, acting as an assistant. You offer help with cloud technologies like: Terraform, AWS, kubernetes, python. You answer with code examples when possible
-"""
--- a/examples/modelfile-devopsengineer/README.md
+++ b/examples/modelfile-devopsengineer/README.md
@@ -1,40 +0,0 @@
-# DevOps Engineer Modelfile
-
-This is a simple modelfile that takes will respond to any questions that a senior DevOps engineer should know how to solve.
-
-````text
-ollama create devopsengineer
-
-ollama run devopsengineer
->>> how can i proxy all connections from inside the office through a single server that caches large files. Provide a config file for nginx to achieve this?
-
-Here is an example configuration file for Nginx that can be used to proxy all connections from inside the office network to a centralized cache server:
-```perl
-http {
-    upstream cache_server {
-        server <cache_server_ip>;
-        server <cache_server_ip>;
-        server <cache_server_ip>;
-    }
-    
-    server {
-        listen 80;
-        server_name localhost;
-        
-        location / {
-            proxy_pass http://cache_server;
-            proxy_set_header Host $host;
-            proxy_set_header X-Real-IP $remote_addr;
-            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-        }
-    }
-}
-```
-In this configuration, we define an `upstream` block that lists the IP addresses of the cache servers. We then define a server block that listens on port 80 and routes all 
-traffic to the `/` location to the cache servers using the `proxy_pass` directive. The `proxy_set_header` directives are used to preserve the source IP address of the client
-request when forwarding it to the cache server.
-
-To use this configuration, you would need to replace the placeholder `<cache_server_ip>` with the actual IP addresses of your cache servers. You would also need to make sure
-that the cache servers are configured to accept incoming connections from the Nginx server and handle requests for files.
-
-````
--- a/examples/modelfile-midjourney/Modelfile
+++ b/examples/modelfile-midjourney/Modelfile
@@ -1,11 +0,0 @@
-# Modelfile for creating a Midjourney prompts from a topic
-# This prompt was adapted from the original at https://www.greataiprompts.com/guide/midjourney/best-chatgpt-prompt-for-midjourney/
-# Run `ollama create mj -f ./Modelfile` and then `ollama run mj` and enter a topic
-
-FROM zephyr
-PARAMETER temperature 0.8
-PARAMETER top_k 500
-PARAMETER top_p 0.9
-SYSTEM """
-Embrace your role as a creative illustrator. Based on a concept provided, you must produce a single paragraph with a multifaceted description of an image, ensuring significant details of the concept and more is represented in your instructions. You do not need to write complete sentences but rather short concepts with the following information: the level of detail that should be represented, an artistic style and maybe a specific name of a painter or illustrator, the ideal color pallete, lighting, mood, perspective, the setting, time of day, weather, the season, the time period, location, materials, the textures, patterns, lines, brushstrokes, techniques, the medium, the genre, the rendering style. Don't include everything and keep the description length under 250 words. 
-"""
--- a/examples/modelfile-midjourney/README.md
+++ b/examples/modelfile-midjourney/README.md
@@ -1,11 +0,0 @@
-# Midjourney Prompt Generator Modelfile
-
-This simple modelfile will help create a prompt to feed to Midjourney.
-
-```text
-ollama create midjourney
-
-ollama run midjourney
->>> a sports car in the mountains. 
-A sleek, high-performance automobile cuts through a serpentine mountain landscape. The concept is a classic illustration of speed and power, depicted in the style of pop art by Andy Warhol. The color palette is dominated by bold, primary hues of red, blue, and yellow, with striking accent colors of white, black, and metallic shades. The lighting is bright and focused, casting sharp shadows on the rugged terrain. A sense of excitement and anticipation permeates throughout the scene, as the car navigates a treacherous course through the winding road. The perspective is low, allowing for a full view of the vehicle's sleek lines and intricate details. The setting takes place in the afternoon during a sunny day in autumn, as evidenced by the vibrant foliage on the mountainside. The time period is modern, with nods to classic car design. The materials are primarily digital, allowing for smooth curves and sharp contrasts. The textures are sleek and polished, with meticulously detailed lines and brushstrokes that accentuate the car's aerodynamic design. The patterns consist of geometric shapes and bold stripes, adding to the car's dynamic appeal. The genre is modern realism, with a focus on precision and detail. The rendering style is highly technical, capturing the nuances and subtleties of the vehicle and its surroundings in breathtaking detail.
-```
--- a/examples/modelfile-recipemaker/Modelfile
+++ b/examples/modelfile-recipemaker/Modelfile
@@ -1,6 +0,0 @@
-# Modelfile for creating a recipe from a list of ingredients
-# Run `ollama create recipemaker -f ./Modelfile` and then `ollama run recipemaker` and feed it lists of ingredients to create recipes around.
-FROM nous-hermes
-SYSTEM """
-The instruction will be a list of ingredients. You should generate a recipe that can be made in less than an hour. You can also include ingredients that most people will find in their pantry every day. The recipe should be 4 people and you should include a description of what the meal will taste like
-"""
--- a/examples/modelfile-recipemaker/README.md
+++ b/examples/modelfile-recipemaker/README.md
@@ -1,20 +0,0 @@
-# Recipe Maker Modelfile 
-
-Simple modelfile to generate a recipe from a short list of ingredients.
-
-```
-ollama create recipemaker
-
-ollama run recipemaker
->>> chilli pepper, white chocolate, kale
- Ingredients:
- 1 small chili pepper
- 4 squares of white chocolate
- handful of kale leaves
-
-Instructions:
-1. In a blender or food processor, puree the chilies and white chocolate until smooth.
-2. Add the chopped kale leaves to the blender and pulse until well combined.
-3. Serve immediately as a dip for crackers or use it as an ingredient in your favorite recipe. The mixture of spicy chili pepper with sweet white chocolate and nutritious 
-kale will make your taste buds dance with delight!
-```
--- a/examples/modelfile-sentiments/Modelfile
+++ b/examples/modelfile-sentiments/Modelfile
@@ -1,28 +0,0 @@
-# Modelfile for creating a sentiment analyzer. 
-# Run `ollama create sentiments -f pathtofile` and then `ollama run sentiments` and enter a topic
-
-FROM orca
-TEMPLATE """
-{{- if .First }}
-### System:
-{{ .System }}
-{{- end }}
-### User: 
-I hate it when my phone dies
-### Response: 
-NEGATIVE
-### User: 
-He is awesome
-### Response: 
-POSITIVE
-### User: 
-This is the link to the article
-### Response: 
-NEUTRAL
-### User:
-{{ .Prompt }}
-
-### Response:
-"""
-
-SYSTEM """You are a sentiment analyzer. You will receive text and output only one word, either POSITIVE or NEGATIVE or NEUTRAL, depending on the sentiment of the text."""
--- a/examples/modelfile-sentiments/Readme.md
+++ b/examples/modelfile-sentiments/Readme.md
@@ -1,25 +0,0 @@
-# Sentiments Modelfile
-
-This is a simple sentiments analyzer using the Orca model. When you pull Orca from the registry, it has a Template already defined that looks like this:
-
-```Modelfile
-{{- if .First }}
-### System:
-{{ .System }}
-{{- end }}
-
-### User:
-{{ .Prompt }}
-
-### Response:
-```
-
-If we just wanted to have the text:
-
-```Plaintext
-You are a sentiment analyzer. You will receive text and output only one word, either POSITIVE or NEGATIVE or NEUTRAL, depending on the sentiment of the text.
-```
-
-then we could have put this in a SYSTEM block. But we want to provide examples which require updating the full Template. Any Modelfile you create will inherit all the settings from the source model. But in this example, we are overriding the Template.
-
-When providing examples for the input and output, you should include the way the model usually provides information. Since the Orca model expects a user prompt to appear after ### User: and the response is after ### Response, we should format our examples like that as well. If we were using the Llama 2 model, the format would be a bit different.
--- a/examples/modelfile-tweetwriter/Modelfile
+++ b/examples/modelfile-tweetwriter/Modelfile
@@ -1,7 +0,0 @@
-# Modelfile for creating a tweet from a topic
-# Run `ollama create tweetwriter -f ./Modelfile` and then `ollama run tweetwriter` and enter a topic
-
-FROM nous-hermes
-SYSTEM """
-You are a content marketer who needs to come up with a short but succinct tweet. Make sure to include the appropriate hashtags and links. Sometimes when appropriate, describe a meme that can be included as well. All answers should be in the form of a tweet which has a max size of 280 characters. Every instruction will be the topic to create a tweet about.
-"""
--- a/examples/modelfile-tweetwriter/readme.md
+++ b/examples/modelfile-tweetwriter/readme.md
@@ -0,0 +1,23 @@
+# Example Modelfile - Tweetwriter
+
+This simple example shows what you can do without any code, simply relying on a Modelfile. The file has two instructions:
+
+1. FROM - The FROM instruction defines the parent model to use for this one. If you choose a model from the library, you can enter just the model name. For all other models, you need to specify the namespace as well. You could also use a local file. Just include the relative path to the converted, quantized model weights file. To learn more about creating that file, see the `import.md` file in the docs folder of this repository.
+2. SYSTEM - This defines the system prompt for the model and overrides the system prompt from the parent model.
+
+## Running the Example
+
+1. Create the model:
+
+   ```bash
+   ollama create tweetwriter
+   ```
+
+2. Run the model with `ollama run tweetwriter`, then enter a topic to generate a tweet about.
+3. Show the Modelfile in the REPL.
+
+   ```bash
+   /show modelfile
+   ```
+
+   Notice that the FROM and SYSTEM match what was in the file. But there is also a TEMPLATE and PARAMETER. These are inherited from the parent model.
--- a/examples/python-dockerit/README.md
+++ b/examples/python-dockerit/README.md
@@ -1,15 +1,31 @@
 # DockerIt

-DockerIt is a tool to help you build and run your application in a Docker container. It consists of a model that defines the system prompt and model weights to use, along with a python script to then build the container and run the image automatically. 
+DockerIt is a tool to help you build and run your application in a Docker container. It consists of a model that defines the system prompt and model weights to use, along with a python script to then build the container and run the image automatically.
+
+## Running the Example
+
+1. Ensure you have the `mattw/dockerit` model installed:
+
+   ```bash
+   ollama pull mattw/dockerit
+   ```
+
+2. Make sure Docker is running on your machine.
+
+3. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+4. Run the example:
+
+   ```bash
+   python dockerit.py "simple postgres server with admin password set to 123"
+   ```
+
+5. Enter the name you would like to use for your container image.

 ## Caveats

-This is an simple example. It's assuming the Dockerfile content generated is going to work. In many cases, even with simple web servers, it fails when trying to copy files that don't exist. It's simply an example of what you could possibly do.
-
-## Example Usage
-
-```bash
-> python3 ./dockerit.py "simple postgres server with admin password set to 123"
-Enter the name of the image: matttest
-Container named happy_keller  started with id:  7c201bb6c30f02b356ddbc8e2a5af9d7d7d7b8c228519c9a501d15c0bd9d6b3e
-```
+This is a simple example. It's assuming the Dockerfile content generated is going to work. In many cases, even with simple web servers, it fails when trying to copy files that don't exist. It's simply an example of what you could possibly do.
--- a/examples/python-json-datagenerator/readme.md
+++ b/examples/python-json-datagenerator/readme.md
@@ -4,6 +4,32 @@

 There are two python scripts in this example. `randomaddresses.py` generates random addresses from different countries. `predefinedschema.py` sets a template for the model to fill in.

+## Running the Example
+
+1. Ensure you have the `llama2` model installed:
+
+   ```bash
+   ollama pull llama2
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the Random Addresses example:
+
+   ```bash
+   python randomaddresses.py
+   ```
+
+4. Run the Predefined Schema example:
+
+   ```bash
+   python predefinedschema.py
+   ```
+
 ## Review the Code

 Both programs are basically the same, with a different prompt for each, demonstrating two different ideas. The key part of getting JSON out of a model is to state in the prompt or system prompt that it should respond using JSON, and specifying the `format` as `json` in the data body.
--- a/examples/python-loganalysis/loganalysis.py
+++ b/examples/python-loganalysis/loganalysis.py
@@ -16,12 +16,12 @@ def find_errors_in_log_file():
  with open(log_file_path, 'r') as log_file:
    log_lines = log_file.readlines()

-error_logs = []
-    for i, line in enumerate(log_lines):
-        if "error" in line.lower():
-            start_index = max(0, i - prelines)
-            end_index = min(len(log_lines), i + postlines + 1)
-            error_logs.extend(log_lines[start_index:end_index])
+  error_logs = []
+  for i, line in enumerate(log_lines):
+      if "error" in line.lower():
+          start_index = max(0, i - prelines)
+          end_index = min(len(log_lines), i + postlines + 1)
+          error_logs.extend(log_lines[start_index:end_index])

  return error_logs

@@ -32,7 +32,6 @@ data = {
  "model": "mattw/loganalyzer"
 }

-
 response = requests.post("http://localhost:11434/api/generate", json=data, stream=True)
 for line in response.iter_lines():
  if line:
--- a/examples/python-loganalysis/readme.md
+++ b/examples/python-loganalysis/readme.md
@@ -2,12 +2,34 @@

 ![loganalyzer 2023-11-10 08_53_29](https://github.com/jmorganca/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)

-This example shows one possible way to create a log file analyzer. To use it, run:
+This example shows one possible way to create a log file analyzer. It uses the model **mattw/loganalyzer** which is based on **codebooga**, a 34b parameter model.
+
+To use it, run:

 `python loganalysis.py <logfile>`

 You can try this with the `logtest.logfile` file included in this directory.

+## Running the Example
+
+1. Ensure you have the `mattw/loganalyzer` model installed:
+
+   ```bash
+   ollama pull mattw/loganalyzer
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python loganalysis.py logtest.logfile
+   ```
+
 ## Review the code

 The first part of this example is a Modelfile that takes `codebooga` and applies a new System Prompt:
@@ -45,4 +67,4 @@ for line in response.iter_lines():

 There is a lot more that can be done here. This is a simple way to detect errors, looking for the word error. Perhaps it would be interesting to find anomalous activity in the logs. It could be interesting to create embeddings for each line and compare them, looking for similar lines. Or look into applying Levenshtein Distance algorithms to find similar lines to help identify the anomalous lines.

-Also try different models and different prompts to analyze the data. You could consider adding retrieval augmented generation (RAG) to this to help understand newer log formats.
+Try different models and different prompts to analyze the data. You could consider adding retrieval augmented generation (RAG) to this to help understand newer log formats.
--- a/examples/python-rag-newssummary/README.md
+++ b/examples/python-rag-newssummary/README.md
@@ -14,9 +14,22 @@ This example goes through a series of steps:

 This example lets you pick from a few different topic areas, then summarize the most recent x articles for that topic. It then creates chunks of sentences from each article and then generates embeddings for each of those chunks.

-You can run the example like this:
+## Running the Example

-```bash
-pip install -r requirements.txt
-python summ.py
-```
+1. Ensure you have the `mistral-openorca` model installed:
+
+   ```bash
+   ollama pull mistral-openorca
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python summ.py
+   ```
--- a/examples/python-simplechat/client.py
+++ b/examples/python-simplechat/client.py
@@ -24,7 +24,6 @@ def chat(messages):
            # the response streams one token at a time, print that as we receive it
            print(content, end="", flush=True)

-
        if body.get("done", False):
            message["content"] = output
            return message
@@ -32,9 +31,11 @@ def chat(messages):

 def main():
    messages = []
-    
+
    while True:
        user_input = input("Enter a prompt: ")
+        if not user_input:
+            exit()
        print()
        messages.append({"role": "user", "content": user_input})
        message = chat(messages)
--- a/examples/python-simplechat/readme.md
+++ b/examples/python-simplechat/readme.md
@@ -1,6 +1,26 @@
 # Simple Chat Example

-The **chat** endpoint is one of two ways to generate text from an LLM with Ollama. At a high level you provide the endpoint an array of objects with a role and content specified. Then with each output and prompt, you add more of those role/content objects, which builds up the history.
+The **chat** endpoint is one of two ways to generate text from an LLM with Ollama, and was introduced in version 0.1.14. At a high level, you provide the endpoint an array of objects with a role and content specified. Then with each output and prompt, you add more of those role/content objects, which builds up the history.
+
+## Running the Example
+
+1. Ensure you have the `llama2` model installed:
+
+   ```bash
+   ollama pull llama2
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python client.py
+   ```

 ## Review the Code

--- a/examples/python-simplechat/requirements.txt
+++ b/examples/python-simplechat/requirements.txt
@@ -0,0 +1 @@
+Requests==2.31.0
--- a/examples/python-simplegenerate/README.md
+++ b/examples/python-simplegenerate/README.md
@@ -0,0 +1,29 @@
+# Simple Generate Example
+
+This is a simple example using the **Generate** endpoint.
+
+## Running the Example
+
+1. Ensure you have the `stablelm-zephyr` model installed:
+
+   ```bash
+   ollama pull stablelm-zephyr
+   ```
+
+2. Install the Python Requirements.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python client.py
+   ```
+
+## Review the Code
+
+The **main** function simply asks for input, then passes that to the generate function. The output from generate is then passed back to generate on the next run.
+
+The **generate** function uses `requests.post` to call `/api/generate`, passing the model, prompt, and context. The `generate` endpoint returns a stream of JSON blobs that are then iterated through, looking for the response values. That is then printed out. The final JSON object includes the full context of the conversation so far, and that is the return value from the function.
--- a/examples/python-simplegenerate/client.py
+++ b/examples/python-simplegenerate/client.py
@@ -2,7 +2,7 @@ import json
 import requests

 # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = 'llama2' # TODO: update this for whatever model you wish to use
+model = 'stablelm-zephyr' # TODO: update this for whatever model you wish to use

 def generate(prompt, context):
    r = requests.post('http://localhost:11434/api/generate',
@@ -30,6 +30,8 @@ def main():
    context = [] # the context stores a conversation history, you can use this to make the model more context aware
    while True:
        user_input = input("Enter a prompt: ")
+        if not user_input:
+            exit()
        print()
        context = generate(user_input, context)
        print()
--- a/examples/python-simplegenerate/requirements.txt
+++ b/examples/python-simplegenerate/requirements.txt
@@ -0,0 +1 @@
+Requests==2.31.0
--- a/examples/typescript-mentors/README.md
+++ b/examples/typescript-mentors/README.md
@@ -4,18 +4,62 @@ This example demonstrates how one would create a set of 'mentors' you can have a

 ## Usage

-```bash
-ts-node ./character-generator.ts "Lorne Greene"
-```
+1. Add llama2 to have the mentors answer your questions:

-This will create `lornegreene/Modelfile`. Now you can create a model with this command:
+   ```bash
+   ollama pull llama2
+   ```

-```bash
-ollama create lornegreene -f lornegreene/Modelfile
-```
+2. Install prerequisites:

-If you want to add your own mentors, you will have to update the code to look at your namespace instead of **mattw**. Also set the list of mentors to include yours.
+   ```bash
+   npm install
+   ```

-```bash
-ts-node ./mentors.ts "What is a Jackalope?"
-```
+3. Ask a question:
+
+   ```bash
+   npm start "what is a jackalope"
+   ```
+
+You can also add your own character to be chosen at random when you ask a question.
+
+1. Make sure you have the right model installed:
+
+   ```bash
+   ollama pull stablebeluga2:70b-q4_K_M
+   ```
+  
+2. Create a new character:
+  
+   ```bash
+   npm run charactergen "Lorne Greene"
+   ```
+
+   You can choose any well-known person you like. This example will create `lornegreene/Modelfile`.
+
+3. Now you can create a model with this command:
+
+   ```bash
+   ollama create <YourNamespace>/lornegreene -f lornegreene/Modelfile
+   ```
+
+   `YourNamespace` is whatever name you set up when you signed up at [https://ollama.ai/signup](https://ollama.ai/signup).
+
+4. To add this to your mentors, you will have to update the code as follows. On line 8 of `mentors.ts`, add an object to the array, replacing `<YourNamespace>` with the namespace you used above.
+
+   ```typescript
+   {ns: "<YourNamespace>", char: "Lorne Greene"}
+   ```
+
+## Review the Code
+
+There are two scripts you can run in this example. The first is the main script to ask the mentors a question. The other one lets you generate a character to add to the mentors. Both scripts are mostly about adjusting the prompts at each inference stage.
+
+### mentors.ts
+
+In the **main** function, it starts by generating a list of mentors. This chooses 3 from a list of interesting characters. Then we ask for a question, and then things get interesting. We set the prompt for each of the 3 mentors a little differently. And the 2nd and 3rd mentors see what the previous folks said. The other functions in `mentors.ts` set the prompts for each mentor.
+
+### character-generator.ts
+
+**Character Generator** simply customizes the prompt to build a character profile for any famous person. And most of the script is just tweaking the prompt. This uses the 70b-parameter version of Stable Beluga 2. The 70b models tend to do better writing a bio about a character than smaller models, and Stable Beluga seemed to do better than Llama 2. Since this is used at development time for the characters, it doesn't affect the runtime of asking the mentors for their input.
--- a/examples/typescript-mentors/mentors.ts
+++ b/examples/typescript-mentors/mentors.ts
@@ -2,10 +2,11 @@ import { Ollama } from 'ollama-node';

 const mentorCount = 3;
 const ollama = new Ollama();
+type Mentor = { ns: string, char: string };

-function getMentors(): string[] {
-  const mentors = ['Gary Vaynerchuk', 'Kanye West', 'Martha Stewart', 'Neil deGrasse Tyson', 'Owen Wilson', 'Ronald Reagan', 'Donald Trump', 'Barack Obama', 'Jeff Bezos'];
-  const chosenMentors: string[] = [];
+function getMentors(): Mentor[] {
+  const mentors = [{ ns: 'mattw', char: 'Gary Vaynerchuk' }, { ns: 'mattw', char: 'Kanye West'}, {ns: 'mattw', char: 'Martha Stewart'}, {ns: 'mattw', char: 'Neil deGrasse Tyson'}, {ns: 'mattw', char: 'Owen Wilson'}, {ns: 'mattw', char: 'Ronald Reagan'}, {ns: 'mattw', char: 'Donald Trump'}, {ns: 'mattw', char: 'Barack Obama'}, {ns: 'mattw', char: 'Jeff Bezos'}];
+  const chosenMentors: Mentor[] = [];
  for (let i = 0; i < mentorCount; i++) {
    const mentor = mentors[Math.floor(Math.random() * mentors.length)];
    chosenMentors.push(mentor);
@@ -14,12 +15,12 @@ function getMentors(): string[] {
  return chosenMentors;
 }

-function getMentorFileName(mentor: string): string {
-  const model = mentor.toLowerCase().replace(/\s/g, '');
-  return `mattw/${model}`;
+function getMentorFileName(mentor: Mentor): string {
+  const model = mentor.char.toLowerCase().replace(/\s/g, '');
+  return `${mentor.ns}/${model}`;
 }

-async function getSystemPrompt(mentor: string, isLast: boolean, question: string): Promise<string> {
+async function getSystemPrompt(mentor: Mentor, isLast: boolean, question: string): Promise<string> {
  ollama.setModel(getMentorFileName(mentor));
  const info = await ollama.showModelInfo()
  let SystemPrompt = info.system || '';
@@ -43,8 +44,8 @@ async function main() {
    ollama.setModel(getMentorFileName(mentor));
    ollama.setSystemPrompt(SystemPrompt);
    let output = '';
-    process.stdout.write(`\n${mentor}: `);
-    for await (const chunk of ollama.streamingGenerate(theConversation + `Continue the conversation as if you were ${mentor} on the question "${question}".`)) {
+    process.stdout.write(`\n${mentor.char}: `);
+    for await (const chunk of ollama.streamingGenerate(theConversation + `Continue the conversation as if you were ${mentor.char} on the question "${question}".`)) {
      if (chunk.response) {
        output += chunk.response;
        process.stdout.write(chunk.response);
@@ -52,7 +53,7 @@ async function main() {
        process.stdout.write('\n');
      }
    }
-    theConversation += `${mentor}: ${output}\n\n`
+    theConversation += `${mentor.char}: ${output}\n\n`
  }
 }

--- a/examples/typescript-mentors/package.json
+++ b/examples/typescript-mentors/package.json
@@ -1,7 +1,15 @@
 {
+  "scripts": {
+    "charactergen": "tsx character-generator.ts", 
+    "start": "tsx mentors.ts"
+  },
  "dependencies": {
    "fs": "^0.0.1-security",
    "ollama-node": "^0.0.3",
    "path": "^0.12.7"
+  },
+  "devDependencies": {
+    "tsx": "^4.6.2",
+    "typescript": "^5.3.3"
  }
 }
--- a/examples/typescript-simplechat/package.json
+++ b/examples/typescript-simplechat/package.json
@@ -1 +1,12 @@
-{ "dependencies": { "@types/node": "^20.10.4", "prompt-sync": "^4.2.0", "readline": "^1.3.0" } }
+{ 
+  "scripts": {
+    "start": "tsx client.ts"
+  }, 
+  "dependencies": {
+     "@types/node": "^20.10.4", 
+     "prompt-sync": "^4.2.0", 
+     "readline": "^1.3.0", 
+     "tsx": "^4.6.2", 
+     "typescript": "^5.3.3" 
+     } 
+    }
--- a/examples/typescript-simplechat/readme.md
+++ b/examples/typescript-simplechat/readme.md
@@ -1,14 +1,10 @@
 # Simple Chat Example

-The **chat** endpoint is one of two ways to generate text from an LLM with Ollama. At a high level you provide the endpoint an array of message objects with a role and content specified. Then with each output and prompt, you add more messages, which builds up the history.
+The **chat** endpoint, available as of v0.1.14, is one of two ways to generate text from an LLM with Ollama. At a high level, you provide the endpoint an array of message objects with a role and content specified. Then with each output and prompt, you add more messages, which builds up the history.

 ## Run the Example

-There are a few ways to run this, just like any Typescript code:
-
-1. Compile with `tsc` and then run it with `node client.js`.
-2. Install `tsx` and run it with `tsx client.ts`.
-3. Install `bun` and run it with `bun client.ts`.
+`npm start`

 ## Review the Code

@@ -30,7 +26,7 @@ With the **generate** endpoint, you need to provide a `prompt`. But with **chat*

 The final JSON object doesn't provide the full content, so you will need to build the content yourself. In this example, **chat** takes the full array of messages and outputs the resulting message from this call of the chat endpoint.

-In the **askQuestion** function, we collect `user_input` and add it as a message to our messages and that is passed to the chat function. When the LLM is done responding the output is added as another message to the messages array.
+In the **askQuestion** function, we collect `user_input` and add it as a message to our messages, and that is passed to the chat function. When the LLM is done responding, the output is added as another message to the messages array.

 At the end, you will see a printout of all the messages.

--- a/go.mod
+++ b/go.mod
@@ -7,11 +7,14 @@ require (
 	github.com/gin-gonic/gin v1.9.1
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
+	github.com/stretchr/testify v1.8.4
 	golang.org/x/sync v0.3.0
 )

 require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/mattn/go-runewidth v0.0.14 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
 )

--- a/go.sum
+++ b/go.sum
@@ -98,8 +98,9 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
-github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
 github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -0,0 +1,134 @@
+//go:build linux || windows
+
+package gpu
+
+/*
+#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
+#cgo windows LDFLAGS: -lpthread
+
+#include "gpu_info.h"
+
+*/
+import "C"
+import (
+	"fmt"
+	"log"
+	"sync"
+	"unsafe"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+type handles struct {
+	cuda *C.cuda_handle_t
+	rocm *C.rocm_handle_t
+}
+
+var gpuMutex sync.Mutex
+var gpuHandles *handles = nil
+
+// initGPUHandles probes for CUDA first and only then ROCm, storing whichever handle initialized in gpuHandles. Note: gpuMutex must already be held
+func initGPUHandles() {
+	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
+	log.Printf("Detecting GPU type")
+	gpuHandles = &handles{nil, nil} // stays {nil, nil} when neither CUDA nor ROCm initializes
+	var resp C.cuda_init_resp_t
+	C.cuda_init(&resp)
+	if resp.err != nil {
+		log.Printf("CUDA not detected: %s", C.GoString(resp.err))
+		C.free(unsafe.Pointer(resp.err)) // cuda_init strdup's the message, so it is freed here
+
+		var resp C.rocm_init_resp_t // shadows the CUDA resp for the ROCm attempt
+		C.rocm_init(&resp)
+		if resp.err != nil {
+			log.Printf("ROCm not detected: %s", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			log.Printf("Radeon GPU detected")
+			rocm := resp.rh
+			gpuHandles.rocm = &rocm
+		}
+	} else {
+		log.Printf("Nvidia GPU detected")
+		cuda := resp.ch
+		gpuHandles.cuda = &cuda
+	}
+}
+
+func GetGPUInfo() GpuInfo { // GetGPUInfo reports the driver in use, the server library name, and total/free memory
+	// TODO - consider exploring lspci (and equivalent on windows) to check for
+	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	if gpuHandles == nil { // first call: run GPU detection once while holding gpuMutex
+		initGPUHandles()
+	}
+
+	var memInfo C.mem_info_t
+	resp := GpuInfo{"", "", 0, 0}
+	if gpuHandles.cuda != nil {
+		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+		if memInfo.err != nil {
+			log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else {
+			resp.Driver = "CUDA"
+			resp.Library = "cuda_server"
+		}
+	} else if gpuHandles.rocm != nil {
+		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
+		if memInfo.err != nil {
+			log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else {
+			resp.Driver = "ROCM"
+			resp.Library = "rocm_server"
+		}
+	}
+	if resp.Driver == "" { // no GPU handle, or its VRAM query failed: report system RAM instead
+		C.cpu_check_ram(&memInfo) // resets memInfo.err first, so a GPU error freed above is not seen again
+		resp.Driver = "CPU"
+		// In the future we may offer multiple CPU variants to tune CPU features
+		resp.Library = "default"
+	}
+	if memInfo.err != nil { // only the CPU lookup can leave err set here; memory sizes stay 0
+		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
+		C.free(unsafe.Pointer(memInfo.err))
+		return resp
+	}
+	resp.FreeMemory = uint64(memInfo.free)
+	resp.TotalMemory = uint64(memInfo.total)
+	return resp
+}
+
+func CheckVRAM() (int64, error) { // CheckVRAM returns the free GPU memory from GetGPUInfo, or an error when the driver is CPU or nothing is free
+	gpuInfo := GetGPUInfo()
+	if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" {
+		return int64(gpuInfo.FreeMemory), nil
+	}
+	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
+}
+
+// NumGPU returns how many model layers to offload to the GPU: opts.NumGPU when it is not -1, otherwise an estimate from free VRAM.
+func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
+	if opts.NumGPU != -1 {
+		return opts.NumGPU
+	}
+	info := GetGPUInfo()
+	if info.Driver == "CPU" {
+		return 0
+	}
+
+	// Guard the divisions below: a non-positive layer count, or a file smaller than one byte per layer, would panic with a divide by zero.
+	if numLayer <= 0 || fileSizeBytes < numLayer {
+		return 0
+	}
+	// Bytes per layer is roughly the model file size divided by the number of layers (no extra allowance is made here for the kv cache).
+	bytesPerLayer := uint64(fileSizeBytes / numLayer)
+
+	// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
+	layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
+
+	log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Driver, numLayer)
+	return layers
+}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -0,0 +1,41 @@
+//go:build darwin
+
+package gpu
+
+import "C"
+import (
+	"runtime"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+// CheckVRAM is the darwin stub: it does not query the GPU and always returns 0 with a nil error
+func CheckVRAM() (int64, error) {
+	// TODO - assume metal, and return free memory?
+	return 0, nil
+
+}
+
+func GetGPUInfo() GpuInfo { // darwin: returns a fixed description, no hardware is probed
+	// TODO - Metal vs. x86 macs... (for now METAL is reported unconditionally, with both memory sizes left at 0)
+
+	return GpuInfo{
+		Driver:      "METAL",
+		Library:     "default",
+		TotalMemory: 0,
+		FreeMemory:  0,
+	}
+}
+
+func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { // darwin: all three arguments are ignored
+	if runtime.GOARCH == "arm64" {
+		return 1 // presumably any non-zero value turns Metal offload on - TODO confirm against the caller
+	}
+
+	// metal only supported on arm64
+	return 0
+}
+
+func nativeInit() error { // darwin: nothing to initialize, always returns nil
+	return nil
+}
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -0,0 +1,49 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_H__
+#define __GPU_INFO_H__
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef _WIN32
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() dlerror()
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#else
+#include <windows.h>
+#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
+#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
+#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
+
+// Formats GetLastError() as hex into a shared static buffer (overwritten by the next call). TODO - proper error message handling on windows
+inline static char *LOAD_ERR() {
+  static char errbuf[16];  // "0x" + up to 8 hex digits + NUL; an 8-byte buffer truncated codes above 0xfffff
+  snprintf(errbuf, sizeof(errbuf), "0x%lx", GetLastError());
+  return errbuf;
+}
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct mem_info {  // result of a memory query (CPU RAM or GPU VRAM)
+  uint64_t total;  // total memory, in bytes
+  uint64_t free;   // currently free memory, in bytes
+  char *err;  // If non-NULL, the query failed and the caller is responsible for freeing it
+} mem_info_t;
+
+void cpu_check_ram(mem_info_t *resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#include "gpu_info_cuda.h"
+#include "gpu_info_rocm.h"
+
+#endif  // __GPU_INFO_H__
+#endif  // __APPLE__
--- a/gpu/gpu_info_cpu.c
+++ b/gpu/gpu_info_cpu.c
@@ -0,0 +1,42 @@
+#include "gpu_info.h"
+// Fallbacks for CPU mode
+
+#ifdef _WIN32
+#include <sysinfoapi.h>
+// Windows: reports total and available physical RAM in bytes; on failure resp->err is a strdup'd message the caller frees.
+void cpu_check_ram(mem_info_t *resp) {
+  MEMORYSTATUSEX info = {.dwLength = sizeof(info)};  // dwLength must be set or GlobalMemoryStatusEx fails
+  resp->err = NULL;
+  if (GlobalMemoryStatusEx(&info) != 0) {
+    resp->total = info.ullTotalPhys;
+    resp->free = info.ullAvailPhys;
+  } else {
+    resp->err = strdup(LOAD_ERR());
+  }
+}
+
+#elif __linux__
+#include <errno.h>
+#include <string.h>
+#include <sys/sysinfo.h>
+// Linux: reports total and free RAM in bytes via sysinfo(2); on failure resp->err is a strdup'd message the caller frees.
+void cpu_check_ram(mem_info_t *resp) {
+  struct sysinfo info;
+  resp->err = NULL;
+  if (sysinfo(&info) != 0) {
+    resp->err = strdup(strerror(errno));
+  } else {
+    resp->total = (uint64_t)info.totalram * info.mem_unit;  // widen first: unsigned long can overflow on 32-bit builds
+    resp->free = (uint64_t)info.freeram * info.mem_unit;
+  }
+}
+
+#elif __APPLE__
+// TODO consider an Apple implementation that does something useful
+// mem_info_t cpu_check_ram() {
+//   mem_info_t resp = {0, 0, NULL};
+//   return resp;
+// }
+#else
+#error "Unsupported platform"
+#endif
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -0,0 +1,106 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include "gpu_info_cuda.h"
+
+#include <string.h>
+
+#ifndef _WIN32
+const char *cuda_lib_paths[] = {
+    "libnvidia-ml.so",
+    "/usr/local/cuda/lib64/libnvidia-ml.so",
+    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
+    "/usr/lib/wsl/lib/libnvidia-ml.so.1",  // TODO Maybe glob?
+    NULL,
+};
+#else
+const char *cuda_lib_paths[] = {
+    "nvml.dll",
+    "",
+    NULL,
+};
+#endif
+
+// Loads the NVML library, resolves the entry points used for VRAM queries, and calls nvmlInit_v2.
+// On failure resp->err is a strdup'd message that the caller must free, and resp->ch must not be used.
+void cuda_init(cuda_init_resp_t *resp) {
+  nvmlReturn_t ret;
+  resp->err = NULL;
+  resp->ch.handle = NULL;  // the search loop below stops at the first non-NULL handle
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[4] = {
+      {"nvmlInit_v2", (void *)&resp->ch.initFn},
+      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
+  };
+
+  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
+    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
+  }
+  if (!resp->ch.handle) {
+    // TODO improve error message: LOAD_ERR typically reports only the final path that was checked, which might be confusing.
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             cuda_lib_paths[0], LOAD_ERR());
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < 4; i++) {  // TODO - fix this to use a null terminated list
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*l[i].p) {  // test the resolved symbol; the slot address l[i].p itself is never NULL
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               LOAD_ERR());  // read the loader error before unloading, which may reset it
+      resp->err = strdup(buf);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      return;
+    }
+  }
+
+  ret = (*resp->ch.initFn)();
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+  }
+}
+
+// Queries NVML for the total and free memory of device 0 and stores them in resp.
+// On failure resp->err is a strdup'd message that the caller must free; total/free are left untouched.
+void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  nvmlDevice_t device;
+  nvmlMemory_t memInfo = {0};
+  nvmlReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle isn't initialized");
+    return;
+  }
+
+  // TODO - handle multiple GPUs
+  ret = (*h.getHandle)(0, &device);
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device handle: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  ret = (*h.getMemInfo)(device, &memInfo);
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  resp->total = memInfo.total;
+  resp->free = memInfo.free;
+}
+#endif  // __APPLE__
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -0,0 +1,35 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_CUDA_H__
+#define __GPU_INFO_CUDA_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum nvmlReturn_enum {
+  NVML_SUCCESS = 0,
+  // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t;  // Opaque is sufficient
+typedef struct nvmlMemory_st {
+  unsigned long long total;
+  unsigned long long free;
+  unsigned long long used;
+} nvmlMemory_t;
+
+typedef struct cuda_handle {
+  void *handle;
+  nvmlReturn_t (*initFn)(void);
+  nvmlReturn_t (*shutdownFn)(void);
+  nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
+} cuda_handle_t;
+
+typedef struct cuda_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  cuda_handle_t ch;
+} cuda_init_resp_t;
+
+void cuda_init(cuda_init_resp_t *resp);
+void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
+
+#endif  // __GPU_INFO_CUDA_H__
+#endif  // __APPLE__
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -0,0 +1,114 @@
+#ifndef __APPLE__
+
+#include "gpu_info_rocm.h"
+
+#include <string.h>
+
+#ifndef _WIN32
+const char *rocm_lib_paths[] = {
+    "librocm_smi64.so",
+    "/opt/rocm/lib/librocm_smi64.so",
+    NULL,
+};
+#else
+// TODO untested
+const char *rocm_lib_paths[] = {
+    "rocm_smi64.dll",
+    "/opt/rocm/lib/rocm_smi64.dll",
+    NULL,
+};
+#endif
+
+// Load the ROCm SMI library from the first entry of rocm_lib_paths that
+// opens, resolve the rsmi_* entry points into resp->rh and call rsmi_init(0).
+// On any failure resp->err holds a strdup'd message; after a failed symbol
+// lookup the library is unloaded and resp->rh.handle is reset to NULL.
+// NOTE(review): the load loop reads resp->rh.handle before assigning it, so
+// this assumes the caller passes a zero-initialized resp — confirm.
+void rocm_init(rocm_init_resp_t *resp) {
+  rsmi_status_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+  struct lookup {
+    char *s;
+    void **p;
+  } l[4] = {
+      {"rsmi_init", (void *)&resp->rh.initFn},
+      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
+      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
+      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
+      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
+  };
+
+  for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
+    resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
+  }
+  if (!resp->rh.handle) {
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Radeon GPUs: %s\n",
+             rocm_lib_paths[0], LOAD_ERR());
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < 4; i++) {
+    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
+    // Test the resolved symbol; l[i].p is the address of the slot in resp->rh
+    // and is never NULL, so checking it would never detect a failed lookup.
+    if (!*l[i].p) {
+      // Format the message before unloading so LOAD_ERR() still reports the
+      // failed lookup rather than whatever the unload leaves behind.
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               LOAD_ERR());
+      UNLOAD_LIBRARY(resp->rh.handle);
+      resp->rh.handle = NULL;  // do not hand back a dangling handle
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->rh.initFn)(0);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
+    resp->err = strdup(buf);
+  }
+}
+
+// Query ROCm SMI for device index 0 and fill resp with its VRAM figures.
+// resp->total is the RSMI_MEM_TYPE_VRAM total and resp->free is that total
+// minus the reported usage. On failure resp->err is a strdup'd message and
+// total/free are left untouched.
+void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  // uint32_t num_devices;
+  // uint16_t device;
+  uint64_t totalMem = 0;
+  uint64_t usedMem = 0;
+  rsmi_status_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+
+  if (h.handle == NULL) {
+    resp->err = strdup("rocm handle isn't initialized");
+    return;
+  }
+
+  // TODO - iterate through devices...  ret =
+  // rsmi_num_monitor_devices(&num_devices);
+
+  // ret = (*h.getHandle)(0, &device);
+  // if (ret != RSMI_STATUS_SUCCESS) {
+  //     printf("rocm vram device lookup failure: %d\n", ret);
+  //     return -1;
+  // }
+
+  // Get total memory - used memory for available memory
+  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  resp->total = totalMem;
+  resp->free = totalMem - usedMem;
+}
+
+#endif  // __APPLE__
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -0,0 +1,36 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_ROCM_H__
+#define __GPU_INFO_ROCM_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum rsmi_status_return {
+  RSMI_STATUS_SUCCESS = 0,
+  // Other values omitted for now...
+} rsmi_status_t;
+
+typedef enum rsmi_memory_type {
+  RSMI_MEM_TYPE_VRAM = 0,
+  RSMI_MEM_TYPE_VIS_VRAM,
+  RSMI_MEM_TYPE_GTT,
+} rsmi_memory_type_t;
+
+typedef struct rocm_handle {
+  void *handle;
+  rsmi_status_t (*initFn)(uint64_t);
+  rsmi_status_t (*shutdownFn)(void);
+  rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
+} rocm_handle_t;
+
+typedef struct rocm_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  rocm_handle_t rh;
+} rocm_init_resp_t;
+
+void rocm_init(rocm_init_resp_t *resp);
+void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
+
+#endif  // __GPU_INFO_ROCM_H__
+#endif  // __APPLE__
--- a/gpu/gpu_test.go
+++ b/gpu/gpu_test.go
@@ -0,0 +1,26 @@
+package gpu
+
+import (
+	"runtime"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestBasicGetGPUInfo checks that GetGPUInfo reports one of the known driver
+// names and, on Linux and Windows, non-zero total and free memory.
+func TestBasicGetGPUInfo(t *testing.T) {
+	info := GetGPUInfo()
+	// Match against a list of names: with a string as the container,
+	// assert.Contains does a substring test, so "" or "CPU M" would pass.
+	assert.Contains(t, []string{"CUDA", "ROCM", "CPU", "METAL"}, info.Driver)
+
+	switch runtime.GOOS {
+	case "darwin":
+		// TODO - remove this once MacOS returns some size for CPU
+		return
+	case "linux", "windows":
+		assert.Greater(t, info.TotalMemory, uint64(0))
+		assert.Greater(t, info.FreeMemory, uint64(0))
+	default:
+		return
+	}
+}
+
+// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -0,0 +1,11 @@
+package gpu
+
+// Beginning of an `ollama info` command
+//
+// GpuInfo carries a driver name, a library name and total/free memory
+// figures, each serialized to JSON only when non-zero (omitempty).
+type GpuInfo struct {
+	Driver      string `json:"driver,omitempty"` // gpu_test.go expects one of CUDA, ROCM, CPU, METAL
+	Library     string `json:"library,omitempty"`
+	TotalMemory uint64 `json:"total_memory,omitempty"` // presumably bytes — TODO confirm
+	FreeMemory  uint64 `json:"free_memory,omitempty"` // presumably bytes — TODO confirm
+
+	// TODO add other useful attributes about the card here for discovery information
+}
--- a/llm/dynamic_shim.c
+++ b/llm/dynamic_shim.c
@@ -0,0 +1,136 @@
+#include "dynamic_shim.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __linux__
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() dlerror()
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#elif _WIN32
+#include <windows.h>
+#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
+#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
+#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
+// TODO - refactor this with proper error message handling on windows
+// Format GetLastError() as a hex string in a static buffer and return it.
+// The buffer is shared by every call in this translation unit, so the result
+// must be consumed before LOAD_ERR() is called again.
+inline static char *LOAD_ERR() {
+  // "0x" + up to 8 hex digits of a 32-bit code + NUL needs 11 bytes; the
+  // former 8-byte buffer truncated any code above 0xfffff.
+  static char errbuf[16];
+  snprintf(errbuf, sizeof(errbuf), "0x%lx", GetLastError());
+  return errbuf;
+}
+#else
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() dlerror()
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#endif
+
+// Load the llama server shared library at libPath and resolve every
+// llama_server_* entry point into the matching function pointer of *s.
+//
+// On failure err->id is set to -1 and a description is written to err->msg
+// (at most err->msg_len bytes); after a failed symbol lookup the library is
+// unloaded and s->handle is reset to NULL. err is not written on success.
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err) {
+  int i = 0;
+  // Table of symbol name -> destination slot, terminated by a NULL slot.
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"llama_server_init", (void *)&s->llama_server_init},
+      {"llama_server_start", (void *)&s->llama_server_start},
+      {"llama_server_stop", (void *)&s->llama_server_stop},
+      {"llama_server_completion", (void *)&s->llama_server_completion},
+      {"llama_server_completion_next_result",
+       (void *)&s->llama_server_completion_next_result},
+      {"llama_server_completion_cancel",
+       (void *)&s->llama_server_completion_cancel},
+      {"llama_server_release_task_result",
+       (void *)&s->llama_server_release_task_result},
+      {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
+      {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
+      {"llama_server_embedding", (void *)&s->llama_server_embedding},
+      {"llama_server_release_json_resp",
+       (void *)&s->llama_server_release_json_resp},
+      {"", NULL},
+  };
+
+  printf("Lazy loading %s library\n", libPath);
+  s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
+  if (!s->handle) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len,
+             "Unable to load dynamic server library: %s", LOAD_ERR());
+    return;
+  }
+
+  for (i = 0; l[i].p != NULL; i++) {
+    *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
+    // Test the resolved symbol; l[i].p itself is the slot address and is
+    // never NULL inside this loop, so checking it cannot detect a failure.
+    if (!*l[i].p) {
+      err->id = -1;
+      // Format the message before unloading so LOAD_ERR() still describes
+      // the failed lookup.
+      snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
+               l[i].s, LOAD_ERR());
+      UNLOAD_LIBRARY(s->handle);
+      s->handle = NULL;  // do not leave a dangling handle behind
+      return;
+    }
+  }
+}
+
+inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                           ext_server_params_t *sparams,
+                                           ext_server_resp_t *err) {
+  s.llama_server_init(sparams, err);
+}
+
+inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
+  s.llama_server_start();
+}
+
+inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
+  s.llama_server_stop();
+}
+
+inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 ext_server_resp_t *resp) {
+  s.llama_server_completion(json_req, resp);
+}
+
+inline void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result) {
+  s.llama_server_completion_next_result(task_id, result);
+}
+
+inline void dynamic_shim_llama_server_completion_cancel(
+    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
+  s.llama_server_completion_cancel(task_id, err);
+}
+inline void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result) {
+  s.llama_server_release_task_result(result);
+}
+
+inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                               const char *json_req,
+                                               char **json_resp,
+                                               ext_server_resp_t *err) {
+  s.llama_server_tokenize(json_req, json_resp, err);
+}
+
+inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 char **json_resp,
+                                                 ext_server_resp_t *err) {
+  s.llama_server_detokenize(json_req, json_resp, err);
+}
+
+inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                                const char *json_req,
+                                                char **json_resp,
+                                                ext_server_resp_t *err) {
+  s.llama_server_embedding(json_req, json_resp, err);
+}
+
+inline void dynamic_shim_llama_server_release_json_resp(
+    struct dynamic_llama_server s, char **json_resp) {
+  s.llama_server_release_json_resp(json_resp);
+}
--- a/llm/dynamic_shim.h
+++ b/llm/dynamic_shim.h
@@ -0,0 +1,74 @@
+#include <stdlib.h>
+
+#include "server.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct dynamic_llama_server {
+  void *handle;
+  void (*llama_server_init)(ext_server_params_t *sparams,
+                            ext_server_resp_t *err);
+  void (*llama_server_start)();
+  void (*llama_server_stop)();
+  void (*llama_server_completion)(const char *json_req,
+                                  ext_server_resp_t *resp);
+  void (*llama_server_completion_next_result)(const int task_id,
+                                              ext_server_task_result_t *result);
+  void (*llama_server_completion_cancel)(const int task_id,
+                                         ext_server_resp_t *err);
+  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
+  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
+                                  ext_server_resp_t *err);
+  void (*llama_server_embedding)(const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+  void (*llama_server_release_json_resp)(char **json_resp);
+};
+
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err);
+
+// No good way to call C function pointers from Go so inline the indirection
+void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                    ext_server_params_t *sparams,
+                                    ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          ext_server_resp_t *resp);
+
+void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
+                                                 const int task_id,
+                                                 ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                        const char *json_req, char **json_resp,
+                                        ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          char **json_resp,
+                                          ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                         const char *json_req, char **json_resp,
+                                         ext_server_resp_t *err);
+void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
+                                                 char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
--- a/llm/ext_server.go
+++ b/llm/ext_server.go
@@ -0,0 +1,423 @@
+package llm
+
+/*
+#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
+#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
+#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
+#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
+#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
+#cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
+#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
+#cgo darwin LDFLAGS: -lc++ -framework Accelerate
+#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a
+#cgo linux CFLAGS: -D_GNU_SOURCE
+#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
+#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
+#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
+#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincpu/dist/lib
+#cgo windows LDFLAGS: -lcpu_server -lpthread
+
+#include <stdlib.h>
+#include "server.h"
+
+*/
+import "C"
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"os"
+	"strings"
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/gpu"
+)
+
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+	var resp C.ext_server_resp_t
+	resp.msg_len = len
+	bytes := make([]byte, len)
+	resp.msg = (*C.char)(C.CBytes(bytes))
+	return resp
+}
+
+func freeExtServerResp(resp C.ext_server_resp_t) {
+	if resp.msg_len == 0 {
+		return
+	}
+	C.free(unsafe.Pointer(resp.msg))
+}
+
+// extServerResponseToErr converts the message carried by resp into a Go error.
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+	// Pass the message as an argument rather than as the format string, so a
+	// '%' in the server's text is reported verbatim instead of being parsed
+	// as a formatting verb.
+	return fmt.Errorf("%s", C.GoString(resp.msg))
+}
+
+type extServer interface {
+	LLM
+	llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
+	llama_server_start()
+	llama_server_stop()
+	llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
+	llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
+	llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
+	llama_server_release_task_result(result *C.ext_server_task_result_t)
+	llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_release_json_resp(json_resp **C.char)
+}
+
+type llamaExtServer struct {
+	api.Options
+}
+
+// Note: current implementation does not support concurrent instantiations
+var mutex sync.Mutex
+
+func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
+	C.llama_server_init(sparams, err)
+}
+func (llm *llamaExtServer) llama_server_start() {
+	C.llama_server_start()
+}
+func (llm *llamaExtServer) llama_server_stop() {
+	C.llama_server_stop()
+}
+
+func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
+	C.llama_server_completion(json_req, resp)
+}
+func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
+	C.llama_server_completion_next_result(task_id, resp)
+}
+func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
+	C.llama_server_completion_cancel(task_id, err)
+}
+func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
+	C.llama_server_release_task_result(result)
+}
+
+func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_tokenize(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_detokenize(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_embedding(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
+	C.llama_server_release_json_resp(json_resp)
+}
+
+func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	server := &llamaExtServer{opts}
+	return newExtServer(server, model, adapters, projectors, numLayers, opts)
+}
+
+// newExtServer fills in the C server parameters for the model file at path
+// model (plus optional LoRA adapters and the first projector), initializes
+// server with them and starts its main loop.
+//
+// Only one server may be active at a time: the package-level mutex is taken
+// here (waiting for a prior server if necessary) and stays held until close()
+// is called for the returned server. If the model cannot be stat'ed or
+// llama_server_init reports an error, the mutex is released again before the
+// error is returned, so a failed start does not block later callers forever.
+func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	if !mutex.TryLock() {
+		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
+		mutex.Lock()
+	}
+	// Release the lock on every path that does not hand a running server back.
+	started := false
+	defer func() {
+		if !started {
+			mutex.Unlock()
+		}
+	}()
+
+	fileInfo, err := os.Stat(model)
+	if err != nil {
+		return nil, err
+	}
+	var sparams C.ext_server_params_t
+	sparams.model = C.CString(model)
+	defer C.free(unsafe.Pointer(sparams.model))
+
+	numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)
+
+	sparams.embedding = true
+	sparams.n_ctx = C.uint(opts.NumCtx)
+	sparams.n_batch = C.uint(opts.NumBatch)
+	sparams.n_gpu_layers = C.int(numGPU)
+	sparams.main_gpu = C.int(opts.MainGPU)
+	sparams.n_parallel = 1 // TODO - wire up concurrency
+
+	// Always use the value encoded in the model
+	sparams.rope_freq_base = 0.0
+	sparams.rope_freq_scale = 0.0
+	sparams.memory_f16 = C.bool(opts.F16KV)
+	sparams.use_mlock = C.bool(opts.UseMLock)
+	sparams.use_mmap = C.bool(opts.UseMMap)
+	sparams.numa = C.bool(opts.UseNUMA)
+
+	// Build a singly linked list of adapters in C memory; every node and its
+	// path string are freed when this function returns.
+	sparams.lora_adapters = nil
+	for i := 0; i < len(adapters); i++ {
+		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
+		defer C.free(unsafe.Pointer(la))
+		la.adapter = C.CString(adapters[i])
+		defer C.free(unsafe.Pointer(la.adapter))
+		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
+		la.next = nil
+		if i == 0 {
+			sparams.lora_adapters = la
+		} else {
+			// Walk to the current tail and append.
+			tmp := sparams.lora_adapters
+			for ; tmp.next != nil; tmp = tmp.next {
+			}
+			tmp.next = la
+		}
+	}
+
+	if len(projectors) > 0 {
+		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
+		sparams.mmproj = C.CString(projectors[0])
+		defer C.free(unsafe.Pointer(sparams.mmproj))
+	} else {
+		sparams.mmproj = nil
+	}
+
+	sparams.n_threads = C.uint(opts.NumThread)
+
+	log.Printf("Initializing internal llama server")
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	server.llama_server_init(&sparams, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
+	}
+
+	log.Printf("Starting internal llama main loop")
+	server.llama_server_start()
+	started = true // from here on the lock is owned by the running server
+	return server, nil
+}
+
+func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
+	return predict(llm, llm.Options, ctx, pred, fn)
+}
+
+// predict submits a streaming completion request built from opts and predict
+// to llm. fn is invoked for every result with non-empty content, and once
+// more with Done set (plus timing counters) when the server reports a stop.
+//
+// A result flagged as an error whose JSON mentions "slot unavailable" makes
+// the whole request be resubmitted after a delay that starts at 100µs and
+// doubles each time, for at most maxRetries attempts. When ctx is done the
+// server-side task is cancelled and nil is returned, unless the cancel call
+// itself reports an error.
+func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var imageData []ImageData
+	if len(predict.Images) > 0 {
+		for cnt, i := range predict.Images {
+			imageData = append(imageData, ImageData{Data: i, ID: cnt})
+		}
+	}
+	log.Printf("loaded %d images", len(imageData))
+
+	request := map[string]any{
+		"prompt":            predict.Prompt,
+		"stream":            true,
+		"n_predict":         opts.NumPredict,
+		"n_keep":            opts.NumKeep,
+		"temperature":       opts.Temperature,
+		"top_k":             opts.TopK,
+		"top_p":             opts.TopP,
+		"tfs_z":             opts.TFSZ,
+		"typical_p":         opts.TypicalP,
+		"repeat_last_n":     opts.RepeatLastN,
+		"repeat_penalty":    opts.RepeatPenalty,
+		"presence_penalty":  opts.PresencePenalty,
+		"frequency_penalty": opts.FrequencyPenalty,
+		"mirostat":          opts.Mirostat,
+		"mirostat_tau":      opts.MirostatTau,
+		"mirostat_eta":      opts.MirostatEta,
+		"penalize_nl":       opts.PenalizeNewline,
+		"seed":              opts.Seed,
+		"stop":              opts.Stop,
+		"image_data":        imageData,
+	}
+
+	if predict.Format == "json" {
+		request["grammar"] = jsonGrammar
+	}
+
+	retryDelay := 100 * time.Microsecond
+	for retries := 0; retries < maxRetries; retries++ {
+		if retries > 0 {
+			time.Sleep(retryDelay) // wait before retrying
+			retryDelay *= 2        // exponential backoff
+		}
+
+		// Handling JSON marshaling with special characters unescaped.
+		buffer := &bytes.Buffer{}
+		enc := json.NewEncoder(buffer)
+		enc.SetEscapeHTML(false)
+
+		if err := enc.Encode(request); err != nil {
+			return fmt.Errorf("failed to marshal data: %w", err)
+		}
+
+		req := C.CString(buffer.String())
+		// NOTE: this defer sits inside the retry loop, so each attempt's
+		// request string is only freed when predict returns.
+		defer C.free(unsafe.Pointer(req))
+
+		// On success resp.id carries the task id used by the calls below.
+		llm.llama_server_completion(req, &resp)
+		if resp.id < 0 {
+			return extServerResponseToErr(resp)
+		}
+
+		retryNeeded := false
+	out:
+		for {
+			select {
+			case <-ctx.Done():
+				// This handles the request cancellation
+				// resp is reused: it supplies the task id and receives the
+				// outcome of the cancel call.
+				llm.llama_server_completion_cancel(resp.id, &resp)
+				if resp.id < 0 {
+					return extServerResponseToErr(resp)
+				} else {
+					return nil
+				}
+			default:
+				// Presumably next_result blocks until the server has a result
+				// for this task — TODO confirm; otherwise this loop spins.
+				var result C.ext_server_task_result_t
+				llm.llama_server_completion_next_result(resp.id, &result)
+				// Copy the JSON into Go memory before handing the result back.
+				json_resp := C.GoString(result.json_resp)
+				llm.llama_server_release_task_result(&result)
+
+				var p prediction
+				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
+					llm.llama_server_completion_cancel(resp.id, &resp)
+					if resp.id < 0 {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
+					} else {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
+					}
+				}
+
+				// result.error is read from the local struct after the release
+				// call above.
+				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
+					retryNeeded = true
+					// task will already be canceled
+					break out
+				}
+
+				if p.Content != "" {
+					fn(PredictResult{
+						Content: p.Content,
+					})
+				}
+
+				if p.Stop {
+					fn(PredictResult{
+						Done:               true,
+						PromptEvalCount:    p.Timings.PromptN,
+						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
+						EvalCount:          p.Timings.PredictedN,
+						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
+					})
+					return nil
+				}
+			}
+		}
+		// The inner loop is only left via `break out`, which sets retryNeeded
+		// first, so as written this branch is never taken.
+		if !retryNeeded {
+			return nil // success
+		}
+	}
+
+	// should never reach here ideally
+	return fmt.Errorf("max retries exceeded")
+}
+func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
+	return encode(llm, ctx, prompt)
+}
+
+// encode sends prompt to the server's tokenize entry point as a JSON
+// TokenizeRequest and returns the Tokens field of the decoded response.
+// ctx is accepted for interface symmetry but is not consulted here.
+func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
+	payload, err := json.Marshal(TokenizeRequest{Content: prompt})
+	if err != nil {
+		return nil, fmt.Errorf("marshaling encode data: %w", err)
+	}
+
+	// C-side buffers: the request string and the status/message holder.
+	cReq := C.CString(string(payload))
+	defer C.free(unsafe.Pointer(cReq))
+	status := newExtServerResp(128)
+	defer freeExtServerResp(status)
+
+	var cResp *C.char
+	llm.llama_server_tokenize(cReq, &cResp, &status)
+	if status.id < 0 {
+		return nil, extServerResponseToErr(status)
+	}
+	// The JSON response is released through the server once decoded.
+	defer llm.llama_server_release_json_resp(&cResp)
+
+	var out TokenizeResponse
+	raw := []byte(C.GoString(cResp))
+	if uerr := json.Unmarshal(raw, &out); uerr != nil {
+		return nil, fmt.Errorf("unmarshal encode response: %w", uerr)
+	}
+
+	return out.Tokens, nil
+}
+
+func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
+	return decode(llm, ctx, tokens)
+}
+
+// decode sends tokens to the server's detokenize entry point as a JSON
+// DetokenizeRequest and returns the Content field of the decoded response.
+// An empty token slice yields "" without calling the server. ctx is not
+// consulted here.
+func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
+	if len(tokens) == 0 {
+		return "", nil
+	}
+	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
+	if err != nil {
+		return "", fmt.Errorf("marshaling decode data: %w", err)
+	}
+
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_detokenize(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return "", extServerResponseToErr(resp)
+	}
+	defer llm.llama_server_release_json_resp(&json_resp)
+
+	var decoded DetokenizeResponse
+	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
+		// This is the decode path; the message previously said "encode".
+		return "", fmt.Errorf("unmarshal decode response: %w", err2)
+	}
+
+	return decoded.Content, nil
+}
+
+func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
+	return embedding(llm, ctx, input)
+}
+// embedding sends input to the server's embedding entry point (wrapped in a
+// JSON TokenizeRequest) and returns the Embedding field of the decoded
+// response. ctx is not consulted here.
+func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
+	data, err := json.Marshal(TokenizeRequest{Content: input})
+	if err != nil {
+		return nil, fmt.Errorf("error marshaling embed data: %w", err)
+	}
+
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_embedding(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
+	}
+	defer llm.llama_server_release_json_resp(&json_resp)
+
+	// Named embedResp so the local does not shadow this function's own name.
+	var embedResp EmbeddingResponse
+	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedResp); err != nil {
+		// This is the embedding path; the message previously said "tokenize".
+		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
+	}
+
+	return embedResp.Embedding, nil
+}
+
+// Close shuts this server down by delegating to the package-level close
+// helper, which stops the server and unlocks the package mutex.
+func (llm *llamaExtServer) Close() {
+	close(llm)
+}
+
+// close calls llama_server_stop on llm and then unlocks the package-level
+// mutex that newExtServer locks when a server is created.
+// Note: within package llm this declaration shadows Go's builtin close.
+func close(llm extServer) {
+	llm.llama_server_stop()
+	mutex.Unlock()
+}
--- a/llm/falcon.go
+++ b/llm/falcon.go
@@ -1,20 +0,0 @@
-package llm
-
-const (
-	falconModelType7B   = 32
-	falconModelType40B  = 60
-	falconModelType180B = 80
-)
-
-func falconModelType(numLayer uint32) string {
-	switch numLayer {
-	case 32:
-		return "7B"
-	case 60:
-		return "40B"
-	case 80:
-		return "180B"
-	default:
-		return "unknown"
-	}
-}
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -86,74 +86,6 @@ type container interface {
 	Decode(*readSeekOffset) (model, error)
 }

-type containerGGML struct{}
-
-func (c *containerGGML) Name() string {
-	return "ggml"
-}
-
-func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
-	// file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-	return nil, nil
-}
-
-type containerGGMF struct {
-	version uint32
-}
-
-func (c *containerGGMF) Name() string {
-	return "ggmf"
-}
-
-func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
-	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
-
-	switch version {
-	case 1:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
-	return nil, nil
-}
-
-type containerGGJT struct {
-	version uint32
-}
-
-func (c *containerGGJT) Name() string {
-	return "ggjt"
-}
-
-func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
-	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
-
-	switch version {
-	case 1, 2, 3:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-
-	// different model types may have different layouts for hyperparameters
-	var llama llamaModel
-	binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
-	return &llama, nil
-}
-
 type containerLORA struct {
 	version uint32
 }
@@ -194,6 +126,8 @@ const (
 	FILE_MAGIC_GGUF_BE = 0x47475546
 )

+var ErrUnsupportedFormat = errors.New("unsupported model format")
+
 func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	ro := readSeekOffset{ReadSeeker: r}

@@ -204,12 +138,8 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {

 	var c container
 	switch magic {
-	case FILE_MAGIC_GGML:
-		c = &containerGGML{}
-	case FILE_MAGIC_GGMF:
-		c = &containerGGMF{}
-	case FILE_MAGIC_GGJT:
-		c = &containerGGJT{}
+	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
+		return nil, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
 		c = &containerLORA{}
 	case FILE_MAGIC_GGUF_LE:
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -120,27 +120,6 @@ func (llm *ggufModel) ModelType() string {
 		return format.HumanNumber(llm.parameters)
 	}

-	switch llm.ModelFamily() {
-	case "llama":
-		if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
-			heads, headsOK := llm.kv["llama.head_count"].(uint32)
-			headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
-			if headsOK && headsKVsOK && heads/headKVs == 8 {
-				return "70B"
-			}
-
-			return llamaModelType(blocks)
-		}
-	case "falcon":
-		if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
-			return falconModelType(blocks)
-		}
-	case "starcoder":
-		if blocks, ok := llm.kv["starcoder.block_count"].(uint32); ok {
-			return starCoderModelType(blocks)
-		}
-	}
-
 	return "unknown"
 }

--- a/llm/llama.cpp/gen_common.sh
+++ b/llm/llama.cpp/gen_common.sh
@@ -0,0 +1,42 @@
+# common logic across linux and darwin
+
+init_vars() {
+    LLAMACPP_DIR=gguf
+    PATCHES="0001-Expose-callable-API-for-server.patch"
+    CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
+    # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
+    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
+    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
+    else
+        # TODO - add additional optimization flags...
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
+    fi
+}
+
+git_module_setup() {
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
+        echo "Skipping submodule initialization"
+        return
+    fi
+    git submodule init
+    git submodule update --force gguf
+
+}
+
+apply_patches() {
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
+        echo "Skipping submodule patching"
+        return
+    fi
+    # Workaround git apply not handling creation well for iteration
+    rm -f gguf/examples/server/server.h
+    for patch in ${PATCHES}; do
+        git -C gguf apply ../patches/${patch}
+    done
+}
+
+build() {
+    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
+    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
+}
--- a/llm/llama.cpp/gen_darwin.sh
+++ b/llm/llama.cpp/gen_darwin.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# This script is intended to run inside the go generate
+# working directory must be ../llm/llama.cpp
+
+# TODO - add hardening to detect missing tools (cmake, etc.)
+
+set -ex
+set -o pipefail
+echo "Starting darwin generate script"
+source $(dirname $0)/gen_common.sh
+init_vars
+CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/metal"
+case "${GOARCH}" in
+    "amd64")
+        CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 ${CMAKE_DEFS}"
+        ;;
+     "arm64")
+        CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}"
+        ;;
+    *)
+        echo "GOARCH must be set"
+        echo "this script is meant to be run from within go generate"
+        exit 1
+        ;;
+esac
+
+git_module_setup
+apply_patches
+build
--- a/llm/llama.cpp/gen_linux.sh
+++ b/llm/llama.cpp/gen_linux.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# This script is intended to run inside the go generate
+# working directory must be llm/llama.cpp
+
+# First we build our default built-in library which will be linked into the CGO
+# binary as a normal dependency. This default build is CPU based.
+#
+# Then we build a CUDA dynamic library (although statically linked with the CUDA
+# library dependencies for maximum portability)
+#
+# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  ROCm is particularly
+# important to be a dynamic lib even if it's the only GPU library detected because
+# we can't redistribute the object files but must rely on dynamic libraries at
+# runtime, which could prevent the server from starting if they are not present.
+
+set -ex
+set -o pipefail
+
+echo "Starting linux generate script"
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
+    export CUDACXX=/usr/local/cuda/bin/nvcc
+fi
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+OLLAMA_DYN_LIB_DIR="gguf/build/lib"
+source $(dirname $0)/gen_common.sh
+init_vars
+git_module_setup
+apply_patches
+
+mkdir -p ${OLLAMA_DYN_LIB_DIR}
+touch ${OLLAMA_DYN_LIB_DIR}/.generated
+
+#
+# CPU first for the default library
+#
+CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cpu"
+build
+
+if [ -d /usr/local/cuda/lib64/ ]; then
+    echo "CUDA libraries detected - building dynamic CUDA library"
+    init_vars
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    BUILD_DIR="gguf/build/cuda"
+    CUDA_LIB_DIR=/usr/local/cuda/lib64
+    build
+    gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/libcuda_server.so \
+        -Wl,--whole-archive \
+        ${BUILD_DIR}/examples/server/libext_server.a \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,--no-whole-archive \
+        ${CUDA_LIB_DIR}/libcudart_static.a \
+        ${CUDA_LIB_DIR}/libcublas_static.a \
+        ${CUDA_LIB_DIR}/libcublasLt_static.a \
+        ${CUDA_LIB_DIR}/libcudadevrt.a \
+        ${CUDA_LIB_DIR}/libculibos.a \
+        -lrt -lpthread -ldl -lstdc++ -lm
+fi
+
+if [ -z "${ROCM_PATH}" ]; then
+    # Try the default location in case it exists
+    ROCM_PATH=/opt/rocm
+fi
+
+if [ -z "${CLBlast_DIR}" ]; then
+    # Try the default location in case it exists
+    if [ -d /usr/lib/cmake/CLBlast ]; then
+        export CLBlast_DIR=/usr/lib/cmake/CLBlast
+    fi
+fi
+
+if [ -d "${ROCM_PATH}" ]; then  # ROCM_PATH falls back to /opt/rocm when unset
+    echo "ROCm libraries detected - building dynamic ROCm library"
+    init_vars
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
+    BUILD_DIR="gguf/build/rocm"
+    build
+    # Link against the same ROCm install that supplied the compilers above, not a hard-coded /opt/rocm
+    gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/librocm_server.so \
+        -Wl,--whole-archive \
+        ${BUILD_DIR}/examples/server/libext_server.a \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,--no-whole-archive -lrt -lpthread -ldl -lstdc++ -lm \
+        -L"${ROCM_PATH}/lib" -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
+        -Wl,-rpath,"${ROCM_PATH}/lib",-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
+        -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
+fi
--- a/llm/llama.cpp/gen_windows.ps1
+++ b/llm/llama.cpp/gen_windows.ps1
@@ -0,0 +1,93 @@
+#!powershell
+
+$ErrorActionPreference = "Stop"
+
+function init_vars {
+    $script:patches = @("0001-Expose-callable-API-for-server.patch")
+    $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-DLLAMA_K_QUANTS=on", "-DLLAMA_ACCELERATE=on", "-A","x64")
+    # -contains tests collection membership (whole-value equality for a string), so use a substring match to find -g anywhere in CGO_CFLAGS
+    if ($env:CGO_CFLAGS -clike "*-g*") {
+        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
+        $script:config = "RelWithDebInfo"
+    } else {
+        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
+        $script:config = "Release"
+    }
+}
+
+function git_module_setup {
+    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
+    & git submodule init
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    & git submodule update --force gguf
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+}
+
+function apply_patches {
+    rm -erroraction ignore -path "gguf/examples/server/server.h"
+    foreach ($patch in $script:patches) {
+        write-host "Applying patch $patch"
+        & git -C gguf apply ../patches/$patch
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    }
+}
+
+function build {
+    write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs"
+    & cmake --version
+    & cmake -S gguf -B $script:buildDir $script:cmakeDefs
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    write-host "building with: cmake --build $script:buildDir --config $script:config"
+    & cmake --build $script:buildDir --config $script:config
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+}
+
+function install {
+    rm -erroraction ignore -recurse -force -path $script:installDir
+    & cmake --install $script:buildDir --prefix $script:installDir --config $script:config
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+
+}
+
+init_vars
+git_module_setup
+apply_patches
+
+# first build CPU based
+$script:buildDir="gguf/build/wincpu"
+$script:installDir="gguf/build/wincpu/dist"
+
+build
+# install
+
+md gguf/build/lib -ea 0
+md gguf/build/wincpu/dist/lib -ea 0
+mv gguf/build/wincpu/bin/$script:config/ext_server_shared.dll gguf/build/wincpu/dist/lib/cpu_server.dll
+
+
+# Nope, this barfs on lots of symbol problems
+#mv gguf/build/wincpu/examples/server/$script:config/ext_server_shared.dll gguf/build/wincpu/dist/lib/cpu_server.lib
+# Nope: this needs lots of include paths to pull in things like msvcprt.lib and other deps
+# & cl.exe `
+#     gguf/build/wincpu/examples/server/$script:config/ext_server.lib `
+#     gguf/build/wincpu/common/$script:config/common.lib `
+#     gguf/build/wincpu/$script:config/llama.lib `
+#     gguf/build/wincpu/$script:config/ggml_static.lib `
+#     /link /DLL /DEF:cpu_server.def /NOENTRY /MACHINE:X64  /OUT:gguf/build/wincpu/dist/lib/cpu_server.dll
+# if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+
+# Then build cuda as a dynamically loaded library
+init_vars
+$script:buildDir="gguf/build/wincuda"
+$script:installDir="gguf/build/wincuda/dist"
+$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DBUILD_SHARED_LIBS=on")
+build
+install
+cp gguf/build/wincuda/dist/bin/ext_server_shared.dll gguf/build/lib/cuda_server.dll
+
+# TODO - more to do here to create a usable dll
+
+
+# TODO - implement ROCm support on windows
+md gguf/build/winrocm/lib -ea 0
+echo $null >> gguf/build/winrocm/lib/.generated
--- a/llm/llama.cpp/generate_darwin.go
+++ b/llm/llama.cpp/generate_darwin.go
@@ -0,0 +1,3 @@
+package llm
+
+//go:generate bash ./gen_darwin.sh
--- a/llm/llama.cpp/generate_darwin_amd64.go
+++ b/llm/llama.cpp/generate_darwin_amd64.go
@@ -1,18 +0,0 @@
-package llm
-
-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
--- a/llm/llama.cpp/generate_darwin_arm64.go
+++ b/llm/llama.cpp/generate_darwin_arm64.go
@@ -1,18 +0,0 @@
-package llm
-
-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/metal --target server --config Release
-//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build gguf/build/metal --target server --config Release
-//go:generate mv gguf/build/metal/bin/server gguf/build/metal/bin/ollama-runner
--- a/llm/llama.cpp/generate_linux.go
+++ b/llm/llama.cpp/generate_linux.go
@@ -1,26 +1,3 @@
 package llm

-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
-//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
-
-//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cuda --target server --config Release
-//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
-//go:generate cmake --build gguf/build/cuda --target server --config Release
-//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
+//go:generate bash ./gen_linux.sh
--- a/llm/llama.cpp/generate_windows.go
+++ b/llm/llama.cpp/generate_windows.go
@@ -1,24 +1,3 @@
 package llm

-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate cmd /c move gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe
-
-//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cuda --target server --config Release
-//go:generate cmd /c move ggml\build\cuda\bin\Release\server.exe ggml\build\cuda\bin\Release\ollama-runner.exe
-
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cuda --target server --config Release
-//go:generate cmd /c move gguf\build\cuda\bin\Release\server.exe gguf\build\cuda\bin\Release\ollama-runner.exe
+//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
--- a/llm/llama.cpp/ggml
+++ b/llm/llama.cpp/ggml
--- a/llm/llama.cpp/gguf
+++ b/llm/llama.cpp/gguf
--- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch
+++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch
@@ -0,0 +1,464 @@
+From 90c332fe2ef61149b38561d02836e66715df214d Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Mon, 13 Nov 2023 12:25:58 -0800
+Subject: [PATCH] Expose callable API for server
+
+This adds an extern "C" interface within the example server
+---
+ examples/server/CMakeLists.txt |  27 ++++
+ examples/server/server.cpp     | 280 +++++++++++++++++++++++++++++++++
+ examples/server/server.h       |  89 +++++++++++
+ ggml-cuda.cu                   |   1 +
+ 4 files changed, 397 insertions(+)
+ create mode 100644 examples/server/server.h
+
+diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
+index 859cd12..da2b9bf 100644
+--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
+@@ -11,3 +11,30 @@ if (WIN32)
+     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+ endif()
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET ext_server)
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+add_library(${TARGET} STATIC server.cpp)
+target_include_directories(${TARGET} PRIVATE ../../common)
+target_include_directories(${TARGET} PRIVATE ../..)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
+target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_definitions(${TARGET} PRIVATE
+    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+)
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(ext_server_shared SHARED $<TARGET_OBJECTS:ext_server>)
+    target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS ext_server_shared LIBRARY)
+endif()
+
+if (CUDAToolkit_FOUND)
+    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    if (WIN32)
+        target_link_libraries(ext_server_shared PRIVATE nvml)
+    endif()
+endif()
+\ No newline at end of file
+diff --git a/examples/server/server.cpp b/examples/server/server.cpp
+index 0403853..07fb05c 100644
+--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
+@@ -5,6 +5,9 @@
+ #include "../llava/clip.h"
+ 
+ #include "stb_image.h"
+#if defined(LLAMA_SERVER_LIBRARY)
+#include "server.h"
+#endif
+ 
+ #ifndef NDEBUG
+ // crash the server in debug mode, otherwise send an http 500 error
+@@ -2643,6 +2646,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
+     }
+ }
+ 
+#ifndef LLAMA_SERVER_LIBRARY
+ int main(int argc, char **argv)
+ {
+ #if SERVER_VERBOSE != 1
+@@ -3123,3 +3127,279 @@ int main(int argc, char **argv)
+     llama_backend_free();
+     return 0;
+ }
+
+#else // LLAMA_SERVER_LIBRARY
+// Expose the llama server as a callable extern "C" API
+llama_server_context *llama = NULL;
+std::atomic<bool> ext_server_running(false);
+std::thread ext_server_thread;
+
+void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err)
+{
+#if SERVER_VERBOSE != 1
+    LOG_TEE("disabling verbose llm logging\n");
+    log_disable();
+#endif
+    assert(err != NULL && sparams != NULL);
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        llama = new llama_server_context;
+        log_set_target(stdout);
+        gpt_params params;
+        params.n_ctx = sparams->n_ctx;
+        params.n_batch = sparams->n_batch;
+        if (sparams->n_threads > 0) {
+            params.n_threads = sparams->n_threads;
+        }
+        params.n_parallel = sparams->n_parallel;
+        params.rope_freq_base = sparams->rope_freq_base;
+        params.rope_freq_scale = sparams->rope_freq_scale;
+
+        if (sparams->memory_f16)  {
+            params.cache_type_k = "f16";
+            params.cache_type_v = "f16";
+        } else {
+            params.cache_type_k = "f32";
+            params.cache_type_v = "f32";
+        }
+
+        params.n_gpu_layers = sparams->n_gpu_layers;
+        params.main_gpu = sparams->main_gpu;
+        params.use_mlock = sparams->use_mlock;
+        params.use_mmap = sparams->use_mmap;
+        params.numa = sparams->numa;
+        params.embedding = sparams->embedding;
+        if (sparams->model != NULL) {
+            params.model = sparams->model;
+        }
+
+        for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) {
+            params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+        }
+
+        if (sparams->mmproj != NULL) {
+            params.mmproj = std::string(sparams->mmproj);
+        }
+           
+        llama_backend_init(params.numa);
+
+        // load the model
+        if (!llama->load_model(params))
+        {
+            // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages
+            // and pass them back to the caller for better UX
+            err->id = -1;
+            snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+            return;
+        }
+
+        llama->initialize();
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception initializing llama server");
+    }
+}
+
+void llama_server_start()
+{
+    assert(llama != NULL);
+     // TODO mutex to protect thread creation
+    ext_server_thread = std::thread([&]()
+    {
+        ext_server_running = true;
+        try {
+            LOG_TEE("llama server main loop starting\n");
+            ggml_time_init();
+            while (ext_server_running.load())
+            {
+                if (!llama->update_slots()) {
+                    LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n");
+                    break;
+                }
+            }
+        } catch (std::exception &e) {
+            LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
+        } catch (...) {
+            LOG_TEE("caught unknown exception in llama server main loop\n");
+        }
+        LOG_TEE("\nllama server shutting down\n");
+        llama_backend_free();
+    });
+}
+
+void llama_server_stop() {
+    assert(llama != NULL);
+    // TODO - too verbose, remove once things are solid
+    LOG_TEE("requesting llama server shutdown\n");
+    ext_server_running = false; // signals the main loop started by llama_server_start to exit
+    if (ext_server_thread.joinable()) ext_server_thread.join(); // join() throws std::system_error if start was never called
+    delete llama;
+    llama = NULL;
+    LOG_TEE("llama server shutdown complete\n");
+}
+
+void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
+    assert(llama != NULL && json_req != NULL && resp != NULL);
+    resp->id = -1;
+    resp->msg[0] = '\0';
+    try {
+        json data = json::parse(json_req);
+        resp->id = llama->request_completion(data, false, false, -1);
+    } catch (std::exception &e) {
+        snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
+    } catch (...) {
+        snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
+    }
+}
+
+void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *resp) {
+    assert(llama != NULL && resp != NULL);
+    std::string msg;
+    resp->id = -1;
+    resp->stop = false;
+    resp->error = false;
+    resp->json_resp = NULL;
+    std::string result_json;
+    try {
+        task_result result = llama->next_result(task_id);
+        result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+        resp->id = result.id;
+        resp->stop = result.stop;
+        resp->error = result.error;
+        if (result.error) {
+            llama->request_cancel(task_id);
+        } else if (result.stop) {
+            llama->request_cancel(task_id);
+        }
+    } catch (std::exception &e) {
+        resp->error = true;
+        resp->id = -1;
+        result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
+    } catch (...) {
+        resp->error = true;
+        resp->id = -1;
+        result_json = "{\"error\":\"Unknown exception during completion\"}";
+    }
+    const std::string::size_type size = result_json.size() + 1;
+    resp->json_resp = new char[size];
+    snprintf(resp->json_resp, size, "%s", result_json.c_str());
+}
+
+void llama_server_release_task_result(ext_server_task_result_t *result) {
+    if (result == NULL || result->json_resp == NULL) {
+        return;
+    }
+    delete[] result->json_resp;
+}
+
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
+    assert(llama != NULL && err != NULL);
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        llama->request_cancel(task_id);
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception completion cancel in llama server");
+    }
+}
+
+void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) {
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL;
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json body = json::parse(json_req);
+        std::vector<llama_token> tokens;
+        if (body.count("content") != 0)
+        {
+            tokens = llama->tokenize(body["content"], false);
+        }
+        const json data = format_tokenizer_response(tokens);
+        std::string result_json = data.dump();
+        const std::string::size_type size = result_json.size() + 1;
+        *json_resp = new char[size];
+        snprintf(*json_resp, size, "%s", result_json.c_str());
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
+    }
+}
+
+void llama_server_release_json_resp(char **json_resp) {
+    if (json_resp == NULL || *json_resp == NULL) {
+        return;
+    }
+    delete[] *json_resp;
+}
+
+void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) {
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL;
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json body = json::parse(json_req);
+        std::string content;
+        if (body.count("tokens") != 0)
+        {
+            const std::vector<llama_token> tokens = body["tokens"];
+            content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
+        }
+        const json data = format_detokenized_response(content);
+        std::string result_json = data.dump();
+        const std::string::size_type size = result_json.size() + 1;
+        *json_resp = new char[size];
+        snprintf(*json_resp, size, "%s", result_json.c_str());
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
+    }
+}
+
+void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err) {
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL;
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json body = json::parse(json_req);
+        json prompt;
+        if (body.count("content") != 0)
+        {
+            prompt = body["content"];
+        }
+        else
+        {
+            prompt = "";
+        }
+        const int task_id = llama->request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
+        task_result result = llama->next_result(task_id);
+        std::string result_json = result.result_json.dump();
+        const std::string::size_type size = result_json.size() + 1;
+        *json_resp = new char[size];
+        snprintf(*json_resp, size, "%s", result_json.c_str());
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
+    }
+}
+
+#endif // LLAMA_SERVER_LIBRARY
+\ No newline at end of file
+diff --git a/examples/server/server.h b/examples/server/server.h
+new file mode 100644
+index 0000000..d22f1b6
+--- /dev/null
+++ b/examples/server/server.h
+@@ -0,0 +1,89 @@
+#if defined(LLAMA_SERVER_LIBRARY)
+#ifndef LLAMA_SERVER_H
+#define LLAMA_SERVER_H
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+// This exposes extern C entrypoints into the llama_server 
+// To enable the server compile with LLAMA_SERVER_LIBRARY
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+    typedef struct ext_server_resp {
+        int id; // < 0 on error
+        size_t msg_len; // caller must allocate msg and set msg_len
+        char *msg;
+    } ext_server_resp_t;
+
+    // Allocated and freed by caller
+    typedef struct ext_server_lora_adapter {
+        char *adapter;
+        float scale;
+        struct ext_server_lora_adapter *next;
+    } ext_server_lora_adapter_t;
+
+    // Allocated and freed by caller
+    typedef struct ext_server_params
+    {
+        char *model;            
+        uint32_t n_ctx;         // text context, 0 = from model
+        uint32_t n_batch;       // prompt processing maximum batch size
+        uint32_t n_threads;     // number of threads to use for generation
+        int32_t n_parallel;     // number of parallel sequences to decode
+        float rope_freq_base;   // RoPE base frequency, 0 = from model
+        float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        bool memory_f16;        // use f16 instead of f32 for memory kv
+        int32_t n_gpu_layers;   // number of layers to store in VRAM (-1 - use default)
+        int32_t main_gpu;       // the GPU that is used for scratch and small tensors
+        bool use_mlock;         // force system to keep model in RAM
+        bool use_mmap;          // use mmap if possible
+        bool numa;              // attempt optimizations that help on some NUMA systems
+        bool embedding;         // get only sentence embedding
+        ext_server_lora_adapter_t* lora_adapters;
+        char *mmproj;
+    } ext_server_params_t;
+
+    typedef struct ext_server_task_result
+    {
+        int id;
+        bool stop;
+        bool error;
+        char* json_resp; // null terminated, memory managed by ext_server
+    } ext_server_task_result_t;
+
+    // Initialize the server once per process
+    // err->id = 0 for success and err->msg[0] = '\0'
+    // err->id != 0 for failure, and err->msg contains error message
+    void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
+
+    // Run the main loop, called once per init
+    void llama_server_start();
+    // Stop the main loop and free up resources allocated in init and start.  Init must be called again to reuse
+    void llama_server_stop();
+
+    // json_req null terminated string, memory managed by caller
+    // resp->id >= 0 on success (task ID)
+    // resp->id < 0 on error, and resp->msg contains error message
+    void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
+
+    // Caller must call llama_server_release_task_result to free resp->json_resp
+    void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *result);
+    void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
+    void llama_server_release_task_result(ext_server_task_result_t *result);
+
+    // Caller must call llama_server_release_json_resp to free json_resp on success (err.id == 0)
+    void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+    void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+    void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err);
+    void llama_server_release_json_resp(char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+#endif // LLAMA_SERVER_LIBRARY
+\ No newline at end of file
+diff --git a/ggml-cuda.cu b/ggml-cuda.cu
+index f20846f..9640cf3 100644
+--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
+@@ -6757,6 +6757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
+         CUDA_CHECK(cudaGetDevice(&id));
+         src_ptr = (char *) extra->data_device[id];
+     } else {
+        fprintf(stderr, "ggml_cuda_cpy_tensor_2d assert: backend: %d\n", src->backend);
+         GGML_ASSERT(false);
+     }
+     char * dst_ptr = (char *) dst;
+-- 
+2.39.3 (Apple Git-145)
+
--- a/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
+++ b/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
@@ -1,51 +0,0 @@
-From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
-From: Bruce MacDonald <brucewmacdonald@gmail.com>
-Date: Mon, 28 Aug 2023 18:08:12 -0400
-Subject: [PATCH] add detokenize endpoint
-
---
- examples/server/server.cpp | 21 +++++++++++++++++++++
- 1 file changed, 21 insertions(+)
-
-diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 9966045..5014691 100644
--- a/examples/server/server.cpp
-+++ b/examples/server/server.cpp
-@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
-         {"tokens", tokens}};
- }
- 
-+static json format_detokenized_response(std::string content)
-+{
-+    return json{
-+        {"content", content}};
-+}
-+
- static void parse_options_completion(const json &body, llama_server_context &llama)
- {
-     gpt_params default_params;
-@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
-         const json data = format_tokenizer_response(tokens);
-         return res.set_content(data.dump(), "application/json"); });
- 
-+    svr.Post("/detokenize", [&llama](const Request &req, Response &res)
-+             {
-+        auto lock = llama.lock();
-+
-+        const json body = json::parse(req.body);
-+        std::string content;
-+        if (body.count("tokens") != 0)
-+        {
-+            const std::vector<llama_token> tokens = body["tokens"];
-+            content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
-+        }
-+
-+        const json data = format_detokenized_response(content);
-+        return res.set_content(data.dump(), "application/json"); });
-+
-     svr.Post("/embedding", [&llama](const Request &req, Response &res)
-              {
-         auto lock = llama.lock();
-- 
-2.39.2 (Apple Git-143)
-
--- a/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
+++ b/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
@@ -1,27 +0,0 @@
-From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Wed, 20 Sep 2023 14:19:52 -0700
-Subject: [PATCH] copy cuda runtime libraries
-
---
- CMakeLists.txt | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
-             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
-         endif()
- 
-+        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
-+        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
-+        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
-+
-     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-         # 52 == lowest CUDA 12 standard
-         # 60 == f16 CUDA intrinsics
-- 
-2.42.0
-
--- a/llm/llama.cpp/patches/0001-update-default-log-target.patch
+++ b/llm/llama.cpp/patches/0001-update-default-log-target.patch
@@ -1,25 +0,0 @@
-From 6465fec6290f0a7f5d4d0fbe6bcf634e4810dde6 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 23 Oct 2023 10:39:34 -0700
-Subject: [PATCH] default log stderr
-
---
- common/log.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/common/log.h b/common/log.h
-index b8953fd..25522cd 100644
--- a/common/log.h
-+++ b/common/log.h
-@@ -90,7 +90,7 @@
- //  }
- //
- #ifndef LOG_TARGET
-    #define LOG_TARGET log_handler()
-+    #define LOG_TARGET nullptr
- #endif
- 
- #ifndef LOG_TEE_TARGET
-- 
-2.42.0
-
--- a/llm/llama.cpp/patches/0002-34B-model-support.patch
+++ b/llm/llama.cpp/patches/0002-34B-model-support.patch
@@ -1,89 +0,0 @@
-From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
-From: Bruce MacDonald <brucewmacdonald@gmail.com>
-Date: Mon, 28 Aug 2023 18:08:53 -0400
-Subject: [PATCH] 34B model support
-
---
- llama.cpp | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/llama.cpp b/llama.cpp
-index f2cbe76..62c5cdf 100644
--- a/llama.cpp
-+++ b/llama.cpp
-@@ -79,6 +79,7 @@ enum e_model {
-     MODEL_7B,
-     MODEL_13B,
-     MODEL_30B,
-+    MODEL_34B,
-     MODEL_65B,
-     MODEL_70B,
- };
-@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
-         { MODEL_7B,   ((size_t) n_ctx / 16ull + 100ull) * MB },
-         { MODEL_13B,  ((size_t) n_ctx / 12ull + 120ull) * MB },
-         { MODEL_30B,  ((size_t) n_ctx /  9ull + 160ull) * MB },
-+        { MODEL_34B,  ((size_t) n_ctx / 9ull + 160ull) * MB },
-         { MODEL_65B,  ((size_t) n_ctx /  6ull + 256ull) * MB }, // guess
-         { MODEL_70B,  ((size_t) n_ctx /  7ull + 164ull) * MB },
-     };
-@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
-         { MODEL_7B,  160ull * MB },
-         { MODEL_13B, 192ull * MB },
-         { MODEL_30B, 256ull * MB },
-+        { MODEL_34B, 256ull * MB },
-         { MODEL_65B, 384ull * MB }, // guess
-         { MODEL_70B, 304ull * MB },
-     };
-@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
-         { MODEL_7B,  10ull * MB },
-         { MODEL_13B, 12ull * MB },
-         { MODEL_30B, 16ull * MB },
-+        { MODEL_34B, 16ull * MB },
-         { MODEL_65B, 24ull * MB }, // guess
-         { MODEL_70B, 24ull * MB },
-     };
-@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
-         { MODEL_7B,   512ull * kB },
-         { MODEL_13B,  640ull * kB },
-         { MODEL_30B,  768ull * kB },
-+        { MODEL_34B,  768ull * kB },
-         { MODEL_65B, 1280ull * kB },
-         { MODEL_70B, 1280ull * kB },
-     };
-@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
-         { MODEL_7B,  128ull },
-         { MODEL_13B, 160ull },
-         { MODEL_30B, 208ull },
-+        { MODEL_34B, 208ull },
-         { MODEL_65B, 256ull },
-         { MODEL_70B, 256ull },
-     };
-@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
-         case MODEL_7B: return "7B";
-         case MODEL_13B: return "13B";
-         case MODEL_30B: return "30B";
-+        case MODEL_34B: return "34B";
-         case MODEL_65B: return "65B";
-         case MODEL_70B: return "70B";
-         default: LLAMA_ASSERT(false);
-@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
-             case 26: model.type = e_model::MODEL_3B; break;
-             case 32: model.type = e_model::MODEL_7B; break;
-             case 40: model.type = e_model::MODEL_13B; break;
-+            case 48: model.type = e_model::MODEL_34B; break;
-             case 60: model.type = e_model::MODEL_30B; break;
-             case 80: model.type = e_model::MODEL_65B; break;
-             default:
-@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
-             LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-             model.type = e_model::MODEL_70B;
-             hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
-+        } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
-+            hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
-         }
- 
-         hparams.rope_freq_base  = rope_freq_base;
-- 
-2.39.2 (Apple Git-143)
-
--- a/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+++ b/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
@@ -1,30 +0,0 @@
-From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
-From: Shouzheng Liu <lshzh.hi@gmail.com>
-Date: Mon, 21 Aug 2023 06:59:29 -0400
-Subject: [PATCH] metal : fix synchronization in new matrix multiplication
- kernel (#2686)
-
---
- ggml-metal.metal | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
-diff --git a/ggml-metal.metal b/ggml-metal.metal
-index 3f31252..88d48f6 100644
--- a/ggml-metal.metal
-+++ b/ggml-metal.metal
-@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const  uchar * src0,
-         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
-                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-         for (int i = 0; i < 8; i++) {
-+            threadgroup_barrier(mem_flags::mem_device);
-             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-         }
- 
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-+        threadgroup_barrier(mem_flags::mem_device);
-         device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-         if (sgitg==0) {
-             for (int i = 0; i < n_rows; i++) {
-- 
-2.41.0
-
--- a/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+++ b/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
@@ -1,41 +0,0 @@
-From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
-From: Shouzheng Liu <lshzh.hi@gmail.com>
-Date: Tue, 22 Aug 2023 02:18:40 -0400
-Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
-
---
- ggml-metal.metal | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/ggml-metal.metal b/ggml-metal.metal
-index 88d48f6..ce3541f 100644
--- a/ggml-metal.metal
-+++ b/ggml-metal.metal
-@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const  uchar * src0,
-         //load data and store to threadgroup memory
-         half4x4 temp_a;
-         dequantize_func(x, il, temp_a);
-+        threadgroup_barrier(mem_flags::mem_threadgroup);
-         #pragma unroll(16)
-         for (int i = 0; i < 16; i++) {
-             *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const  uchar * src0,
-         }
-     } else {
-         // block is smaller than 64x32, we should avoid writing data outside of the matrix
-+        threadgroup_barrier(mem_flags::mem_threadgroup);
-         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
-                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-         for (int i = 0; i < 8; i++) {
-            threadgroup_barrier(mem_flags::mem_device);
-             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-         }
- 
-        threadgroup_barrier(mem_flags::mem_device);
-+        threadgroup_barrier(mem_flags::mem_threadgroup);
-         device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-         if (sgitg==0) {
-             for (int i = 0; i < n_rows; i++) {
-- 
-2.41.0
-
--- a/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+++ b/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
@@ -1,32 +0,0 @@
-From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
-From: Kylin <56434533+KyL0N@users.noreply.github.com>
-Date: Tue, 22 Aug 2023 15:14:23 +0800
-Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
-
-* ggml: support CUDA's half type for aarch64(#1455)
-support CUDA's half type for aarch64 in ggml_fp16_t definition
-
-* ggml: use __CUDACC__ to recognise nvcc compiler
---
- ggml.h | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/ggml.h b/ggml.h
-index 544ad2d..0ec7ec5 100644
--- a/ggml.h
-+++ b/ggml.h
-@@ -259,8 +259,9 @@
- extern "C" {
- #endif
- 
-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
-+#if defined(__ARM_NEON) && defined(__CUDACC__)
-+    typedef half ggml_fp16_t;
-+#elif defined(__ARM_NEON)
-     typedef __fp16 ggml_fp16_t;
- #else
-     typedef uint16_t ggml_fp16_t;
-- 
-2.39.2 (Apple Git-143)
-
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -1,25 +1,16 @@
 package llm

 import (
-	"bufio"
 	"bytes"
 	"context"
-	"embed"
-	"encoding/json"
+	_ "embed"
 	"errors"
 	"fmt"
 	"io"
 	"io/fs"
-	"log"
-	"math/rand"
-	"net/http"
 	"os"
 	"os/exec"
-	"path"
 	"path/filepath"
-	"runtime"
-	"strconv"
-	"strings"
 	"sync"
 	"time"

@@ -55,109 +46,6 @@ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
 ws ::= ([ \t\n] ws)?
 `

-//go:embed llama.cpp/*/build/*/bin/*
-var llamaCppEmbed embed.FS
-
-type ModelRunner struct {
-	Type        string // "gguf" or "ggml"
-	Path        string // path to the model runner executable
-	Accelerated bool
-}
-
-func chooseRunners(workDir, runnerType string) []ModelRunner {
-	buildPath := path.Join("llama.cpp", runnerType, "build")
-	var runners []ModelRunner
-
-	// set the runners based on the OS
-	// IMPORTANT: the order of the runners in the array is the priority order
-	switch runtime.GOOS {
-	case "darwin":
-		if runtime.GOARCH == "arm64" {
-			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
-		} else {
-			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
-		}
-	case "linux":
-		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
-		}
-	case "windows":
-		// TODO: select windows GPU runner here when available
-		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
-		}
-	default:
-		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
-		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
-		}
-	}
-
-	runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
-	for _, r := range runners {
-		// find all the files in the runner's bin directory
-		files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r.Path), "*"))
-		if err != nil {
-			// this is expected, ollama may be compiled without all runners packed in
-			log.Printf("%s runner not found: %v", r.Path, err)
-			continue
-		}
-
-		for _, f := range files {
-			runnerAvailable = true
-
-			srcFile, err := llamaCppEmbed.Open(f)
-			if err != nil {
-				log.Fatalf("read llama runner %s: %v", f, err)
-			}
-			defer srcFile.Close()
-
-			// create the directory in case it does not exist, filepath.Dir() converts the file path to the OS's format
-			destPath := filepath.Join(workDir, filepath.Dir(f))
-			if err := os.MkdirAll(destPath, 0o755); err != nil {
-				log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
-			}
-
-			// create the path to the destination file, filepath.Base() converts the file path to the OS's format
-			destFile := filepath.Join(destPath, filepath.Base(f))
-
-			_, err = os.Stat(destFile)
-			switch {
-			case errors.Is(err, os.ErrNotExist):
-				destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-				if err != nil {
-					log.Fatalf("write llama runner %s: %v", f, err)
-				}
-				defer destFile.Close()
-
-				if _, err := io.Copy(destFile, srcFile); err != nil {
-					log.Fatalf("copy llama runner %s: %v", f, err)
-				}
-			case err != nil:
-				log.Fatalf("stat llama runner %s: %v", f, err)
-			}
-		}
-	}
-	if !runnerAvailable {
-		log.Fatalf("%s runner not found", runnerType)
-	}
-
-	// return the runners to try in priority order
-	localRunnersByPriority := []ModelRunner{}
-	for _, r := range runners {
-		// clean the ModelRunner paths so that they match the OS we are running on
-		localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
-			Type:        r.Type,
-			Path:        filepath.Clean(path.Join(workDir, r.Path)),
-			Accelerated: r.Accelerated,
-		})
-	}
-
-	return localRunnersByPriority
-}
-
 type llamaModel struct {
 	hyperparameters llamaHyperparameters
 }
@@ -223,82 +111,17 @@ type Running struct {
 	*StatusWriter            // captures error messages from the llama runner process
 }

-type llama struct {
-	api.Options
-	Running
+type ImageData struct {
+	Data []byte `json:"data"`
+	ID   int    `json:"id"`
 }

 var (
 	errNvidiaSMI     = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
 	errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
+	payloadMissing   = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
 )

-// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (int64, error) {
-	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
-	var stdout bytes.Buffer
-	cmd.Stdout = &stdout
-	err := cmd.Run()
-	if err != nil {
-		return 0, errNvidiaSMI
-	}
-
-	var freeMiB int64
-	scanner := bufio.NewScanner(&stdout)
-	for scanner.Scan() {
-		line := scanner.Text()
-		if strings.Contains(line, "[Insufficient Permissions]") {
-			return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi")
-		}
-
-		vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
-		if err != nil {
-			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
-		}
-
-		freeMiB += vram
-	}
-
-	freeBytes := freeMiB * 1024 * 1024
-	if freeBytes < 2*format.GigaByte {
-		log.Printf("less than 2 GB VRAM available")
-		return 0, errAvailableVRAM
-	}
-
-	return freeBytes, nil
-}
-
-func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
-	if opts.NumGPU != -1 {
-		return opts.NumGPU
-	}
-	if runtime.GOOS == "linux" || runtime.GOOS == "windows" {
-		freeBytes, err := CheckVRAM()
-		if err != nil {
-			if !errors.Is(err, errNvidiaSMI) {
-				log.Print(err.Error())
-			}
-			// nvidia driver not installed or no nvidia GPU found
-			return 0
-		}
-
-		/*
-		 Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
-		 We can store the model weights and the kv cache in vram,
-		 to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file.
-		*/
-		bytesPerLayer := fileSizeBytes / numLayer
-
-		// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
-		layers := int(freeBytes/bytesPerLayer) * 3 / 4
-		log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)
-
-		return layers
-	}
-	// default to enable metal on macOS
-	return 1
-}
-
 // StatusWriter is a writer that captures error messages from the llama runner process
 type StatusWriter struct {
 	ErrCh      chan error
@@ -327,207 +150,6 @@ func (w *StatusWriter) Write(b []byte) (int, error) {
 	return os.Stderr.Write(b)
 }

-func newLlama(model string, adapters, projectors []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
-	fileInfo, err := os.Stat(model)
-	if err != nil {
-		return nil, err
-	}
-
-	if len(adapters) > 1 {
-		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
-	}
-
-	numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
-	params := []string{
-		"--model", model,
-		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
-		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
-		"--embedding",
-	}
-
-	if opts.MainGPU > 0 {
-		params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU))
-	}
-
-	if opts.RopeFrequencyBase > 0 {
-		params = append(params, "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase))
-	}
-
-	if opts.RopeFrequencyScale > 0 {
-		params = append(params, "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale))
-	}
-
-	if opts.NumGQA > 0 {
-		params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
-	}
-
-	if len(adapters) > 0 {
-		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
-		params = append(params, "--lora", adapters[0])
-	}
-
-	if len(projectors) > 0 {
-		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
-		params = append(params, "--mmproj", projectors[0])
-	}
-
-	if opts.NumThread > 0 {
-		params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
-	}
-
-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-	if opts.UseMLock {
-		params = append(params, "--mlock")
-	}
-	if !opts.UseMMap {
-		params = append(params, "--no-mmap")
-	}
-	if opts.UseNUMA {
-		params = append(params, "--numa")
-	}
-
-	var runnerErr error
-
-	// start the llama.cpp server with a retry in case the port is already in use
-	for _, runner := range runners {
-		if runner.Accelerated && numGPU == 0 {
-			log.Printf("skipping accelerated runner because num_gpu=0")
-			continue
-		}
-
-		if _, err := os.Stat(runner.Path); err != nil {
-			log.Printf("llama runner not found: %v", err)
-			continue
-		}
-
-		port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-		params := append(params, "--port", strconv.Itoa(port))
-
-		if runner.Type == "gguf" {
-			params = append(params, "--parallel", "2")
-		}
-
-		ctx, cancel := context.WithCancel(context.Background())
-		cmd := exec.CommandContext(
-			ctx,
-			runner.Path,
-			params...,
-		)
-
-		var libraryPaths []string
-		if libraryPath, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
-			libraryPaths = append(libraryPaths, libraryPath)
-		}
-
-		libraryPaths = append(libraryPaths, filepath.Dir(runner.Path))
-
-		cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", strings.Join(libraryPaths, ":")))
-		cmd.Stdout = os.Stderr
-		statusWriter := NewStatusWriter()
-		cmd.Stderr = statusWriter
-
-		llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel, exitCh: make(chan error)}}
-
-		log.Print("starting llama runner")
-		if err := llm.Cmd.Start(); err != nil {
-			log.Printf("error starting the external llama runner: %v", err)
-			continue
-		}
-
-		// monitor the llama runner process and signal when it exits
-		go func() {
-			err := llm.Cmd.Wait()
-			// default to printing the exit message of the command process, it will probably just say 'exit staus 1'
-			errMsg := err.Error()
-			// try to set a better error message if llama runner logs captured an error
-			if statusWriter.LastErrMsg != "" {
-				errMsg = statusWriter.LastErrMsg
-			}
-			log.Println(errMsg)
-			// llm.Cmd.Wait() can only be called once, use this exit channel to signal that the process has exited
-			llm.exitOnce.Do(func() {
-				close(llm.exitCh)
-			})
-		}()
-
-		if err := waitForServer(llm); err != nil {
-			log.Printf("error starting llama runner: %v", err)
-			llm.Close()
-
-			// default the runnerErr to the error returned by the most recent llama runner process
-			runnerErr = err
-
-			// capture the error directly from the runner process, if any
-			select {
-			case runnerErr = <-statusWriter.ErrCh:
-			default:
-				// the runner process probably timed out
-			}
-
-			// try again
-			continue
-		}
-
-		// server started successfully
-		return llm, nil
-	}
-
-	if runnerErr != nil {
-		// this is the error returned from the llama runner process that failed most recently
-		return nil, runnerErr
-	}
-
-	return nil, fmt.Errorf("failed to start a llama runner")
-}
-
-func waitForServer(llm *llama) error {
-	start := time.Now()
-	expiresAt := time.Now().Add(3 * time.Minute) // be generous with timeout, large models can take a while to load
-	ticker := time.NewTicker(200 * time.Millisecond)
-	defer ticker.Stop()
-
-	log.Print("waiting for llama runner to start responding")
-	for {
-		select {
-		case <-llm.exitCh:
-			// failed to start subprocess
-			return fmt.Errorf("llama runner process has terminated")
-		case <-ticker.C:
-			if time.Now().After(expiresAt) {
-				// timeout
-				return fmt.Errorf("timed out waiting for llama runner to start")
-			}
-
-			if err := llm.Ping(context.Background()); err == nil {
-				// success
-				log.Printf("llama runner started in %f seconds", time.Since(start).Seconds())
-				return nil
-			}
-		}
-	}
-}
-
-func (llm *llama) Close() {
-	// signal the sub-process to terminate
-	llm.Cancel()
-
-	// wait for the command to exit to prevent race conditions with the next run
-	<-llm.exitCh
-
-	if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
-		log.Printf("llama runner stopped with error: %v", llm.StatusWriter.LastErrMsg)
-	} else {
-		log.Print("llama runner stopped successfully")
-	}
-}
-
-func (llm *llama) SetOptions(opts api.Options) {
-	llm.Options = opts
-}
-
 type prediction struct {
 	Content string `json:"content"`
 	Model   string `json:"model"`
@@ -543,18 +165,16 @@ type prediction struct {
 }

 const maxBufferSize = 512 * format.KiloByte
+const maxRetries = 3
+const retryDelay = 1 * time.Second

 type PredictOpts struct {
-	Prompt           string
-	Format           string
-	CheckpointStart  time.Time
-	CheckpointLoaded time.Time
+	Prompt string
+	Format string
+	Images []api.ImageData
 }

 type PredictResult struct {
-	CreatedAt          time.Time
-	TotalDuration      time.Duration
-	LoadDuration       time.Duration
 	Content            string
 	Done               bool
 	PromptEvalCount    int
@@ -563,128 +183,6 @@ type PredictResult struct {
 	EvalDuration       time.Duration
 }

-func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
-	request := map[string]any{
-		"prompt":            predict.Prompt,
-		"stream":            true,
-		"n_predict":         llm.NumPredict,
-		"n_keep":            llm.NumKeep,
-		"main_gpu":          llm.MainGPU,
-		"temperature":       llm.Temperature,
-		"top_k":             llm.TopK,
-		"top_p":             llm.TopP,
-		"tfs_z":             llm.TFSZ,
-		"typical_p":         llm.TypicalP,
-		"repeat_last_n":     llm.RepeatLastN,
-		"repeat_penalty":    llm.RepeatPenalty,
-		"presence_penalty":  llm.PresencePenalty,
-		"frequency_penalty": llm.FrequencyPenalty,
-		"mirostat":          llm.Mirostat,
-		"mirostat_tau":      llm.MirostatTau,
-		"mirostat_eta":      llm.MirostatEta,
-		"penalize_nl":       llm.PenalizeNewline,
-		"seed":              llm.Seed,
-		"stop":              llm.Stop,
-	}
-
-	if predict.Format == "json" {
-		request["grammar"] = jsonGrammar
-	}
-
-	// Handling JSON marshaling with special characters unescaped.
-	buffer := &bytes.Buffer{}
-	enc := json.NewEncoder(buffer)
-	enc.SetEscapeHTML(false)
-
-	if err := enc.Encode(request); err != nil {
-		return fmt.Errorf("failed to marshal data: %v", err)
-	}
-
-	endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
-	if err != nil {
-		return fmt.Errorf("error creating POST request: %v", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return fmt.Errorf("POST predict: %v", err)
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode >= 400 {
-		bodyBytes, err := io.ReadAll(resp.Body)
-		if err != nil {
-			return fmt.Errorf("failed reading llm error response: %w", err)
-		}
-		log.Printf("llm predict error: %s", bodyBytes)
-		return fmt.Errorf("%s", bodyBytes)
-	}
-
-	scanner := bufio.NewScanner(resp.Body)
-	// increase the buffer size to avoid running out of space
-	buf := make([]byte, 0, maxBufferSize)
-	scanner.Buffer(buf, maxBufferSize)
-	for scanner.Scan() {
-		select {
-		case <-ctx.Done():
-			// This handles the request cancellation
-			return ctx.Err()
-		default:
-			line := scanner.Bytes()
-			if len(line) == 0 {
-				continue
-			}
-
-			evt, ok := bytes.CutPrefix(line, []byte("data: "))
-			if !ok {
-				return fmt.Errorf("error parsing llm response stream: %s", line)
-			}
-
-			var p prediction
-			if err := json.Unmarshal(evt, &p); err != nil {
-				return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
-			}
-
-			if p.Content != "" {
-				fn(PredictResult{
-					CreatedAt: time.Now().UTC(),
-					Content:   p.Content,
-				})
-			}
-
-			if p.Stop {
-				fn(PredictResult{
-					CreatedAt:     time.Now().UTC(),
-					TotalDuration: time.Since(predict.CheckpointStart),
-
-					Done:               true,
-					PromptEvalCount:    p.Timings.PromptN,
-					PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
-					EvalCount:          p.Timings.PredictedN,
-					EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
-				})
-				return nil
-			}
-		}
-	}
-
-	if err := scanner.Err(); err != nil {
-		if strings.Contains(err.Error(), "unexpected EOF") {
-			// this means the llama runner subprocess crashed
-			llm.Close()
-			if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
-				return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg)
-			}
-			return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model")
-		}
-		return fmt.Errorf("error reading llm response: %v", err)
-	}
-
-	return nil
-}
-
 type TokenizeRequest struct {
 	Content string `json:"content"`
 }
@@ -693,43 +191,6 @@ type TokenizeResponse struct {
 	Tokens []int `json:"tokens"`
 }

-func (llm *llama) Encode(ctx context.Context, prompt string) ([]int, error) {
-	endpoint := fmt.Sprintf("http://127.0.0.1:%d/tokenize", llm.Port)
-	data, err := json.Marshal(TokenizeRequest{Content: prompt})
-	if err != nil {
-		return nil, fmt.Errorf("marshaling encode data: %w", err)
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
-	if err != nil {
-		return nil, fmt.Errorf("encode request: %w", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return nil, fmt.Errorf("do encode request: %w", err)
-	}
-	defer resp.Body.Close()
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, fmt.Errorf("read encode request: %w", err)
-	}
-
-	if resp.StatusCode >= 400 {
-		log.Printf("llm encode error: %s", body)
-		return nil, fmt.Errorf("%s", body)
-	}
-
-	var encoded TokenizeResponse
-	if err := json.Unmarshal(body, &encoded); err != nil {
-		return nil, fmt.Errorf("unmarshal encode response: %w", err)
-	}
-
-	return encoded.Tokens, nil
-}
-
 type DetokenizeRequest struct {
 	Tokens []int `json:"tokens"`
 }
@@ -738,46 +199,6 @@ type DetokenizeResponse struct {
 	Content string `json:"content"`
 }

-func (llm *llama) Decode(ctx context.Context, tokens []int) (string, error) {
-	if len(tokens) == 0 {
-		return "", nil
-	}
-	endpoint := fmt.Sprintf("http://127.0.0.1:%d/detokenize", llm.Port)
-	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
-	if err != nil {
-		return "", fmt.Errorf("marshaling decode data: %w", err)
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
-	if err != nil {
-		return "", fmt.Errorf("decode request: %w", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return "", fmt.Errorf("do decode request: %w", err)
-	}
-	defer resp.Body.Close()
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return "", fmt.Errorf("read decode request: %w", err)
-	}
-
-	if resp.StatusCode >= 400 {
-		log.Printf("llm decode error: %s", body)
-		return "", fmt.Errorf("%s", body)
-	}
-
-	var decoded DetokenizeResponse
-	if err := json.Unmarshal(body, &decoded); err != nil {
-		return "", fmt.Errorf("unmarshal encode response: %w", err)
-	}
-
-	return decoded.Content, nil
-}
-
 type EmbeddingRequest struct {
 	Content string `json:"content"`
 }
@@ -786,51 +207,59 @@ type EmbeddingResponse struct {
 	Embedding []float64 `json:"embedding"`
 }

-func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error) {
-	endpoint := fmt.Sprintf("http://127.0.0.1:%d/embedding", llm.Port)
-	data, err := json.Marshal(TokenizeRequest{Content: input})
-	if err != nil {
-		return nil, fmt.Errorf("error marshaling embed data: %w", err)
+// extractDynamicLibs copies every file embedded in libEmbed that matches glob
+// into workDir (created if needed) and returns the destination paths in match
+// order. A destination that already exists is left untouched. payloadMissing
+// is returned when the glob fails or matches nothing.
+func extractDynamicLibs(workDir, glob string) ([]string, error) {
+	files, err := fs.Glob(libEmbed, glob)
+	if err != nil || len(files) == 0 {
+		return nil, payloadMissing
 	}
+	if err := os.MkdirAll(workDir, 0o755); err != nil {
+		return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
+	}
+	libs := make([]string, len(files))

-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
-	if err != nil {
-		return nil, fmt.Errorf("error creating embed request: %w", err)
+	for i, file := range files {
+		destFile := filepath.Join(workDir, filepath.Base(file))
+		libs[i] = destFile
+		if err := extractEmbeddedFile(file, destFile); err != nil {
+			return nil, err
+		}
 	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return nil, fmt.Errorf("POST embedding: %w", err)
-	}
-	defer resp.Body.Close()
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, fmt.Errorf("error reading embed response: %w", err)
-	}
-
-	if resp.StatusCode >= 400 {
-		log.Printf("llm encode error: %s", body)
-		return nil, fmt.Errorf("%s", body)
-	}
-
-	var embedding EmbeddingResponse
-	if err := json.Unmarshal(body, &embedding); err != nil {
-		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
-	}
-
-	return embedding.Embedding, nil
-}
-
-// Ping checks that the server subprocess is still running and responding to requests
-func (llm *llama) Ping(ctx context.Context) error {
-	resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port))
-	if err != nil {
-		return fmt.Errorf("ping resp: %w", err)
-	}
-	if resp.StatusCode != http.StatusOK {
-		return fmt.Errorf("unexpected ping status: %s", resp.Status)
-	}
-	return nil
+	return libs, nil
 }
+
+// extractEmbeddedFile writes the embedded file src to dest unless dest already
+// exists. Both handles are closed before it returns.
+func extractEmbeddedFile(src, dest string) error {
+	if _, err := os.Stat(dest); err == nil {
+		// already extracted; keep the existing copy
+		return nil
+	} else if !errors.Is(err, os.ErrNotExist) {
+		return fmt.Errorf("stat payload %s: %v", src, err)
+	}
+
+	srcFile, err := libEmbed.Open(src)
+	if err != nil {
+		return fmt.Errorf("read payload %s: %v", src, err)
+	}
+	defer srcFile.Close()
+
+	destFile, err := os.OpenFile(dest, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+	if err != nil {
+		return fmt.Errorf("write payload %s: %v", src, err)
+	}
+	if _, err := io.Copy(destFile, srcFile); err != nil {
+		destFile.Close()
+		// remove the partial file so a later run does not mistake it for a complete library
+		os.Remove(dest)
+		return fmt.Errorf("copy payload %s: %v", src, err)
+	}
+	// a failed Close can hide a write error, so surface it instead of dropping it
+	if err := destFile.Close(); err != nil {
+		return fmt.Errorf("close payload %s: %v", src, err)
+	}
+	return nil
+}
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -11,6 +11,7 @@ import (

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/format"
+	"github.com/jmorganca/ollama/gpu"
 )

 type LLM interface {
@@ -18,11 +19,11 @@ type LLM interface {
 	Embedding(context.Context, string) ([]float64, error)
 	Encode(context.Context, string) ([]int, error)
 	Decode(context.Context, []int) (string, error)
-	SetOptions(api.Options)
 	Close()
-	Ping(context.Context) error
 }

+var AvailableShims = map[string]string{}
+
 func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
@@ -76,16 +77,27 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		}
 	}

-	switch ggml.Name() {
-	case "gguf":
-		// TODO: gguf will load these options automatically from the model binary
-		opts.NumGQA = 0
-		opts.RopeFrequencyBase = 0.0
-		opts.RopeFrequencyScale = 0.0
-		return newLlama(model, adapters, projectors, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
-	case "ggml", "ggmf", "ggjt", "ggla":
-		return newLlama(model, adapters, projectors, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
-	default:
-		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
-	}
+	opts.NumGQA = 0
+	opts.RopeFrequencyBase = 0.0
+	opts.RopeFrequencyScale = 0.0
+	gpuInfo := gpu.GetGPUInfo()
+	return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)
+}
+
+// Init gives any native cgo implementations an opportunity to initialize by forwarding workdir to nativeInit
+func Init(workdir string) error {
+	return nativeInit(workdir)
+}
+
+func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	// Try the shim registered for this library first; "default" always takes the built-in server
+	if shimPath, registered := AvailableShims[library]; registered && library != "default" {
+		srv, err := newDynamicShimExtServer(shimPath, model, adapters, projectors, numLayers, opts)
+		if err == nil {
+			return srv, nil
+		}
+		log.Printf("Failed to load dynamic library - falling back to CPU mode %s", err)
+	}
+
+	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
 }
--- a/llm/shim_darwin.go
+++ b/llm/shim_darwin.go
@@ -0,0 +1,32 @@
+package llm
+
+import (
+	"embed"
+	"fmt"
+	"log"
+	"os"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+//go:embed llama.cpp/gguf/ggml-metal.metal
+var libEmbed embed.FS
+
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	// should never happen: this file's nativeInit registers nothing in AvailableShims, which newLlmServer checks first
+	return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
+}
+
+// nativeInit extracts the embedded ggml-metal.metal into workdir and exports workdir as GGML_METAL_PATH_RESOURCES
+func nativeInit(workdir string) error {
+	_, err := extractDynamicLibs(workdir, "llama.cpp/gguf/ggml-metal.metal")
+	if err == payloadMissing {
+		// TODO perhaps consider this a hard failure on arm macs?
+		log.Printf("ggml-metal.metal payload missing")
+		return nil
+	}
+	if err != nil {
+		return err
+	}
+	return os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
+}
--- a/llm/shim_ext_server.go
+++ b/llm/shim_ext_server.go
@@ -0,0 +1,152 @@
+//go:build !darwin
+
+package llm
+
+/*
+
+#include <stdlib.h>
+#include "dynamic_shim.h"
+
+*/
+import "C"
+import (
+	"context"
+	"embed"
+	"errors"
+	"fmt"
+	"io/fs"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"unsafe"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+//go:embed llama.cpp/gguf/build/lib/*
+var libEmbed embed.FS
+
+var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
+
+type shimExtServer struct {
+	s       C.struct_dynamic_llama_server
+	options api.Options
+}
+
+// Note: current implementation does not support concurrent instantiations
+var shimMutex sync.Mutex
+var llm *shimExtServer
+
+func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
+	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
+}
+func (llm *shimExtServer) llama_server_start() {
+	C.dynamic_shim_llama_server_start(llm.s)
+}
+func (llm *shimExtServer) llama_server_stop() {
+	C.dynamic_shim_llama_server_stop(llm.s)
+}
+
+func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
+	C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
+}
+func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
+	C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
+}
+func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
+	C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
+}
+func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
+	C.dynamic_shim_llama_server_release_task_result(llm.s, result)
+}
+
+func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
+}
+func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
+}
+func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
+}
+func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
+	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
+}
+
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	shimMutex.Lock()
+	defer shimMutex.Unlock()
+	libPath := C.CString(library)
+	defer C.free(unsafe.Pointer(libPath))
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var srv C.struct_dynamic_llama_server
+	C.dynamic_shim_init(libPath, &srv, &resp)
+	if resp.id < 0 {
+		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
+	}
+	llm = &shimExtServer{
+		s:       srv,
+		options: opts,
+	}
+	log.Printf("Loading Dynamic Shim llm server: %s", library)
+	return newExtServer(llm, model, adapters, projectors, numLayers, opts)
+}
+
+func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
+	return predict(llm, llm.options, ctx, pred, fn)
+}
+
+func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
+	return encode(llm, ctx, prompt)
+}
+
+func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
+	return decode(llm, ctx, tokens)
+}
+
+func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
+	return embedding(llm, ctx, input)
+}
+
+func (llm *shimExtServer) Close() {
+	close(llm)
+}
+
+// nativeInit extracts the embedded *server* libraries into workdir and registers each in
+// AvailableShims under its file name minus the "lib" prefix and extension. A missing payload is only logged.
+func nativeInit(workdir string) error {
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*")
+	if err != nil {
+		if err == payloadMissing {
+			log.Printf("%s", payloadMissing)
+			return nil
+		}
+		return err
+	}
+	for _, lib := range libs {
+		libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
+		AvailableShims[libName] = lib
+	}
+
+	// Only check ROCm access if we have the dynamic lib loaded
+	if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent {
+		// Verify we have permissions - either running as root, or we have group access to the driver
+		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
+		if err != nil {
+			if errors.Is(err, fs.ErrPermission) {
+				// log.Fatalf exits the process, so nothing after it in this branch would run
+				log.Fatalf("Radeon card detected, but permissions not set up properly.  Either run ollama as root, or add your user account to the render group.")
+			} else if errors.Is(err, fs.ErrNotExist) {
+				// expected behavior without a radeon card
+				return nil
+			}
+			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
+		}
+		fd.Close()
+	}
+
+	return nil
+}
--- a/llm/starcoder.go
+++ b/llm/starcoder.go
@@ -1,23 +0,0 @@
-package llm
-
-const (
-	starCoderModelType1B  = 24
-	starCoderModelType3B  = 36
-	starCoderModelType7B  = 42
-	starCoderModelType15B = 40
-)
-
-func starCoderModelType(numLayer uint32) string {
-	switch numLayer {
-	case 24:
-		return "1B"
-	case 36:
-		return "3B"
-	case 42:
-		return "7B"
-	case 40:
-		return "15B"
-	default:
-		return "unknown"
-	}
-}
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -192,14 +192,7 @@ func (i *Instance) Readline() (string, error) {
 		case CharCtrlW:
 			buf.DeleteWord()
 		case CharCtrlZ:
-			if err := UnsetRawMode(fd, termios); err != nil {
-				return "", err
-			}
-
-			syscall.Kill(0, syscall.SIGSTOP)
-
-			// on resume...
-			return "", nil
+			return handleCharCtrlZ(fd, termios)
 		case CharEnter:
 			output := buf.String()
 			if output != "" {
--- a/readline/readline_unix.go
+++ b/readline/readline_unix.go
@@ -0,0 +1,18 @@
+//go:build !windows
+
+package readline
+
+import (
+	"syscall"
+)
+
+func handleCharCtrlZ(fd int, termios *Termios) (string, error) {
+	// leave raw mode before stopping; bail out if that fails
+	if err := UnsetRawMode(fd, termios); err != nil {
+		return "", err
+	}
+	// kill(2) with pid 0 targets the caller's whole process group
+	syscall.Kill(0, syscall.SIGSTOP)
+	// on resume...
+	return "", nil
+}
--- a/readline/readline_windows.go
+++ b/readline/readline_windows.go
@@ -0,0 +1,6 @@
+package readline
+
+func handleCharCtrlZ(fd int, state *State) (string, error) {
+	// not supported on Windows: both arguments are ignored and an empty line is returned
+	return "", nil
+}
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -9,7 +9,7 @@ mkdir -p dist

 for TARGETARCH in arm64 amd64; do
    GOOS=darwin GOARCH=$TARGETARCH go generate ./...
-    GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
    rm -rf llm/llama.cpp/*/build
 done

--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -7,8 +7,8 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version

 mkdir -p dist

-for TARGETARCH in arm64 amd64; do
-    docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
+for TARGETARCH in amd64 arm64; do
+    docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
    docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
    docker rm builder-$TARGETARCH
--- a/scripts/build_remote.py
+++ b/scripts/build_remote.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+import subprocess
+import sys
+from urllib.parse import urlparse
+from git import Repo
+
+# Helper script to be able to build on remote repos using git to push local changes
+# (e.g. particularly helpful to target a remote windows build system)
+#
+# Typical windows remote git config looks like this:
+#
+#[remote "windows-pa"]
+#        url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama
+#        fetch = +refs/heads/*:refs/remotes/windows-pa/*
+#        uploadpack = powershell git upload-pack
+#        receivepack = powershell git receive-pack
+#
+
+# TODO - add argparse and make this more configurable
+# - force flag becomes optional
+# - generate, build or test ...
+
+# Note: remote repo will need this run once:
+# git config --local receive.denyCurrentBranch updateInstead
+repo = Repo(".")
+
+# On linux, add links in /usr/local/bin to the go binaries to avoid needing this
+# GoCmd = "/usr/local/go/bin/go" 
+GoCmd = "go" 
+
+if repo.is_dirty():
+    print("Tree is dirty.  Commit your changes before running this script")
+    sys.exit(1)
+
+remotes = {r.name: r for r in repo.remotes}
+if len(sys.argv) != 2 or sys.argv[1] not in remotes:  # unknown names get the usage message, not a KeyError
+    print("Please specify the remote name: " + ', '.join(remotes))
+    sys.exit(1)
+remote_name = sys.argv[1]
+remote = remotes[remote_name]
+raw_url = list(remote.urls)[0]
+url = urlparse(raw_url)
+# Windows urls don't quite parse properly
+if url.scheme == "" and url.netloc == "":
+    url = urlparse("ssh://" + raw_url)
+print("URL: " + str(url))
+netloc = url.netloc.split(":")[0]
+path = url.path
+branch_name = repo.active_branch.name
+
+print("Force pushing content to remote...")
+# Use with care given the force push
+remote.push(force=True).raise_if_error()
+
+print("Ensuring correct branch checked out on remote via ssh...")
+subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name])
+
+
+# TODO - add some hardening to try to figure out how to set up the path properly
+# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env'])
+# TODO - or consider paramiko maybe
+
+print("Performing generate")
+subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...'])
+
+print("Building")
+subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])
+
--- a/scripts/setup_integration_tests.sh
+++ b/scripts/setup_integration_tests.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# This script sets up integration tests which run the full stack to verify
+# inference locally
+set -eo pipefail
+
+REPO="$(dirname "$0")/../"
+export OLLAMA_MODELS="${REPO}/test_data/models"
+REGISTRY_SCHEME=https
+REGISTRY=registry.ollama.ai
+TEST_MODELS=("library/orca-mini:latest" "library/llava:7b")
+ACCEPT_HEADER="Accept: application/vnd.docker.distribution.manifest.v2+json"
+
+for model in "${TEST_MODELS[@]}"; do
+    TEST_MODEL=$(echo "${model}" | cut -f1 -d:)
+    TEST_MODEL_TAG=$(echo "${model}" | cut -f2 -d:)
+    mkdir -p "${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/"
+    mkdir -p "${OLLAMA_MODELS}/blobs/"
+
+    echo "Pulling manifest for ${TEST_MODEL}:${TEST_MODEL_TAG}"
+    # --fail: abort on an HTTP error instead of saving the error body as the manifest
+    curl -sS --fail --header "${ACCEPT_HEADER}" \
+        -o "${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG}" \
+        "${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/manifests/${TEST_MODEL_TAG}"
+
+    CFG_HASH=$(jq -r ".config.digest" "${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG}")
+    echo "Pulling config blob ${CFG_HASH}"
+    curl -L -C - --header "${ACCEPT_HEADER}" \
+            -o "${OLLAMA_MODELS}/blobs/${CFG_HASH}" \
+            "${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${CFG_HASH}"
+
+    for LAYER in $(jq -r ".layers[].digest" "${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG}") ; do
+        echo "Pulling blob ${LAYER}"
+        curl -L -C - --header "${ACCEPT_HEADER}" \
+            -o "${OLLAMA_MODELS}/blobs/${LAYER}" \
+            "${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER}"
+    done
+done
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Matt Williams	1eefebe392	add faq for what is context Signed-off-by: Matt Williams <m@technovangelist.com>	2023-12-22 09:20:23 -08:00
Matt Williams	5a85070c22	Update readmes, requirements, packagejsons, etc for all examples (#1452 ) Most of the examples needed updates of Readmes to show how to run them. Some of the requirements.txt files had extra content that wasn't needed, or missing altogether. Apparently some folks like to run npm start to run typescript, so a script was added to all typescript examples which hadn't been done before. Basically just a lot of cleanup. Signed-off-by: Matt Williams <m@technovangelist.com>	2023-12-22 09:10:41 -08:00
Matt Williams	291700c92d	Clean up documentation (#1506 ) * Clean up documentation Will probably need to update with PRs for new release. Signed-off-by: Matt Williams <m@technovangelist.com> * Correcting to fit in 0.1.15 changes Signed-off-by: Matt Williams <m@technovangelist.com> * Update README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * addressing comments Signed-off-by: Matt Williams <m@technovangelist.com> * more api cleanup Signed-off-by: Matt Williams <m@technovangelist.com> * its llava not llama Signed-off-by: Matt Williams <m@technovangelist.com> * Update docs/troubleshooting.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Updated hosting to server and documented all env vars Signed-off-by: Matt Williams <m@technovangelist.com> * remove last of the cli descriptions Signed-off-by: Matt Williams <m@technovangelist.com> * Update README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * update further per conversation with jeff earlier today Signed-off-by: Matt Williams <m@technovangelist.com> * cleanup the doc readme Signed-off-by: Matt Williams <m@technovangelist.com> * move upgrade to faq Signed-off-by: Matt Williams <m@technovangelist.com> * first change Signed-off-by: Matt Williams <m@technovangelist.com> * updated Signed-off-by: Matt Williams <m@technovangelist.com> * Update docs/faq.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * 
Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * examples in parent Signed-off-by: Matt Williams <m@technovangelist.com> * add exapmle for create model. Signed-off-by: Matt Williams <m@technovangelist.com> * update faq Signed-off-by: Matt Williams <m@technovangelist.com> * update create model api Signed-off-by: Matt Williams <m@technovangelist.com> * Update docs/api.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/faq.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/troubleshooting.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * update the readme in docs Signed-off-by: Matt Williams <m@technovangelist.com> * update a few more things Signed-off-by: Matt Williams <m@technovangelist.com> * Update docs/troubleshooting.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/faq.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/modelfile.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * Update docs/troubleshooting.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> --------- Signed-off-by: Matt Williams <m@technovangelist.com> Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>	2023-12-22 09:10:01 -08:00
Daniel Hiltgen	9db28af84e	Merge pull request #1675 from dhiltgen/less_verbose Quiet down llama.cpp logging by default	2023-12-22 08:57:17 -08:00
Daniel Hiltgen	e5202eb687	Quiet down llama.cpp logging by default By default builds will now produce non-debug and non-verbose binaries. To enable verbose logs in llama.cpp and debug symbols in the native code, set `CGO_CFLAGS=-g`	2023-12-22 08:47:18 -08:00
Daniel Hiltgen	96fb441abd	Merge pull request #1146 from dhiltgen/ext_server_cgo Add cgo implementation for llama.cpp	2023-12-22 08:16:31 -08:00
Daniel Hiltgen	495c06e4a6	Fix doc glitch	2023-12-21 18:21:31 -08:00
Daniel Hiltgen	fa24e73b82	Remove CPU build, fixup linux build script	2023-12-21 18:21:31 -08:00
Daniel Hiltgen	325d74985b	Fix CPU performance on hyperthreaded systems The default thread count logic was broken and resulted in 2x the number of threads as it should on a hyperthreading CPU resulting in thrashing and poor performance.	2023-12-21 16:23:36 -08:00
Bruce MacDonald	fabf2f3467	allow for starting llava queries with filepath (#1549 )	2023-12-21 13:20:59 -05:00
Daniel Hiltgen	d9cd3d9667	Revive windows build The windows native setup still needs some more work, but this gets it building again and if you set the PATH properly, you can run the resulting exe on a cuda system.	2023-12-20 17:21:54 -08:00
Patrick Devine	a607d922f0	add FAQ for slow networking in WSL2 (#1646 )	2023-12-20 16:27:24 -08:00
Daniel Hiltgen	7555ea44f8	Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped.	2023-12-20 14:45:57 -08:00
Jeffrey Morgan	df06812494	Update api.md	2023-12-20 08:47:53 -05:00
Daniel Hiltgen	1d1eb1688c	Additional nvidial-ml path to check	2023-12-19 15:52:34 -08:00
Michael Yang	23dc179350	Merge pull request #1619 from jmorganca/mxyng/fix-version-test fix(test): use real version string for comparison	2023-12-19 15:48:52 -08:00
Michael Yang	63aac0edc5	fix(test): use real version string for comparison	2023-12-19 15:03:02 -08:00
Daniel Hiltgen	6558f94ed0	Fix darwin intel build	2023-12-19 13:32:24 -08:00
Erick Ghaumez	1ca484f67e	Add Langchain Dart library (#1564 ) * Add Langchain Dart * Update README.md --------- Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>	2023-12-19 14:04:52 -05:00
Jeffrey Morgan	72b0c32fe9	Update README.md	2023-12-19 12:59:22 -05:00
Jeffrey Morgan	68c28224f8	Update README.md	2023-12-19 12:59:03 -05:00
Daniel Hiltgen	54dbfa4c4a	Carry ggml-metal.metal as payload	2023-12-19 09:05:46 -08:00
Daniel Hiltgen	5646826a79	Add WSL2 path to nvidia-ml.so library	2023-12-19 09:05:46 -08:00
Daniel Hiltgen	3269535a4c	Refine handling of shim presence This allows the CPU only builds to work on systems with Radeon cards	2023-12-19 09:05:46 -08:00
Daniel Hiltgen	1b991d0ba9	Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version	2023-12-19 09:05:46 -08:00
Daniel Hiltgen	51082535e1	Add automated test for multimodal A simple test case that verifies llava:7b can read text in an image	2023-12-19 09:05:46 -08:00
Daniel Hiltgen	9adca7f711	Bump llama.cpp to b1662 and set n_parallel=1	2023-12-19 09:05:46 -08:00
Daniel Hiltgen	89bbaafa64	Build linux using ubuntu 20.04 This changes the container-based linux build to use an older Ubuntu distro to improve our compatibility matrix for older user machines	2023-12-19 09:05:46 -08:00
Daniel Hiltgen	35934b2e05	Adapted rocm support to cgo based llama.cpp	2023-12-19 09:05:46 -08:00
65a	f8ef4439e9	Use build tags to generate accelerated binaries for CUDA and ROCm on Linux. The build tags rocm or cuda must be specified to both go generate and go build. ROCm builds should have both ROCM_PATH set (and the ROCM SDK present) as well as CLBlast installed (for GGML) and CLBlast_DIR set in the environment to the CLBlast cmake directory (likely /usr/lib/cmake/CLBlast). Build tags are also used to switch VRAM detection between cuda and rocm implementations, using added "accelerator_foo.go" files which contain architecture specific functions and variables. accelerator_none is used when no tags are set, and a helper function addRunner will ignore it if it is the chosen accelerator. Fix go generate commands, thanks @deadmeu for testing.	2023-12-19 09:05:46 -08:00
Daniel Hiltgen	d4cd695759	Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions.	2023-12-19 09:05:46 -08:00
Bruce MacDonald	5e7fd6906f	Update images.go	2023-12-19 09:05:46 -08:00
Bruce MacDonald	811b1f03c8	deprecate ggml - remove ggml runner - automatically pull gguf models when ggml detected - tell users to update to gguf in the case automatic pull fails Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com>	2023-12-19 09:05:46 -08:00
Matt Williams	ed195f3562	Merge pull request #1595 from pgibler/main Added cmdh to community section in README	2023-12-18 20:55:18 -08:00
Matt Williams	e0d0072ef1	Merge pull request #1592 from jmorganca/mattw/examplepruning Lets get rid of these old modelfile examples	2023-12-18 20:29:48 -08:00
pgibler	620a2ffcfb	Added cmdh to community section in README	2023-12-18 22:04:40 -05:00
Matt Williams	d287013f24	Lets get rid of these old modelfile examples Signed-off-by: Matt Williams <m@technovangelist.com>	2023-12-18 17:47:33 -08:00
Jeffrey Morgan	6b5bdfa6c9	update runner submodule	2023-12-18 17:33:46 -05:00
Jeffrey Morgan	c063ee4af0	update runner submodule to fix hipblas build	2023-12-18 15:41:13 -05:00
Bruce MacDonald	d99fa6ce0a	send empty messages on last chat response (#1530 )	2023-12-18 14:23:38 -05:00
Patrick Devine	3948c6ea06	add magic header for unit tests (#1558 )	2023-12-18 10:41:02 -08:00
Jeffrey Morgan	b85982eb91	update runner submodule	2023-12-18 12:43:31 -05:00
Patrick Devine	86b0dd4b16	add API create/copy handlers (#1541 )	2023-12-15 11:59:18 -08:00
Augustinas Malinauskas	f728738427	README with Enchanted iOS App (#1529 ) * feat(docs): README with Enchanted iOS app * Update README.md --------- Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>	2023-12-15 14:37:29 -05:00
Ian Purton	115048a0d8	Added Bionic GPT as a front end. (#1463 ) * Added Bionic GPT as a front end. * Update README.md --------- Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>	2023-12-15 14:33:04 -05:00
Bruce MacDonald	1b417a7836	use exp slices for go 1.20 compatibility (#1544 )	2023-12-15 14:15:56 -05:00
Patrick Devine	0174665d0e	add API tests for list handler (#1535 )	2023-12-14 18:18:25 -08:00
Patrick Devine	630518f0d9	Add unit test of API routes (#1528 )	2023-12-14 16:47:40 -08:00
Bruce MacDonald	6e16098a60	remove sample_count from docs (#1527 ) this info has not been returned from these endpoints in some time	2023-12-14 17:49:00 -05:00
Bruce MacDonald	6ee8c80199	restore model load duration on generate response (#1524 ) * restore model load duration on generate response - set model load duration on generate and chat done response - calculate createAt time when response created * remove checkpoints predict opts * Update routes.go	2023-12-14 12:15:50 -05:00
Jeffrey Morgan	31f0551dab	Update runner to support mixtral and mixture of experts (MoE) (#1475 )	2023-12-13 17:15:10 -05:00
Jeffrey Morgan	4a1abfe4fa	fix tests	2023-12-13 14:42:30 -05:00
Jeffrey Morgan	bbd41494bf	add multimodal to `README.md`	2023-12-13 14:38:47 -05:00
Jeffrey Morgan	fedba24a63	Docs for multimodal support (#1485 ) * add multimodal docs * add chat api docs * consistency between `/api/generate` and `/api/chat` * simplify docs	2023-12-13 13:59:33 -05:00
pepperoni21	e3b090dbc5	Added message format for chat api (#1488 )	2023-12-13 11:21:23 -05:00
Patrick Devine	d9e60f634b	add image support to the chat api (#1490 )	2023-12-12 13:28:58 -08:00
Michael Yang	4251b342de	Merge pull request #1469 from jmorganca/mxyng/model-types remove per-model types	2023-12-12 12:27:03 -08:00
Jeffrey Morgan	0a9d348023	Fix issues with `/set template` and `/set system` (#1486 )	2023-12-12 14:43:19 -05:00
Bruce MacDonald	3144e2a439	exponential back-off (#1484 )	2023-12-12 12:33:02 -05:00
Bruce MacDonald	c0960e29b5	retry on concurrent request failure (#1483 ) - remove parallel	2023-12-12 12:14:35 -05:00
ruecat	5314fc9b63	Fix Readme "Database -> MindsDB" link (#1479 )	2023-12-12 10:26:13 -05:00
Jorge Torres	a36b5fef3b	Update README.md (#1412 )	2023-12-11 18:05:10 -05:00
Patrick Devine	910e9401d0	Multimodal support (#1216 ) --------- Co-authored-by: Matt Apperson <mattapperson@Matts-MacBook-Pro.local>	2023-12-11 13:56:22 -08:00
Michael Yang	56ffc3023a	remove per-model types mostly replaced by decoding tensors except ggml models which only support llama	2023-12-11 09:40:21 -08:00
Bruce MacDonald	7a1b37ac64	os specific ctrl-z (#1420 )	2023-12-11 10:48:14 -05:00
Jeffrey Morgan	5d4d2e2c60	update docs with chat completion api	2023-12-10 13:53:36 -05:00