Compare commits


1 commit

Author: Jeffrey Morgan · SHA1: 23481167a7 · Message: add model_format config var · Date: 2023-09-07 13:54:53 -04:00
48 changed files with 884 additions and 2628 deletions


@@ -1,8 +1,4 @@
 .vscode
 ollama
 app
-dist
-scripts
 llm/llama.cpp/ggml
-llm/llama.cpp/gguf
-.env

.gitmodules (12 lines changed)

@@ -1,10 +1,4 @@
[submodule "llm/llama.cpp/ggml"] [submodule "llm/llama.cpp/ggml"]
path = llm/llama.cpp/ggml path = llm/llama.cpp/ggml
url = https://github.com/ggerganov/llama.cpp.git url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty ignore = dirty
shallow = true
[submodule "llm/llama.cpp/gguf"]
path = llm/llama.cpp/gguf
url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty
shallow = true


@@ -1,23 +1,21 @@
-FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
+FROM golang:alpine
-ARG TARGETARCH
-ARG VERSION=0.0.0
-ARG GOFLAGS="'-ldflags=-w -s'"
 WORKDIR /go/src/github.com/jmorganca/ollama
-RUN apt-get update && apt-get install -y git build-essential cmake
+RUN apk add --no-cache git build-base cmake
-ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
-RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
 COPY . .
-ENV GOARCH=$TARGETARCH
-RUN /usr/local/go/bin/go generate ./... \
-    && /usr/local/go/bin/go build .
+RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .
-FROM ubuntu:22.04
+FROM alpine
-RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
-EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
+RUN apk add --no-cache libstdc++
+ARG USER=ollama
+ARG GROUP=ollama
+RUN addgroup $GROUP && adduser -D -G $GROUP $USER
+COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
+USER $USER:$GROUP
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]


@@ -1,32 +0,0 @@
# centos7 amd64 dependencies
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-devel-centos7 AS base-amd64
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
yum update -y && \
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
# centos8 arm64 dependencies
FROM --platform=linux/arm64 nvidia/cuda:11.4.3-devel-centos8 AS base-arm64
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y git cmake
FROM base-${TARGETARCH}
ARG TARGETARCH
# install go
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
ENV GOOS=linux
ENV GOARCH=$TARGETARCH
ARG VERSION=0.0.0
ARG GOFLAGS="'-ldflags -w -s'"
RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build .

README.md (202 lines changed)

@@ -9,27 +9,19 @@
[![Discord](https://dcbadge.vercel.app/api/server/ollama?style=flat&compact=true)](https://discord.gg/ollama) [![Discord](https://dcbadge.vercel.app/api/server/ollama?style=flat&compact=true)](https://discord.gg/ollama)
Get up and running with large language models locally. Run, create, and share large language models (LLMs).
### macOS > Note: Ollama is in early preview. Please report any issues you find.
[Download](https://ollama.ai/download/Ollama-darwin.zip) ## Download
### Linux & WSL2 - [Download](https://ollama.ai/download) for macOS
- Download for Windows and Linux (coming soon)
``` - Build [from source](#building)
curl https://ollama.ai/install.sh | sh
```
[Manual install instructions](https://github.com/jmorganca/ollama/blob/main/docs/linux.md)
### Windows
coming soon
## Quickstart ## Quickstart
To run and chat with [Llama 2](https://ollama.ai/library/llama2): To run and chat with [Llama 2](https://ai.meta.com/llama), the new model by Meta:
``` ```
ollama run llama2 ollama run llama2
@@ -37,50 +29,87 @@ ollama run llama2
## Model library ## Model library
Ollama supports a list of open-source models available on [ollama.ai/library](https://ollama.ai/library "ollama model library") Ollama supports a list of open-source models available on [ollama.ai/library](https://ollama.ai/library 'ollama model library')
Here are some example open-source models that can be downloaded: Here are some example open-source models that can be downloaded:
| Model | Parameters | Size | Download | | Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | ------------------------------ | | ------------------------ | ---------- | ----- | ------------------------------- |
| Mistral | 7B | 4.1GB | `ollama run mistral` | | Llama2 | 7B | 3.8GB | `ollama pull llama2` |
| Llama 2 | 7B | 3.8GB | `ollama run llama2` | | Llama2 13B | 13B | 7.3GB | `ollama pull llama2:13b` |
| Code Llama | 7B | 3.8GB | `ollama run codellama` | | Llama2 70B | 70B | 39GB | `ollama pull llama2:70b` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` | | Llama2 Uncensored | 7B | 3.8GB | `ollama pull llama2-uncensored` |
| Llama 2 13B | 13B | 7.3GB | `ollama run llama2:13b` | | Code Llama | 7B | 3.8GB | `ollama pull codellama` |
| Llama 2 70B | 70B | 39GB | `ollama run llama2:70b` | | Orca Mini | 3B | 1.9GB | `ollama pull orca-mini` |
| Orca Mini | 3B | 1.9GB | `ollama run orca-mini` | | Vicuna | 7B | 3.8GB | `ollama pull vicuna` |
| Vicuna | 7B | 3.8GB | `ollama run vicuna` | | Nous-Hermes | 7B | 3.8GB | `ollama pull nous-hermes` |
| Nous-Hermes 13B | 13B | 7.3GB | `ollama pull nous-hermes:13b` |
| Wizard Vicuna Uncensored | 13B | 7.3GB | `ollama pull wizard-vicuna` |
> Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models. > Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.
## Customize your own model ## Examples
### Import from GGUF or GGML ### Pull a public model
Ollama supports importing GGUF and GGML file formats in the Modelfile. This means if you have a model that is not in the Ollama library, you can create it, iterate on it, and upload it to the Ollama library to share with others when you are ready. ```
ollama pull llama2
```
1. Create a file named Modelfile, and add a `FROM` instruction with the local filepath to the model you want to import. > This command can also be used to update a local model. Only updated changes will be pulled.
``` ### Run a model interactively
FROM ./vicuna-33b.Q4_0.gguf
```
3. Create the model in Ollama ```
ollama run llama2
>>> hi
Hello! How can I help you today?
```
``` For multiline input, you can wrap text with `"""`:
ollama create name -f path_to_modelfile
```
5. Run the model ```
>>> """Hello,
... world!
... """
I'm a basic program that prints the famous "Hello, world!" message to the console.
```
``` ### Run a model non-interactively
ollama run name
```
### Customize a prompt ```
$ ollama run llama2 'tell me a joke'
Sure! Here's a quick one:
Why did the scarecrow win an award? Because he was outstanding in his field!
```
Models from the Ollama library can be customized with a prompt. The example ```
$ cat <<EOF >prompts.txt
tell me a joke about llamas
tell me another one
EOF
$ ollama run llama2 <prompts.txt
>>> tell me a joke about llamas
Why did the llama refuse to play hide-and-seek?
nobody likes to be hided!
>>> tell me another one
Sure, here's another one:
Why did the llama go to the bar?
To have a hay-often good time!
```
### Run a model on contents of a text file
```
$ ollama run llama2 "summarize this file:" "$(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
### Customize a model
Pull a base model:
``` ```
ollama pull llama2 ollama pull llama2
@@ -109,61 +138,30 @@ ollama run mario
Hello! It's your friend Mario. Hello! It's your friend Mario.
``` ```
For more examples, see the [examples](./examples) directory. For more information on working with a Modelfile, see the [Modelfile](./docs/modelfile.md) documentation. For more examples, see the [examples](./examples) directory. For more information on creating a Modelfile, see the [Modelfile](./docs/modelfile.md) documentation.
## CLI Reference ### Listing local models
### Create a model
`ollama create` is used to create a model from a Modelfile.
### Pull a model
```
ollama pull llama2
```
> This command can also be used to update a local model. Only the diff will be pulled.
### Remove a model
```
ollama rm llama2
```
### Copy a model
```
ollama cp llama2 my-llama2
```
### Multiline input
For multiline input, you can wrap text with `"""`:
```
>>> """Hello,
... world!
... """
I'm a basic program that prints the famous "Hello, world!" message to the console.
```
### Pass in prompt as arguments
```
$ ollama run llama2 "summarize this file:" "$(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
### List models on your computer
``` ```
ollama list ollama list
``` ```
### Start Ollama ### Removing local models
`ollama serve` is used when you want to start ollama without running the desktop application. ```
ollama rm llama2
```
## Model packages
### Overview
Ollama bundles model weights, configurations, and data into a single package, defined by a [Modelfile](./docs/modelfile.md).
<picture>
<source media="(prefers-color-scheme: dark)" height="480" srcset="https://github.com/jmorganca/ollama/assets/251292/2fd96b5f-191b-45c1-9668-941cfad4eb70">
<img alt="logo" height="480" src="https://github.com/jmorganca/ollama/assets/251292/2fd96b5f-191b-45c1-9668-941cfad4eb70">
</picture>
## Building ## Building
@@ -206,18 +204,12 @@ curl -X POST http://localhost:11434/api/generate -d '{
}' }'
``` ```
## Community Integrations ## Community Projects using Ollama
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa) - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with a question-answering [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa).
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html) - [Continue](https://github.com/continuedev/continue) - embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline.
- [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama) - [LiteLLM](https://github.com/BerriAI/litellm) a lightweight python package to simplify LLM API calls
- [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel) - [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) - interact with Ollama as a chatbot on Discord.
- [Continue](https://github.com/continuedev/continue) - [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) - Raycast extension to use Ollama for local llama inference on Raycast.
- [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama) - [Simple HTML UI for Ollama](https://github.com/rtcfirefly/ollama-ui)
- [Dagger Chatbot](https://github.com/samalba/dagger-chatbot) - [Emacs client](https://github.com/zweifisch/ollama) for Ollama
- [LiteLLM](https://github.com/BerriAI/litellm)
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Dumbar](https://github.com/JerrySievert/Dumbar)
- [Emacs client](https://github.com/zweifisch/ollama)


@@ -1,225 +0,0 @@
import os
import json
import requests
BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
# The final response object will include statistics and additional data from the request. Use the callback function to override
# the default handler.
def generate(model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
try:
url = f"{BASE_URL}/api/generate"
payload = {
"model": model_name,
"prompt": prompt,
"system": system,
"template": template,
"context": context,
"options": options
}
# Remove keys with None values
payload = {k: v for k, v in payload.items() if v is not None}
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Creating a variable to hold the context history of the final chunk
final_context = None
# Variable to hold concatenated response strings if no callback is provided
full_response = ""
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# If this is not the last chunk, add the "response" field value to full_response and print it
if not chunk.get("done"):
response_piece = chunk.get("response", "")
full_response += response_piece
print(response_piece, end="", flush=True)
# Check if it's the last chunk (done is true)
if chunk.get("done"):
final_context = chunk.get("context")
# Return the full response and the final context
return full_response, final_context
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None, None
# Create a model from a Modelfile. Use the callback function to override the default handler.
def create(model_name, model_path, callback=None):
try:
url = f"{BASE_URL}/api/create"
payload = {"name": model_name, "path": model_path}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the status
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the status
chunk = json.loads(line)
if callback:
callback(chunk)
else:
print(f"Status: {chunk.get('status')}")
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
# Pull a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple
# calls to will share the same download progress. Use the callback function to override the default handler.
def pull(model_name, insecure=False, callback=None):
try:
url = f"{BASE_URL}/api/pull"
payload = {
"name": model_name,
"insecure": insecure
}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# Print the status message directly to the console
print(chunk.get('status', ''), end='', flush=True)
# If there's layer data, you might also want to print that (adjust as necessary)
if 'digest' in chunk:
print(f" - Digest: {chunk['digest']}", end='', flush=True)
print(f" - Total: {chunk['total']}", end='', flush=True)
print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
else:
print()
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
# Push a model to the model registry. Use the callback function to override the default handler.
def push(model_name, insecure=False, callback=None):
try:
url = f"{BASE_URL}/api/push"
payload = {
"name": model_name,
"insecure": insecure
}
# Making a POST request with the stream parameter set to True to handle streaming responses
with requests.post(url, json=payload, stream=True) as response:
response.raise_for_status()
# Iterating over the response line by line and displaying the details
for line in response.iter_lines():
if line:
# Parsing each line (JSON chunk) and extracting the details
chunk = json.loads(line)
# If a callback function is provided, call it with the chunk
if callback:
callback(chunk)
else:
# Print the status message directly to the console
print(chunk.get('status', ''), end='', flush=True)
# If there's layer data, you might also want to print that (adjust as necessary)
if 'digest' in chunk:
print(f" - Digest: {chunk['digest']}", end='', flush=True)
print(f" - Total: {chunk['total']}", end='', flush=True)
print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
else:
print()
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
# List models that are available locally.
def list():
try:
response = requests.get(f"{BASE_URL}/api/tags")
response.raise_for_status()
data = response.json()
models = data.get('models', [])
return models
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Copy a model. Creates a model with another name from an existing model.
def copy(source, destination):
try:
# Create the JSON payload
payload = {
"source": source,
"destination": destination
}
response = requests.post(f"{BASE_URL}/api/copy", json=payload)
response.raise_for_status()
# If the request was successful, return a message indicating that the copy was successful
return "Copy successful"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Delete a model and its data.
def delete(model_name):
try:
url = f"{BASE_URL}/api/delete"
payload = {"name": model_name}
response = requests.delete(url, json=payload)
response.raise_for_status()
return "Delete successful"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
# Show info about a model.
def show(model_name):
try:
url = f"{BASE_URL}/api/show"
payload = {"name": model_name}
response = requests.post(url, json=payload)
response.raise_for_status()
# Parse the JSON response and return it
data = response.json()
return data
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return None
def heartbeat():
try:
url = f"{BASE_URL}/"
response = requests.head(url)
response.raise_for_status()
return "Ollama is running"
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
return "Ollama is not running"


@@ -31,22 +31,6 @@ func (e StatusError) Error() string {
} }
} }
// /api/chat
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
}
type ChatRequest struct {
Model string `json:"model"`
Messages []Message `json:"messages"`
}
type ChatResponse struct {
CreatedAt time.Time `json:"created_at"`
Message Message `json:"message"`
}
type GenerateRequest struct { type GenerateRequest struct {
Model string `json:"model"` Model string `json:"model"`
Prompt string `json:"prompt"` Prompt string `json:"prompt"`
@@ -97,18 +81,22 @@ type CopyRequest struct {
type PullRequest struct { type PullRequest struct {
Name string `json:"name"` Name string `json:"name"`
Insecure bool `json:"insecure,omitempty"` Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
} }
type ProgressResponse struct { type ProgressResponse struct {
Status string `json:"status"` Status string `json:"status"`
Digest string `json:"digest,omitempty"` Digest string `json:"digest,omitempty"`
Total int64 `json:"total,omitempty"` Total int `json:"total,omitempty"`
Completed int64 `json:"completed,omitempty"` Completed int `json:"completed,omitempty"`
} }
type PushRequest struct { type PushRequest struct {
Name string `json:"name"` Name string `json:"name"`
Insecure bool `json:"insecure,omitempty"` Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
} }
type ListResponse struct { type ListResponse struct {
@@ -118,7 +106,7 @@ type ListResponse struct {
type ModelResponse struct { type ModelResponse struct {
Name string `json:"name"` Name string `json:"name"`
ModifiedAt time.Time `json:"modified_at"` ModifiedAt time.Time `json:"modified_at"`
Size int64 `json:"size"` Size int `json:"size"`
Digest string `json:"digest"` Digest string `json:"digest"`
} }
@@ -303,7 +291,7 @@ func DefaultOptions() Options {
NumCtx: 2048, NumCtx: 2048,
NumKeep: -1, NumKeep: -1,
NumBatch: 512, NumBatch: 512,
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically NumGPU: 1,
NumGQA: 1, NumGQA: 1,
LowVRAM: false, LowVRAM: false,
F16KV: true, F16KV: true,


@@ -5,7 +5,7 @@ import winston from 'winston'
import 'winston-daily-rotate-file' import 'winston-daily-rotate-file'
import * as path from 'path' import * as path from 'path'
import { v4 as uuidv4 } from 'uuid' import { analytics, id } from './telemetry'
import { installed } from './install' import { installed } from './install'
require('@electron/remote/main').initialize() require('@electron/remote/main').initialize()
@@ -164,11 +164,11 @@ app.on('before-quit', () => {
function init() { function init() {
if (app.isPackaged) { if (app.isPackaged) {
heartbeat()
autoUpdater.checkForUpdates() autoUpdater.checkForUpdates()
setInterval(() => { setInterval(() => {
if (!updateAvailable) { heartbeat()
autoUpdater.checkForUpdates() autoUpdater.checkForUpdates()
}
}, 60 * 60 * 1000) }, 60 * 60 * 1000)
} }
@@ -234,26 +234,28 @@ app.on('window-all-closed', () => {
} }
}) })
function id(): string { // In this file you can include the rest of your app's specific main process
const id = store.get('id') as string // code. You can also put them in separate files and import them here.
let aid = ''
if (id) { try {
return id aid = id()
} } catch (e) {}
const uuid = uuidv4()
store.set('id', uuid)
return uuid
}
autoUpdater.setFeedURL({ autoUpdater.setFeedURL({
url: `https://ollama.ai/api/update?os=${process.platform}&arch=${ url: `https://ollama.ai/api/update?os=${process.platform}&arch=${process.arch}&version=${app.getVersion()}&id=${aid}`,
process.arch
}&version=${app.getVersion()}&id=${id()}`,
}) })
async function heartbeat() {
analytics.track({
anonymousId: aid,
event: 'heartbeat',
properties: {
version: app.getVersion(),
},
})
}
autoUpdater.on('error', e => { autoUpdater.on('error', e => {
logger.error(`update check failed - ${e.message}`)
console.error(`update check failed - ${e.message}`) console.error(`update check failed - ${e.message}`)
}) })

app/src/telemetry.ts (new file, 19 lines)

@@ -0,0 +1,19 @@
import { Analytics } from '@segment/analytics-node'
import { v4 as uuidv4 } from 'uuid'
import Store from 'electron-store'
const store = new Store()
export const analytics = new Analytics({ writeKey: process.env.TELEMETRY_WRITE_KEY || '<empty>' })
export function id(): string {
const id = store.get('id') as string
if (id) {
return id
}
const uuid = uuidv4()
store.set('id', uuid)
return uuid
}


@@ -11,21 +11,20 @@ import (
"io" "io"
"log" "log"
"net" "net"
"net/http"
"os" "os"
"os/exec" "os/exec"
"os/signal" "path"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strings" "strings"
"syscall"
"time" "time"
"github.com/chzyer/readline"
"github.com/dustin/go-humanize" "github.com/dustin/go-humanize"
"github.com/olekukonko/tablewriter" "github.com/olekukonko/tablewriter"
"github.com/pdevine/readline"
"github.com/spf13/cobra" "github.com/spf13/cobra"
"golang.org/x/crypto/ssh" "golang.org/x/crypto/ssh"
"golang.org/x/term"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format" "github.com/jmorganca/ollama/format"
@@ -34,26 +33,6 @@ import (
"github.com/jmorganca/ollama/version" "github.com/jmorganca/ollama/version"
) )
type Painter struct {
IsMultiLine bool
}
func (p Painter) Paint(line []rune, _ int) []rune {
termType := os.Getenv("TERM")
if termType == "xterm-256color" && len(line) == 0 {
var prompt string
if p.IsMultiLine {
prompt = "Use \"\"\" to end multi-line input"
} else {
prompt = "Send a message (/? for help)"
}
return []rune(fmt.Sprintf("\033[38;5;245m%s\033[%dD\033[0m", prompt, len(prompt)))
}
// add a space and a backspace to prevent the cursor from walking up the screen
line = append(line, []rune(" \b")...)
return line
}
func CreateHandler(cmd *cobra.Command, args []string) error { func CreateHandler(cmd *cobra.Command, args []string) error {
filename, _ := cmd.Flags().GetString("file") filename, _ := cmd.Flags().GetString("file")
filename, err := filepath.Abs(filename) filename, err := filepath.Abs(filename)
@@ -80,18 +59,18 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
currentDigest = resp.Digest currentDigest = resp.Digest
switch { switch {
case strings.Contains(resp.Status, "embeddings"): case strings.Contains(resp.Status, "embeddings"):
bar = progressbar.Default(resp.Total, resp.Status) bar = progressbar.Default(int64(resp.Total), resp.Status)
bar.Set64(resp.Completed) bar.Set(resp.Completed)
default: default:
// pulling // pulling
bar = progressbar.DefaultBytes( bar = progressbar.DefaultBytes(
resp.Total, int64(resp.Total),
resp.Status, resp.Status,
) )
bar.Set64(resp.Completed) bar.Set(resp.Completed)
} }
} else if resp.Digest == currentDigest && resp.Digest != "" { } else if resp.Digest == currentDigest && resp.Digest != "" {
bar.Set64(resp.Completed) bar.Set(resp.Completed)
} else { } else {
currentDigest = "" currentDigest = ""
if spinner != nil { if spinner != nil {
@@ -119,24 +98,39 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
} }
func RunHandler(cmd *cobra.Command, args []string) error { func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.FromEnv() insecure, err := cmd.Flags().GetBool("insecure")
if err != nil { if err != nil {
return err return err
} }
models, err := client.List(context.Background()) mp := server.ParseModelPath(args[0])
if err != nil { if err != nil {
return err return err
} }
canonicalModelPath := server.ParseModelPath(args[0]) if mp.ProtocolScheme == "http" && !insecure {
for _, model := range models.Models { return fmt.Errorf("insecure protocol http")
if model.Name == canonicalModelPath.GetShortTagname() { }
return RunGenerate(cmd, args)
fp, err := mp.GetManifestPath(false)
if err != nil {
return err
}
_, err = os.Stat(fp)
switch {
case errors.Is(err, os.ErrNotExist):
if err := pull(args[0], insecure); err != nil {
var apiStatusError api.StatusError
if !errors.As(err, &apiStatusError) {
return err
}
if apiStatusError.StatusCode != http.StatusBadGateway {
return err
}
} }
} case err != nil:
if err := PullHandler(cmd, args); err != nil {
return err return err
} }
@@ -162,13 +156,13 @@ func PushHandler(cmd *cobra.Command, args []string) error {
if resp.Digest != currentDigest && resp.Digest != "" { if resp.Digest != currentDigest && resp.Digest != "" {
currentDigest = resp.Digest currentDigest = resp.Digest
bar = progressbar.DefaultBytes( bar = progressbar.DefaultBytes(
resp.Total, int64(resp.Total),
fmt.Sprintf("pushing %s...", resp.Digest[7:19]), fmt.Sprintf("pushing %s...", resp.Digest[7:19]),
) )
bar.Set64(resp.Completed) bar.Set(resp.Completed)
} else if resp.Digest == currentDigest && resp.Digest != "" { } else if resp.Digest == currentDigest && resp.Digest != "" {
bar.Set64(resp.Completed) bar.Set(resp.Completed)
} else { } else {
currentDigest = "" currentDigest = ""
fmt.Println(resp.Status) fmt.Println(resp.Status)
@@ -351,13 +345,13 @@ func pull(model string, insecure bool) error {
if resp.Digest != currentDigest && resp.Digest != "" { if resp.Digest != currentDigest && resp.Digest != "" {
currentDigest = resp.Digest currentDigest = resp.Digest
bar = progressbar.DefaultBytes( bar = progressbar.DefaultBytes(
resp.Total, int64(resp.Total),
fmt.Sprintf("pulling %s...", resp.Digest[7:19]), fmt.Sprintf("pulling %s...", resp.Digest[7:19]),
) )
bar.Set64(resp.Completed) bar.Set(resp.Completed)
} else if resp.Digest == currentDigest && resp.Digest != "" { } else if resp.Digest == currentDigest && resp.Digest != "" {
bar.Set64(resp.Completed) bar.Set(resp.Completed)
} else { } else {
currentDigest = "" currentDigest = ""
fmt.Println(resp.Status) fmt.Println(resp.Status)
@@ -393,135 +387,70 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
type generateContextKey string type generateContextKey string
func generate(cmd *cobra.Command, model, prompt string) error { func generate(cmd *cobra.Command, model, prompt string) error {
client, err := api.FromEnv() if len(strings.TrimSpace(prompt)) > 0 {
if err != nil { client, err := api.FromEnv()
return err if err != nil {
} return err
spinner := NewSpinner("")
go spinner.Spin(60 * time.Millisecond)
var latest api.GenerateResponse
generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
if !ok {
generateContext = []int{}
}
var wrapTerm bool
termType := os.Getenv("TERM")
if termType == "xterm-256color" {
wrapTerm = true
}
termWidth, _, err := term.GetSize(int(0))
if err != nil {
wrapTerm = false
}
// override wrapping if the user turned it off
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
if nowrap {
wrapTerm = false
}
cancelCtx, cancel := context.WithCancel(context.Background())
defer cancel()
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT)
var abort bool
go func() {
<-sigChan
cancel()
abort = true
}()
var currentLineLength int
var wordBuffer string
request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
fn := func(response api.GenerateResponse) error {
if !spinner.IsFinished() {
spinner.Finish()
} }
latest = response spinner := NewSpinner("")
go spinner.Spin(60 * time.Millisecond)
if wrapTerm { var latest api.GenerateResponse
for _, ch := range response.Response {
if currentLineLength+1 > termWidth-5 {
// backtrack the length of the last word and clear to the end of the line
fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
fmt.Printf("%s%c", wordBuffer, ch)
currentLineLength = len(wordBuffer) + 1
} else {
fmt.Print(string(ch))
currentLineLength += 1
switch ch { generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
case ' ': if !ok {
wordBuffer = "" generateContext = []int{}
case '\n': }
currentLineLength = 0
default: request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
wordBuffer += string(ch) fn := func(response api.GenerateResponse) error {
} if !spinner.IsFinished() {
spinner.Finish()
}
latest = response
fmt.Print(response.Response)
return nil
}
if err := client.Generate(context.Background(), &request, fn); err != nil {
if strings.Contains(err.Error(), "failed to load model") {
// tell the user to check the server log, if it exists locally
home, nestedErr := os.UserHomeDir()
if nestedErr != nil {
// return the original error
return err
}
logPath := filepath.Join(home, ".ollama", "logs", "server.log")
if _, nestedErr := os.Stat(logPath); nestedErr == nil {
err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
} }
} }
} else { return err
fmt.Print(response.Response)
} }
return nil
}
if err := client.Generate(cancelCtx, &request, fn); err != nil {
if strings.Contains(err.Error(), "failed to load model") {
// tell the user to check the server log, if it exists locally
home, nestedErr := os.UserHomeDir()
if nestedErr != nil {
// return the original error
return err
}
logPath := filepath.Join(home, ".ollama", "logs", "server.log")
if _, nestedErr := os.Stat(logPath); nestedErr == nil {
err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
}
} else if strings.Contains(err.Error(), "context canceled") && abort {
spinner.Finish()
return nil
}
return err
}
if prompt != "" {
fmt.Println() fmt.Println()
fmt.Println() fmt.Println()
}
if !latest.Done { if !latest.Done {
if abort { return errors.New("unexpected end of response")
return nil
} }
return errors.New("unexpected end of response")
}
verbose, err := cmd.Flags().GetBool("verbose") verbose, err := cmd.Flags().GetBool("verbose")
if err != nil { if err != nil {
return err return err
} }
if verbose { if verbose {
latest.Summary() latest.Summary()
} }
ctx := cmd.Context() ctx := cmd.Context()
ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context) ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
cmd.SetContext(ctx) cmd.SetContext(ctx)
}
return nil return nil
} }
@@ -532,21 +461,19 @@ func generateInteractive(cmd *cobra.Command, model string) error {
return err return err
} }
// load the model
if err := generate(cmd, model, ""); err != nil {
return err
}
completer := readline.NewPrefixCompleter( completer := readline.NewPrefixCompleter(
readline.PcItem("/help"), readline.PcItem("/help"),
readline.PcItem("/list"), readline.PcItem("/list"),
readline.PcItem("/set", readline.PcItem("/set",
readline.PcItem("history"), readline.PcItem("history"),
readline.PcItem("nohistory"), readline.PcItem("nohistory"),
readline.PcItem("wordwrap"),
readline.PcItem("nowordwrap"),
readline.PcItem("verbose"), readline.PcItem("verbose"),
readline.PcItem("quiet"), readline.PcItem("quiet"),
readline.PcItem("mode",
readline.PcItem("vim"),
readline.PcItem("emacs"),
readline.PcItem("default"),
),
), ),
readline.PcItem("/show", readline.PcItem("/show",
readline.PcItem("license"), readline.PcItem("license"),
@@ -564,10 +491,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
fmt.Fprintln(os.Stderr, completer.Tree(" ")) fmt.Fprintln(os.Stderr, completer.Tree(" "))
} }
var painter Painter
config := readline.Config{ config := readline.Config{
Painter: &painter,
Prompt: ">>> ", Prompt: ">>> ",
HistoryFile: filepath.Join(home, ".ollama", "history"), HistoryFile: filepath.Join(home, ".ollama", "history"),
AutoComplete: completer, AutoComplete: completer,
@@ -589,7 +513,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
return nil return nil
case errors.Is(err, readline.ErrInterrupt): case errors.Is(err, readline.ErrInterrupt):
if line == "" { if line == "" {
fmt.Println("Use Ctrl-D or /bye to exit.") return nil
} }
continue continue
@@ -603,7 +527,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
case isMultiLine: case isMultiLine:
if strings.HasSuffix(line, `"""`) { if strings.HasSuffix(line, `"""`) {
isMultiLine = false isMultiLine = false
painter.IsMultiLine = isMultiLine
multiLineBuffer += strings.TrimSuffix(line, `"""`) multiLineBuffer += strings.TrimSuffix(line, `"""`)
line = multiLineBuffer line = multiLineBuffer
multiLineBuffer = "" multiLineBuffer = ""
@@ -614,7 +537,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
} }
case strings.HasPrefix(line, `"""`): case strings.HasPrefix(line, `"""`):
isMultiLine = true isMultiLine = true
painter.IsMultiLine = isMultiLine
multiLineBuffer = strings.TrimPrefix(line, `"""`) + " " multiLineBuffer = strings.TrimPrefix(line, `"""`) + " "
scanner.SetPrompt("... ") scanner.SetPrompt("... ")
continue continue
@@ -623,44 +545,45 @@ func generateInteractive(cmd *cobra.Command, model string) error {
if err := ListHandler(cmd, args[1:]); err != nil { if err := ListHandler(cmd, args[1:]); err != nil {
return err return err
} }
continue
case strings.HasPrefix(line, "/set"): case strings.HasPrefix(line, "/set"):
args := strings.Fields(line) args := strings.Fields(line)
if len(args) > 1 { if len(args) > 1 {
switch args[1] { switch args[1] {
case "history": case "history":
scanner.HistoryEnable() scanner.HistoryEnable()
continue
case "nohistory": case "nohistory":
scanner.HistoryDisable() scanner.HistoryDisable()
case "wordwrap": continue
cmd.Flags().Set("nowordwrap", "false")
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
cmd.Flags().Set("nowordwrap", "true")
fmt.Println("Set 'nowordwrap' mode.")
case "verbose": case "verbose":
cmd.Flags().Set("verbose", "true") cmd.Flags().Set("verbose", "true")
fmt.Println("Set 'verbose' mode.") continue
case "quiet": case "quiet":
cmd.Flags().Set("verbose", "false") cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.") continue
case "mode": case "mode":
if len(args) > 2 { if len(args) > 2 {
switch args[2] { switch args[2] {
case "vim": case "vim":
scanner.SetVimMode(true) scanner.SetVimMode(true)
continue
case "emacs", "default": case "emacs", "default":
scanner.SetVimMode(false) scanner.SetVimMode(false)
continue
default: default:
usage() usage()
continue
} }
} else { } else {
usage() usage()
continue
} }
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
} }
} else { } else {
usage() usage()
continue
} }
case strings.HasPrefix(line, "/show"): case strings.HasPrefix(line, "/show"):
args := strings.Fields(line) args := strings.Fields(line)
@@ -668,7 +591,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
resp, err := server.GetModelInfo(model) resp, err := server.GetModelInfo(model)
if err != nil { if err != nil {
fmt.Println("error: couldn't get model") fmt.Println("error: couldn't get model")
return err continue
} }
switch args[1] { switch args[1] {
@@ -683,24 +606,23 @@ func generateInteractive(cmd *cobra.Command, model string) error {
case "template": case "template":
fmt.Println(resp.Template) fmt.Println(resp.Template)
default: default:
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1]) fmt.Println("error: unknown command")
} }
continue
} else { } else {
usage() usage()
continue
} }
case line == "/help", line == "/?": case line == "/help", line == "/?":
usage() usage()
continue
case line == "/exit", line == "/bye": case line == "/exit", line == "/bye":
return nil return nil
case strings.HasPrefix(line, "/"):
args := strings.Fields(line)
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
} }
if len(line) > 0 && line[0] != '/' { if err := generate(cmd, model, line); err != nil {
if err := generate(cmd, model, line); err != nil { return err
return err
}
} }
} }
} }
@@ -719,19 +641,28 @@ func generateBatch(cmd *cobra.Command, model string) error {
} }
func RunServer(cmd *cobra.Command, _ []string) error { func RunServer(cmd *cobra.Command, _ []string) error {
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST")) host, port := "127.0.0.1", "11434"
if err != nil {
host, port = "127.0.0.1", "11434" parts := strings.Split(os.Getenv("OLLAMA_HOST"), ":")
if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil { if ip := net.ParseIP(parts[0]); ip != nil {
host = ip.String() host = ip.String()
}
} }
if err := initializeKeypair(); err != nil { if len(parts) > 1 {
port = parts[1]
}
// deprecated: include port in OLLAMA_HOST
if p := os.Getenv("OLLAMA_PORT"); p != "" {
port = p
}
err := initializeKeypair()
if err != nil {
return err return err
} }
ln, err := net.Listen("tcp", net.JoinHostPort(host, port)) ln, err := net.Listen("tcp", fmt.Sprintf("%s:%s", host, port))
if err != nil { if err != nil {
return err return err
} }
@@ -741,21 +672,6 @@ func RunServer(cmd *cobra.Command, _ []string) error {
origins = strings.Split(o, ",") origins = strings.Split(o, ",")
} }
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
if err := server.PruneLayers(); err != nil {
return err
}
manifestsPath, err := server.GetManifestPath()
if err != nil {
return err
}
if err := server.PruneDirectory(manifestsPath); err != nil {
return err
}
}
return server.Serve(ln, origins) return server.Serve(ln, origins)
} }
@@ -781,7 +697,7 @@ func initializeKeypair() error {
return err return err
} }
err = os.MkdirAll(filepath.Dir(privKeyPath), 0o755) err = os.MkdirAll(path.Dir(privKeyPath), 0o700)
if err != nil { if err != nil {
return fmt.Errorf("could not create directory %w", err) return fmt.Errorf("could not create directory %w", err)
} }
@@ -909,7 +825,6 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("verbose", false, "Show timings for response") runCmd.Flags().Bool("verbose", false, "Show timings for response")
runCmd.Flags().Bool("insecure", false, "Use an insecure registry") runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
serveCmd := &cobra.Command{ serveCmd := &cobra.Command{
Use: "serve", Use: "serve",


@@ -3,33 +3,26 @@
## Endpoints ## Endpoints
- [Generate a completion](#generate-a-completion) - [Generate a completion](#generate-a-completion)
- [Create a Model](#create-a-model) - [Create a model](#create-a-model)
- [List Local Models](#list-local-models) - [List local models](#list-local-models)
- [Show Model Information](#show-model-information) - [Copy a model](#copy-a-model)
- [Copy a Model](#copy-a-model) - [Delete a model](#delete-a-model)
- [Delete a Model](#delete-a-model) - [Pull a model](#pull-a-model)
- [Pull a Model](#pull-a-model) - [Generate embeddings](#generate-embeddings)
- [Push a Model](#push-a-model)
- [Generate Embeddings](#generate-embeddings)
## Conventions ## Conventions
### Model names ### Model names
Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version. Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and if not provided will default to `latest`. The tag is used to identify a specific version.
### Durations ### Durations
All durations are returned in nanoseconds. All durations are returned in nanoseconds.
### Streaming responses
Certain endpoints stream responses as JSON objects delineated with the newline (`\n`) character.
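As an illustrative aside (not part of this compare), a minimal Python sketch of consuming such a newline-delimited JSON stream from the generate endpoint, assuming a server on the default local port:

```python
import json
import requests

# Each line of the streamed body is one JSON object; the final one has "done": true.
with requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "llama2:7b", "prompt": "Why is the sky blue?"},
    stream=True,
) as response:
    response.raise_for_status()
    for line in response.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        print(chunk.get("response", ""), end="", flush=True)
        if chunk.get("done"):
            break
```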
## Generate a completion ## Generate a completion
```shell ```
POST /api/generate POST /api/generate
``` ```
@@ -49,7 +42,7 @@ Advanced parameters:
### Request ### Request
```shell ```
curl -X POST http://localhost:11434/api/generate -d '{ curl -X POST http://localhost:11434/api/generate -d '{
"model": "llama2:7b", "model": "llama2:7b",
"prompt": "Why is the sky blue?" "prompt": "Why is the sky blue?"
@@ -102,7 +95,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
## Create a Model ## Create a Model
```shell ```
POST /api/create POST /api/create
``` ```
@@ -115,7 +108,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
### Request ### Request
```shell ```
curl -X POST http://localhost:11434/api/create -d '{ curl -X POST http://localhost:11434/api/create -d '{
"name": "mario", "name": "mario",
"path": "~/Modelfile" "path": "~/Modelfile"
@@ -124,7 +117,7 @@ curl -X POST http://localhost:11434/api/create -d '{
### Response ### Response
A stream of JSON objects. When finished, `status` is `success`. A stream of JSON objects. When finished, `status` is `success`
```json ```json
{ {
@@ -134,7 +127,7 @@ A stream of JSON objects. When finished, `status` is `success`.
## List Local Models ## List Local Models
```shell ```
GET /api/tags GET /api/tags
``` ```
@@ -142,7 +135,7 @@ List models that are available locally.
### Request ### Request
```shell ```
curl http://localhost:11434/api/tags curl http://localhost:11434/api/tags
``` ```
@@ -165,40 +158,9 @@ curl http://localhost:11434/api/tags
} }
``` ```
## Show Model Information
```shell
POST /api/show
```
Show details about a model including modelfile, template, parameters, license, and system prompt.
### Parameters
- `name`: name of the model to show
### Request
```shell
curl http://localhost:11434/api/show -d '{
"name": "llama2:7b"
}'
```
### Response
```json
{
"license": "<contents of license block>",
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
"parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
"template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
}
```
## Copy a Model ## Copy a Model
```shell ```
POST /api/copy POST /api/copy
``` ```
@@ -206,7 +168,7 @@ Copy a model. Creates a model with another name from an existing model.
### Request ### Request
```shell ```
curl http://localhost:11434/api/copy -d '{ curl http://localhost:11434/api/copy -d '{
"source": "llama2:7b", "source": "llama2:7b",
"destination": "llama2-backup" "destination": "llama2-backup"
@@ -215,7 +177,7 @@ curl http://localhost:11434/api/copy -d '{
## Delete a Model ## Delete a Model
```shell ```
DELETE /api/delete DELETE /api/delete
``` ```
@@ -227,7 +189,7 @@ Delete a model and its data.
### Request ### Request
```shell ```
curl -X DELETE http://localhost:11434/api/delete -d '{ curl -X DELETE http://localhost:11434/api/delete -d '{
"name": "llama2:13b" "name": "llama2:13b"
}' }'
@@ -235,20 +197,19 @@ curl -X DELETE http://localhost:11434/api/delete -d '{
## Pull a Model ## Pull a Model
```shell ```
POST /api/pull POST /api/pull
``` ```
Download a model from the ollama library. Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress. Download a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple calls to will share the same download progress.
### Parameters ### Parameters
- `name`: name of the model to pull - `name`: name of the model to pull
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
### Request ### Request
```shell ```
curl -X POST http://localhost:11434/api/pull -d '{ curl -X POST http://localhost:11434/api/pull -d '{
"name": "llama2:7b" "name": "llama2:7b"
}' }'
@@ -264,63 +225,9 @@ curl -X POST http://localhost:11434/api/pull -d '{
} }
``` ```
## Push a Model
```shell
POST /api/push
```
Upload a model to a model library. Requires registering for ollama.ai and adding a public key first.
### Parameters
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
### Request
```shell
curl -X POST http://localhost:11434/api/push -d '{
"name": "mattw/pygmalion:latest"
}'
```
### Response
Streaming response that starts with:
```json
{"status":"retrieving manifest"}
```
and then:
```json
{
"status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
"total":1928429856
}
```
Then there is a series of uploading responses:
```json
{
"status":"starting upload",
"digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
"total":1928429856}
```
Finally, when the upload is complete:
```json
{"status":"pushing manifest"}
{"status":"success"}
```
## Generate Embeddings ## Generate Embeddings
```shell ```
POST /api/embeddings POST /api/embeddings
``` ```
@@ -337,7 +244,7 @@ Advanced parameters:
### Request ### Request
```shell ```
curl -X POST http://localhost:11434/api/embeddings -d '{ curl -X POST http://localhost:11434/api/embeddings -d '{
"model": "llama2:7b", "model": "llama2:7b",
"prompt": "Here is an article about llamas..." "prompt": "Here is an article about llamas..."
@@ -352,4 +259,5 @@ curl -X POST http://localhost:11434/api/embeddings -d '{
0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
] ]
}``` }
```


@@ -6,10 +6,6 @@
Install required tools: Install required tools:
- cmake version 3.24 or higher
- go version 1.20 or higher
- gcc version 11.4.0 or higher
``` ```
brew install go cmake gcc brew install go cmake gcc
``` ```
@@ -31,9 +27,3 @@ Now you can run `ollama`:
``` ```
./ollama ./ollama
``` ```
## Building on Linux with GPU support
- Install cmake and nvidia-cuda-toolkit
- run `go generate ./...`
- run `go build .`


@@ -14,6 +14,4 @@ OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
 ## Where are models stored?
-* macOS: Raw model data is stored under `~/.ollama/models`.
-* Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`
+Raw model data is stored under `~/.ollama/models`.


@@ -1,83 +0,0 @@
# Installing Ollama on Linux
> Note: A one line installer for Ollama is available by running:
>
> ```
> curl https://ollama.ai/install.sh | sh
> ```
## Download the `ollama` binary
Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
```
sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
```
## Start Ollama
Start Ollama by running `ollama serve`:
```
ollama serve
```
Once Ollama is running, run a model in another terminal session:
```
ollama run llama2
```
## Install CUDA drivers (optional for Nvidia GPUs)
[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
Verify that the drivers are installed by running the following command, which should print details about your GPU:
```
nvidia-smi
```
## Adding Ollama as a startup service (optional)
Create a user for Ollama:
```
sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
```
Create a service file in `/etc/systemd/system/ollama.service`:
```ini
[Unit]
Description=Ollama Service
After=network-online.target
[Service]
ExecStart=/usr/bin/ollama serve
User=ollama
Group=ollama
Restart=always
RestartSec=3
Environment="HOME=/usr/share/ollama"
[Install]
WantedBy=default.target
```
Then start the service:
```
sudo systemctl daemon-reload
sudo systemctl enable ollama
```
### Viewing logs
To view logs of Ollama running as a startup service, run:
```
journalctl -u ollama
```


@@ -94,7 +94,6 @@ This bin file location should be specified as an absolute path or relative to th
### EMBED ### EMBED
The EMBED instruction is used to add embeddings of files to a model. This is useful for adding custom data that the model can reference when generating an answer. Note that currently only text files are supported, formatted with each line as one embedding. The EMBED instruction is used to add embeddings of files to a model. This is useful for adding custom data that the model can reference when generating an answer. Note that currently only text files are supported, formatted with each line as one embedding.
``` ```
FROM <model name>:<tag> FROM <model name>:<tag>
EMBED <file path>.txt EMBED <file path>.txt
@@ -119,14 +118,13 @@ PARAMETER <parameter> <parametervalue>
| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 | | mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 | | num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
| num_gqa | The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for llama2:70b | int | num_gqa 1 | | num_gqa | The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for llama2:70b | int | num_gqa 1 |
| num_gpu | The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable. | int | num_gpu 50 | | num_gpu | The number of GPUs to use. On macOS it defaults to 1 to enable metal support, 0 to disable. | int | num_gpu 1 |
| num_thread | Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). | int | num_thread 8 | | num_thread | Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). | int | num_thread 8 |
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 | | repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 | | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 | | temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
| stop | Sets the stop sequences to use. | string | stop "AI assistant:" | | stop | Sets the stop sequences to use. | string | stop "AI assistant:" |
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 | | tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 |
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 | | top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 | | top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |
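As a brief aside (not part of this compare), these Modelfile parameters can generally also be supplied per request through the `options` field of the generate API, which the example Python client earlier in this view exposes directly. A sketch with arbitrary illustrative values, again assuming that client is saved as `client.py`:

```python
# Sketch only: per-request overrides mirroring the parameter table above.
import client  # the example client shown earlier in this compare (assumed filename)

text, _ = client.generate(
    "llama2:7b",
    "Write a haiku about llamas.",
    options={
        "temperature": 0.7,     # higher values answer more creatively (see table)
        "num_ctx": 4096,        # context window size
        "repeat_penalty": 1.1,  # how strongly to penalize repetition
    },
)
```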

go.mod (2 lines changed)

@@ -8,7 +8,6 @@ require (
github.com/mattn/go-runewidth v0.0.14 github.com/mattn/go-runewidth v0.0.14
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
github.com/olekukonko/tablewriter v0.0.5 github.com/olekukonko/tablewriter v0.0.5
github.com/pdevine/readline v1.5.2
github.com/spf13/cobra v1.7.0 github.com/spf13/cobra v1.7.0
) )
@@ -17,6 +16,7 @@ require github.com/rivo/uniseg v0.2.0 // indirect
require ( require (
github.com/bytedance/sonic v1.9.1 // indirect github.com/bytedance/sonic v1.9.1 // indirect
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
github.com/chzyer/readline v1.5.1
github.com/gabriel-vasile/mimetype v1.4.2 // indirect github.com/gabriel-vasile/mimetype v1.4.2 // indirect
github.com/gin-contrib/cors v1.4.0 github.com/gin-contrib/cors v1.4.0
github.com/gin-contrib/sse v0.1.0 // indirect github.com/gin-contrib/sse v0.1.0 // indirect

go.sum

@@ -6,6 +6,8 @@ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhD
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk= github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM= github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM=
github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ= github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ=
github.com/chzyer/readline v1.5.1 h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=
github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk=
github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04= github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8= github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
@@ -78,8 +80,6 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
github.com/pdevine/readline v1.5.2 h1:oz6Y5GdTmhPG+08hhxcAvtHitSANWuA2100Sppb38xI=
github.com/pdevine/readline v1.5.2/go.mod h1:na/LbuE5PYwxI7GyopWdIs3U8HVe89lYlNTFTXH3wOw=
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo= github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ= github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4= github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
@@ -120,6 +120,7 @@ golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=


@@ -1,22 +0,0 @@
package llm
const ModelFamilyFalcon = "falcon"
const (
falconModelType7B = 32
falconModelType40B = 60
falconModelType180B = 80
)
func falconModelType(numLayer uint32) string {
switch numLayer {
case 32:
return "7B"
case 60:
return "40B"
case 80:
return "180B"
default:
return "Unknown"
}
}


@@ -3,96 +3,72 @@ package llm
import ( import (
"encoding/binary" "encoding/binary"
"errors" "errors"
"fmt"
"io" "io"
) )
type ModelFamily string
type ModelType uint32
const (
ModelType3B ModelType = 26
ModelType7B ModelType = 32
ModelType13B ModelType = 40
ModelType34B ModelType = 48
ModelType30B ModelType = 60
ModelType65B ModelType = 80
)
func (mt ModelType) String() string {
switch mt {
case ModelType3B:
return "3B"
case ModelType7B:
return "7B"
case ModelType13B:
return "13B"
case ModelType34B:
return "34B"
case ModelType30B:
return "30B"
case ModelType65B:
return "65B"
default:
return "Unknown"
}
}
type FileType interface {
String() string
}
type GGML struct { type GGML struct {
magic uint32 magic uint32
container container
model model
} }
const (
fileTypeF32 uint32 = iota
fileTypeF16
fileTypeQ4_0
fileTypeQ4_1
fileTypeQ4_1_F16
fileTypeQ8_0 uint32 = iota + 2
fileTypeQ5_0
fileTypeQ5_1
fileTypeQ2_K
fileTypeQ3_K_S
fileTypeQ3_K_M
fileTypeQ3_K_L
fileTypeQ4_K_S
fileTypeQ4_K_M
fileTypeQ5_K_S
fileTypeQ5_K_M
fileTypeQ6_K
)
func fileType(fileType uint32) string {
switch fileType {
case fileTypeF32:
return "F32"
case fileTypeF16:
return "F16"
case fileTypeQ4_0:
return "Q4_0"
case fileTypeQ4_1:
return "Q4_1"
case fileTypeQ4_1_F16:
return "Q4_1_F16"
case fileTypeQ8_0:
return "Q8_0"
case fileTypeQ5_0:
return "Q5_0"
case fileTypeQ5_1:
return "Q5_1"
case fileTypeQ2_K:
return "Q2_K"
case fileTypeQ3_K_S:
return "Q3_K_S"
case fileTypeQ3_K_M:
return "Q3_K_M"
case fileTypeQ3_K_L:
return "Q3_K_L"
case fileTypeQ4_K_S:
return "Q4_K_S"
case fileTypeQ4_K_M:
return "Q4_K_M"
case fileTypeQ5_K_S:
return "Q5_K_S"
case fileTypeQ5_K_M:
return "Q5_K_M"
case fileTypeQ6_K:
return "Q6_K"
default:
return "Unknown"
}
}
type model interface { type model interface {
ModelFamily() string ModelFamily() ModelFamily
ModelType() string ModelType() ModelType
FileType() string FileType() FileType
NumLayers() int64
} }
type container interface { type container interface {
Name() string Name() string
Decode(io.Reader) (model, error) Decode(io.Reader) error
} }
type containerGGML struct{} type containerGGML struct {
}
func (c *containerGGML) Name() string { func (c *containerGGML) Name() string {
return "ggml" return "ggml"
} }
func (c *containerGGML) Decode(r io.Reader) (model, error) { func (c *containerGGML) Decode(r io.Reader) error {
return nil, nil return nil
} }
type containerGGMF struct { type containerGGMF struct {
@@ -103,18 +79,18 @@ func (c *containerGGMF) Name() string {
return "ggmf" return "ggmf"
} }
func (c *containerGGMF) Decode(r io.Reader) (model, error) { func (c *containerGGMF) Decode(r io.Reader) error {
var version uint32 var version uint32
binary.Read(r, binary.LittleEndian, &version) binary.Read(r, binary.LittleEndian, &version)
switch version { switch version {
case 1: case 1:
default: default:
return nil, errors.New("invalid version") return errors.New("invalid version")
} }
c.version = version c.version = version
return nil, nil return nil
} }
type containerGGJT struct { type containerGGJT struct {
@@ -125,22 +101,18 @@ func (c *containerGGJT) Name() string {
return "ggjt" return "ggjt"
} }
func (c *containerGGJT) Decode(r io.Reader) (model, error) { func (c *containerGGJT) Decode(r io.Reader) error {
var version uint32 var version uint32
binary.Read(r, binary.LittleEndian, &version) binary.Read(r, binary.LittleEndian, &version)
switch version { switch version {
case 1, 2, 3: case 1, 2, 3:
default: default:
return nil, errors.New("invalid version") return errors.New("invalid version")
} }
c.version = version c.version = version
return nil
// different model types may have different layouts for hyperparameters
var llama llamaModel
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
return &llama, nil
} }
type containerLORA struct { type containerLORA struct {
@@ -151,34 +123,32 @@ func (c *containerLORA) Name() string {
return "ggla" return "ggla"
} }
func (c *containerLORA) Decode(r io.Reader) (model, error) { func (c *containerLORA) Decode(r io.Reader) error {
var version uint32 var version uint32
binary.Read(r, binary.LittleEndian, &version) binary.Read(r, binary.LittleEndian, &version)
switch version { switch version {
case 1: case 1:
default: default:
return nil, errors.New("invalid version") return errors.New("invalid version")
} }
c.version = version c.version = version
return nil, nil return nil
} }
const ( const (
// Magic constant for `ggml` files (unversioned). // / Magic constant for `ggml` files (unversioned).
FILE_MAGIC_GGML = 0x67676d6c FILE_MAGIC_GGML = 0x67676d6c
// Magic constant for `ggml` files (versioned, ggmf). // / Magic constant for `ggml` files (versioned, ggmf).
FILE_MAGIC_GGMF = 0x67676d66 FILE_MAGIC_GGMF = 0x67676d66
// Magic constant for `ggml` files (versioned, ggjt). // / Magic constant for `ggml` files (versioned, ggjt).
FILE_MAGIC_GGJT = 0x67676a74 FILE_MAGIC_GGJT = 0x67676a74
// Magic constant for `ggla` files (LoRA adapter). // / Magic constant for `ggla` files (LoRA adapter).
FILE_MAGIC_GGLA = 0x67676C61 FILE_MAGIC_GGLA = 0x67676C61
// Magic constant for `gguf` files (versioned, gguf)
FILE_MAGIC_GGUF = 0x46554747
) )
func DecodeGGML(r io.ReadSeeker) (*GGML, error) { func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
var ggml GGML var ggml GGML
binary.Read(r, binary.LittleEndian, &ggml.magic) binary.Read(r, binary.LittleEndian, &ggml.magic)
@@ -191,18 +161,24 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
ggml.container = &containerGGJT{} ggml.container = &containerGGJT{}
case FILE_MAGIC_GGLA: case FILE_MAGIC_GGLA:
ggml.container = &containerLORA{} ggml.container = &containerLORA{}
case FILE_MAGIC_GGUF:
ggml.container = &containerGGUF{}
default: default:
return nil, errors.New("invalid file magic") return nil, errors.New("invalid file magic")
} }
model, err := ggml.Decode(r) if err := ggml.Decode(r); err != nil {
if err != nil {
return nil, err return nil, err
} }
ggml.model = model // different model types may have different layouts for hyperparameters
switch hint {
case ModelFamilyLlama:
var llama llamaModel
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
ggml.model = &llama
// TODO: sanity check hyperparameters
default:
return nil, fmt.Errorf("unsupported model type: %s", hint)
}
// final model type // final model type
return &ggml, nil return &ggml, nil


@@ -20,143 +20,127 @@ import (
"runtime" "runtime"
"strconv" "strconv"
"strings" "strings"
"sync"
"time" "time"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
) )
//go:embed llama.cpp/*/build/*/bin/* const ModelFamilyLlama ModelFamily = "llama"
//go:embed llama.cpp/ggml/build/*/bin/*
var llamaCppEmbed embed.FS var llamaCppEmbed embed.FS
var (
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
)
var (
ggmlInit sync.Once
ggmlRunnerPath string
)
func osPath(llamaPath string) string {
if runtime.GOOS == "windows" {
return path.Join(llamaPath, "Release")
}
return llamaPath
}
func initGGML() {
ggmlInit.Do(func() {
tmpDir, err := os.MkdirTemp("", "llama-*")
if err != nil {
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
}
llamaPath := osPath(ggmlGPU)
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
llamaPath = osPath(ggmlCPU)
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
log.Fatalf("llama.cpp executable not found")
}
}
files := []string{"server"}
switch runtime.GOOS {
case "windows":
files = []string{"server.exe"}
case "darwin":
if llamaPath == osPath(ggmlGPU) {
files = append(files, "ggml-metal.metal")
}
}
for _, f := range files {
srcPath := path.Join(llamaPath, f)
destPath := filepath.Join(tmpDir, f)
srcFile, err := llamaCppEmbed.Open(srcPath)
if err != nil {
log.Fatalf("read llama.cpp %s: %v", f, err)
}
defer srcFile.Close()
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
log.Fatalf("write llama.cpp %s: %v", f, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
log.Fatalf("copy llama.cpp %s: %v", f, err)
}
}
ggmlRunnerPath = filepath.Join(tmpDir, "server")
if runtime.GOOS == "windows" {
ggmlRunnerPath = filepath.Join(tmpDir, "server.exe")
}
})
}
type ModelRunner struct { type ModelRunner struct {
Path string // path to the model runner executable Path string // path to the model runner executable
} }
func chooseRunners(workDir, runnerType string) []ModelRunner { func ggmlRunner() ModelRunner {
buildPath := path.Join("llama.cpp", runnerType, "build") initGGML()
var runners []string return ModelRunner{Path: ggmlRunnerPath}
// set the runners based on the OS
// IMPORTANT: the order of the runners in the array is the priority order
switch runtime.GOOS {
case "darwin":
runners = []string{
path.Join(buildPath, "metal", "bin", "server"),
path.Join(buildPath, "cpu", "bin", "server"),
}
case "linux":
runners = []string{
path.Join(buildPath, "cuda", "bin", "server"),
path.Join(buildPath, "cpu", "bin", "server"),
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []string{
path.Join(buildPath, "cpu", "bin", "Release", "server.exe"),
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []string{
path.Join(buildPath, "cpu", "bin", "server"),
}
}
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
for _, r := range runners {
// find all the files in the runner's bin directory
files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r), "*"))
if err != nil {
// this is expected, ollama may be compiled without all runners packed in
log.Printf("%s runner not found: %v", r, err)
continue
}
for _, f := range files {
runnerAvailable = true
srcFile, err := llamaCppEmbed.Open(f)
if err != nil {
log.Fatalf("read llama runner %s: %v", f, err)
}
defer srcFile.Close()
// create the directory in case it does not exist, filepath.Dir() converts the file path to the OS's format
destPath := filepath.Join(workDir, filepath.Dir(f))
if err := os.MkdirAll(destPath, 0o755); err != nil {
log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
}
// create the path to the destination file, filepath.Base() converts the file path to the OS's format
destFile := filepath.Join(destPath, filepath.Base(f))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
log.Fatalf("write llama runner %s: %v", f, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
log.Fatalf("copy llama runner %s: %v", f, err)
}
case err != nil:
log.Fatalf("stat llama runner %s: %v", f, err)
}
}
}
if !runnerAvailable {
log.Fatalf("%s runner not found", runnerType)
}
// return the runners to try in priority order
localRunnersByPriority := []ModelRunner{}
for _, r := range runners {
// clean the ModelRunner paths so that they match the OS we are running on
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: filepath.Clean(path.Join(workDir, r))})
}
return localRunnersByPriority
} }
type llamaModel struct { type llamaModel struct {
hyperparameters llamaHyperparameters hyperparameters llamaHyperparameters
} }
func (llm *llamaModel) ModelFamily() string { func (llm *llamaModel) ModelFamily() ModelFamily {
return "llama" return ModelFamilyLlama
} }
func llamaModelType(numLayer uint32) string { func (llm *llamaModel) ModelType() ModelType {
switch numLayer { switch llm.hyperparameters.NumLayer {
case 26: case 26:
return "3B" return ModelType3B
case 32: case 32:
return "7B" return ModelType7B
case 40: case 40:
return "13B" return ModelType13B
case 48: case 48:
return "34B" return ModelType34B
case 60: case 60:
return "30B" return ModelType30B
case 80: case 80:
return "65B" return ModelType65B
default:
return "Unknown"
} }
// TODO: find a better default
return ModelType7B
} }
func (llm *llamaModel) ModelType() string { func (llm *llamaModel) FileType() FileType {
return llamaModelType(llm.hyperparameters.NumLayer) return llm.hyperparameters.FileType
}
func (llm *llamaModel) FileType() string {
return fileType(llm.hyperparameters.FileType)
}
func (llm *llamaModel) NumLayers() int64 {
return int64(llm.hyperparameters.NumLayer)
} }
type llamaHyperparameters struct { type llamaHyperparameters struct {
@@ -173,7 +157,70 @@ type llamaHyperparameters struct {
NumRot uint32 NumRot uint32
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc. // FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
FileType uint32 FileType llamaFileType
}
type llamaFileType uint32
const (
llamaFileTypeF32 llamaFileType = iota
llamaFileTypeF16
llamaFileTypeQ4_0
llamaFileTypeQ4_1
llamaFileTypeQ4_1_F16
llamaFileTypeQ8_0 llamaFileType = iota + 2
llamaFileTypeQ5_0
llamaFileTypeQ5_1
llamaFileTypeQ2_K
llamaFileTypeQ3_K_S
llamaFileTypeQ3_K_M
llamaFileTypeQ3_K_L
llamaFileTypeQ4_K_S
llamaFileTypeQ4_K_M
llamaFileTypeQ5_K_S
llamaFileTypeQ5_K_M
llamaFileTypeQ6_K
)
func (ft llamaFileType) String() string {
switch ft {
case llamaFileTypeF32:
return "F32"
case llamaFileTypeF16:
return "F16"
case llamaFileTypeQ4_0:
return "Q4_0"
case llamaFileTypeQ4_1:
return "Q4_1"
case llamaFileTypeQ4_1_F16:
return "Q4_1_F16"
case llamaFileTypeQ8_0:
return "Q8_0"
case llamaFileTypeQ5_0:
return "Q5_0"
case llamaFileTypeQ5_1:
return "Q5_1"
case llamaFileTypeQ2_K:
return "Q2_K"
case llamaFileTypeQ3_K_S:
return "Q3_K_S"
case llamaFileTypeQ3_K_M:
return "Q3_K_M"
case llamaFileTypeQ3_K_L:
return "Q3_K_L"
case llamaFileTypeQ4_K_S:
return "Q4_K_S"
case llamaFileTypeQ4_K_M:
return "Q4_K_M"
case llamaFileTypeQ5_K_S:
return "Q5_K_S"
case llamaFileTypeQ5_K_M:
return "Q5_K_M"
case llamaFileTypeQ6_K:
return "Q6_K"
default:
return "Unknown"
}
} }
type Running struct { type Running struct {
@@ -187,66 +234,12 @@ type llama struct {
Running Running
} }
var errNoGPU = errors.New("nvidia-smi command failed") func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
if _, err := os.Stat(model); err != nil {
// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs return nil, err
func CheckVRAM() (int64, error) {
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoGPU
} }
var total int64 if _, err := os.Stat(runner.Path); err != nil {
scanner := bufio.NewScanner(&stdout)
for scanner.Scan() {
line := scanner.Text()
vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
total += vram
}
return total, nil
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
n := 1 // default to enable metal on macOS
if runtime.GOOS == "linux" {
vramMib, err := CheckVRAM()
if err != nil {
if err.Error() != "nvidia-smi command failed" {
log.Print(err.Error())
}
// nvidia driver not installed or no nvidia GPU found
return 0
}
totalVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
// Calculate bytes per layer
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
bytesPerLayer := fileSizeBytes / numLayer
// set n to the max number of layers we can fit in VRAM
return int(totalVramBytes / bytesPerLayer)
log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)
}
// default to enable metal on macOS
return 1
}
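
The heuristic above is plain arithmetic: total VRAM divided by an estimated per-layer size (file size over layer count) gives the number of layers to offload. A standalone sketch of the same calculation with made-up numbers, plus a clamp (not in the original) so the sketch never reports more layers than the model has:

```go
package main

import "fmt"

// estimateGPULayers mirrors the heuristic above: divide total VRAM by the
// approximate size of one layer (file size / layer count). Inputs are
// illustrative, not measured values.
func estimateGPULayers(vramMiB, fileSizeBytes, numLayers int64) int64 {
	totalVRAMBytes := vramMiB * 1024 * 1024
	bytesPerLayer := fileSizeBytes / numLayers
	layers := totalVRAMBytes / bytesPerLayer
	if layers > numLayers {
		layers = numLayers // never offload more layers than the model has
	}
	return layers
}

func main() {
	// e.g. an 8 GiB GPU and a ~3.8 GB 7B Q4_0 model with 32 layers
	fmt.Println(estimateGPULayers(8192, 3_800_000_000, 32)) // 32 (everything fits)
	// the same card with a ~24 GB 70B model and 80 layers
	fmt.Println(estimateGPULayers(8192, 24_000_000_000, 80)) // 28
}
```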
func newLlama(model string, adapters []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
fileInfo, err := os.Stat(model)
if err != nil {
return nil, err return nil, err
} }
@@ -257,17 +250,14 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
params := []string{ params := []string{
"--model", model, "--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx), "--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--gqa", fmt.Sprintf("%d", opts.NumGQA),
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase), "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale), "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)), "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
"--embedding", "--embedding",
} }
if opts.NumGQA > 0 {
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
}
if len(adapters) > 0 { if len(adapters) > 0 {
// TODO: applying multiple adapters is not supported by the llama.cpp server yet // TODO: applying multiple adapters is not supported by the llama.cpp server yet
params = append(params, "--lora", adapters[0]) params = append(params, "--lora", adapters[0])
@@ -291,12 +281,7 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
} }
// start the llama.cpp server with a retry in case the port is already in use // start the llama.cpp server with a retry in case the port is already in use
for _, runner := range runners { for try := 0; try < 3; try++ {
if _, err := os.Stat(runner.Path); err != nil {
log.Printf("llama runner not found: %v", err)
continue
}
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext( cmd := exec.CommandContext(
@@ -304,70 +289,67 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
runner.Path, runner.Path,
append(params, "--port", strconv.Itoa(port))..., append(params, "--port", strconv.Itoa(port))...,
) )
cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
cmd.Stdout = os.Stderr cmd.Stdout = os.Stderr
cmd.Stderr = os.Stderr cmd.Stderr = os.Stderr
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}} llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
log.Print("starting llama runner")
if err := llm.Cmd.Start(); err != nil {
log.Printf("error starting the external llama runner: %v", err)
continue
}
// monitor the command, it is blocking, so if it exits we need to capture that
go func() {
err := llm.Cmd.Wait() // this will block until the command exits
if err != nil {
log.Printf("llama runner exited with error: %v", err)
} else {
log.Printf("llama runner exited")
}
}()
if err := waitForServer(llm); err != nil { if err := waitForServer(llm); err != nil {
log.Printf("error starting llama runner: %v", err) log.Printf("error starting llama.cpp server: %v", err)
llm.Close() llm.Close()
// try again // try again
continue continue
} }
// server started successfully // server started successfully
return llm, nil return llm, nil
} }
return nil, fmt.Errorf("failed to start a llama runner") return nil, fmt.Errorf("max retry exceeded starting llama.cpp")
} }
func waitForServer(llm *llama) error { func waitForServer(llm *llama) error {
// wait for the server to start responding log.Print("starting llama.cpp server")
start := time.Now() var stderr bytes.Buffer
expiresAt := time.Now().Add(2 * time.Minute) // be generous with timeout, large models can take a while to load llm.Cmd.Stderr = &stderr
ticker := time.NewTicker(200 * time.Millisecond) err := llm.Cmd.Start()
if err != nil {
log.Print("waiting for llama runner to start responding") return fmt.Errorf("error starting the external llama.cpp server: %w", err)
for range ticker.C {
if time.Now().After(expiresAt) {
return fmt.Errorf("llama runner did not start within alloted time, retrying")
}
// check if the server process has terminated
if llm.Cmd.ProcessState != nil && llm.Cmd.ProcessState.Exited() {
return fmt.Errorf("llama runner process has terminated")
}
if err := llm.Ping(context.Background()); err == nil {
break
}
} }
log.Printf("llama runner started in %f seconds", time.Since(start).Seconds()) exitChan := make(chan error, 1)
return nil
// the server is a long running process, watch for it exiting to keep track of something going wrong
go func() {
err := llm.Cmd.Wait()
log.Print(stderr.String())
exitChan <- err
}()
// wait for the server to start responding
start := time.Now()
expiresAt := time.Now().Add(30 * time.Second)
ticker := time.NewTicker(100 * time.Millisecond)
log.Print("waiting for llama.cpp server to start responding")
for {
select {
case <-ticker.C:
if time.Now().After(expiresAt) {
return fmt.Errorf("llama.cpp server did not start responding within 30 seconds, retrying")
}
if err := llm.Ping(context.Background()); err == nil {
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
return nil
}
case err := <-exitChan:
return fmt.Errorf("llama.cpp server exited unexpectedly: %w", err)
}
}
} }
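
Both versions of the startup path reduce to the same pattern: start the subprocess, then race a periodic health check against the process exiting and an overall deadline. A stripped-down sketch of that pattern (the command, URL, interval and timeout are illustrative):

```go
package main

import (
	"fmt"
	"net/http"
	"os/exec"
	"time"
)

// waitReady polls an HTTP endpoint until it answers, the command exits,
// or the deadline passes — the same three outcomes handled above.
func waitReady(cmd *exec.Cmd, url string, timeout time.Duration) error {
	exitCh := make(chan error, 1)
	go func() { exitCh <- cmd.Wait() }()

	deadline := time.After(timeout)
	ticker := time.NewTicker(200 * time.Millisecond)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			if resp, err := http.Head(url); err == nil {
				resp.Body.Close()
				return nil // server is responding
			}
		case err := <-exitCh:
			return fmt.Errorf("server exited before becoming ready: %w", err)
		case <-deadline:
			return fmt.Errorf("server did not respond within %s", timeout)
		}
	}
}

func main() {
	cmd := exec.Command("./server", "--port", "8080") // illustrative command
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	if err := waitReady(cmd, "http://127.0.0.1:8080", 30*time.Second); err != nil {
		panic(err)
	}
	fmt.Println("ready")
}
```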
func (llm *llama) Close() { func (llm *llama) Close() {
llm.Cancel() llm.Running.Cmd.Cancel()
} }
func (llm *llama) SetOptions(opts api.Options) { func (llm *llama) SetOptions(opts api.Options) {
@@ -694,7 +676,7 @@ func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error
// Ping checks that the server subprocess is still running and responding to requests // Ping checks that the server subprocess is still running and responding to requests
func (llm *llama) Ping(ctx context.Context) error { func (llm *llama) Ping(ctx context.Context) error {
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port)) resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Running.Port))
if err != nil { if err != nil {
return fmt.Errorf("ping resp: %w", err) return fmt.Errorf("ping resp: %w", err)
} }


@@ -1,379 +0,0 @@
package llm
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
)
type containerGGUF struct {
Version uint32
V1 struct {
NumTensor uint32
NumKV uint32
}
V2 struct {
NumTensor uint64
NumKV uint64
}
}
func (c *containerGGUF) Name() string {
return "gguf"
}
func (c *containerGGUF) Decode(r io.Reader) (model, error) {
binary.Read(r, binary.LittleEndian, &c.Version)
switch c.Version {
case 1:
binary.Read(r, binary.LittleEndian, &c.V1)
case 2:
binary.Read(r, binary.LittleEndian, &c.V2)
default:
return nil, errors.New("invalid version")
}
model := newGGUFModel(c)
if err := model.Decode(r); err != nil {
return nil, err
}
return model, nil
}
const (
ggufTypeUint8 uint32 = iota
ggufTypeInt8
ggufTypeUint16
ggufTypeInt16
ggufTypeUint32
ggufTypeInt32
ggufTypeFloat32
ggufTypeBool
ggufTypeString
ggufTypeArray
ggufTypeUint64
ggufTypeInt64
ggufTypeFloat64
)
type kv map[string]any
type ggufModel struct {
*containerGGUF
kv
}
func newGGUFModel(container *containerGGUF) *ggufModel {
return &ggufModel{
containerGGUF: container,
kv: make(kv),
}
}
func (llm *ggufModel) NumKV() uint64 {
if llm.Version == 1 {
return uint64(llm.V1.NumKV)
}
return llm.V2.NumKV
}
func (llm *ggufModel) ModelFamily() string {
t, ok := llm.kv["general.architecture"].(string)
if ok {
return t
}
return "unknown"
}
func (llm *ggufModel) ModelType() string {
switch llm.ModelFamily() {
case "llama":
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
heads, headsOK := llm.kv["llama.head_count"].(uint32)
headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
if headsOK && headsKVsOK && heads/headKVs == 8 {
return "70B"
}
return llamaModelType(blocks)
}
case "falcon":
if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
return falconModelType(blocks)
}
}
return "Unknown"
}
func (llm *ggufModel) FileType() string {
t, ok := llm.kv["general.file_type"].(uint32)
if ok {
return fileType(t)
}
return "Unknown"
}
func (llm *ggufModel) Decode(r io.Reader) error {
read := llm.readString
if llm.Version == 1 {
read = llm.readStringV1
}
for i := 0; uint64(i) < llm.NumKV(); i++ {
k, err := read(r)
if err != nil {
return err
}
vtype := llm.readU32(r)
var v any
switch vtype {
case ggufTypeUint8:
v = llm.readU8(r)
case ggufTypeInt8:
v = llm.readI8(r)
case ggufTypeUint16:
v = llm.readU16(r)
case ggufTypeInt16:
v = llm.readI16(r)
case ggufTypeUint32:
v = llm.readU32(r)
case ggufTypeInt32:
v = llm.readI32(r)
case ggufTypeUint64:
v = llm.readU64(r)
case ggufTypeInt64:
v = llm.readI64(r)
case ggufTypeFloat32:
v = llm.readF32(r)
case ggufTypeFloat64:
v = llm.readF64(r)
case ggufTypeBool:
v = llm.readBool(r)
case ggufTypeString:
fn := llm.readString
if llm.Version == 1 {
fn = llm.readStringV1
}
s, err := fn(r)
if err != nil {
return err
}
v = s
case ggufTypeArray:
fn := llm.readArray
if llm.Version == 1 {
fn = llm.readArrayV1
}
a, err := fn(r)
if err != nil {
return err
}
v = a
default:
return fmt.Errorf("invalid type: %d", vtype)
}
llm.kv[k] = v
}
return nil
}
func (llm *ggufModel) NumLayers() int64 {
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
if !exists {
return 0
}
v := value.(uint32)
return int64(v)
}
func (ggufModel) readU8(r io.Reader) uint8 {
var u8 uint8
binary.Read(r, binary.LittleEndian, &u8)
return u8
}
func (ggufModel) readI8(r io.Reader) int8 {
var i8 int8
binary.Read(r, binary.LittleEndian, &i8)
return i8
}
func (ggufModel) readU16(r io.Reader) uint16 {
var u16 uint16
binary.Read(r, binary.LittleEndian, &u16)
return u16
}
func (ggufModel) readI16(r io.Reader) int16 {
var i16 int16
binary.Read(r, binary.LittleEndian, &i16)
return i16
}
func (ggufModel) readU32(r io.Reader) uint32 {
var u32 uint32
binary.Read(r, binary.LittleEndian, &u32)
return u32
}
func (ggufModel) readI32(r io.Reader) int32 {
var i32 int32
binary.Read(r, binary.LittleEndian, &i32)
return i32
}
func (ggufModel) readU64(r io.Reader) uint64 {
var u64 uint64
binary.Read(r, binary.LittleEndian, &u64)
return u64
}
func (ggufModel) readI64(r io.Reader) int64 {
var i64 int64
binary.Read(r, binary.LittleEndian, &i64)
return i64
}
func (ggufModel) readF32(r io.Reader) float32 {
var f32 float32
binary.Read(r, binary.LittleEndian, &f32)
return f32
}
func (ggufModel) readF64(r io.Reader) float64 {
var f64 float64
binary.Read(r, binary.LittleEndian, &f64)
return f64
}
func (ggufModel) readBool(r io.Reader) bool {
var b bool
binary.Read(r, binary.LittleEndian, &b)
return b
}
func (ggufModel) readStringV1(r io.Reader) (string, error) {
var nameLength uint32
binary.Read(r, binary.LittleEndian, &nameLength)
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
return "", err
}
// gguf v1 strings are null-terminated
b.Truncate(b.Len() - 1)
return b.String(), nil
}
func (llm ggufModel) readString(r io.Reader) (string, error) {
var nameLength uint64
binary.Read(r, binary.LittleEndian, &nameLength)
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
return "", err
}
return b.String(), nil
}
func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
atype := llm.readU32(r)
n := llm.readU32(r)
for i := 0; uint32(i) < n; i++ {
switch atype {
case ggufTypeUint8:
arr = append(arr, llm.readU8(r))
case ggufTypeInt8:
arr = append(arr, llm.readU8(r))
case ggufTypeUint16:
arr = append(arr, llm.readU16(r))
case ggufTypeInt16:
arr = append(arr, llm.readI16(r))
case ggufTypeUint32:
arr = append(arr, llm.readU32(r))
case ggufTypeInt32:
arr = append(arr, llm.readI32(r))
case ggufTypeFloat32:
arr = append(arr, llm.readF32(r))
case ggufTypeBool:
arr = append(arr, llm.readBool(r))
case ggufTypeString:
s, err := llm.readStringV1(r)
if err != nil {
return nil, err
}
arr = append(arr, s)
default:
return nil, fmt.Errorf("invalid array type: %d", atype)
}
}
return
}
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
atype := llm.readU32(r)
n := llm.readU64(r)
for i := 0; uint64(i) < n; i++ {
switch atype {
case ggufTypeUint8:
arr = append(arr, llm.readU8(r))
case ggufTypeInt8:
arr = append(arr, llm.readU8(r))
case ggufTypeUint16:
arr = append(arr, llm.readU16(r))
case ggufTypeInt16:
arr = append(arr, llm.readI16(r))
case ggufTypeUint32:
arr = append(arr, llm.readU32(r))
case ggufTypeInt32:
arr = append(arr, llm.readI32(r))
case ggufTypeUint64:
arr = append(arr, llm.readU64(r))
case ggufTypeInt64:
arr = append(arr, llm.readI64(r))
case ggufTypeFloat32:
arr = append(arr, llm.readF32(r))
case ggufTypeFloat64:
arr = append(arr, llm.readF64(r))
case ggufTypeBool:
arr = append(arr, llm.readBool(r))
case ggufTypeString:
s, err := llm.readString(r)
if err != nil {
return nil, err
}
arr = append(arr, s)
default:
return nil, fmt.Errorf("invalid array type: %d", atype)
}
}
return
}

13
llm/llama.cpp/generate.go Normal file
View File

@@ -0,0 +1,13 @@
//go:build !darwin
// +build !darwin
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release


@@ -1,16 +1,10 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch //go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release //go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/cpu --target server --config Release


@@ -1,16 +1,10 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch //go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/metal --target server --config Release //go:generate cmake --build ggml/build/gpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/metal --target server --config Release


@@ -1,22 +0,0 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cuda --target server --config Release


@@ -1,14 +0,0 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cpu --target server --config Release


@@ -0,0 +1,32 @@
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Tue, 5 Sep 2023 16:05:08 -0400
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
---
ggml-metal.metal | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
--
2.39.2 (Apple Git-143)


@@ -1,27 +0,0 @@
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries
---
CMakeLists.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
--
2.42.0


@@ -1,25 +0,0 @@
From 07993bdc35345b67b27aa649a7c099ad42d80c4c Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 21 Sep 2023 14:43:21 -0700
Subject: [PATCH] remove warm up logging
---
common/common.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 2597ba0..b56549b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -780,8 +780,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
{
- LOG("warming up the model with an empty run\n");
-
const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
llama_reset_timings(lctx);
--
2.42.0


@@ -1,32 +0,0 @@
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
From: Kylin <56434533+KyL0N@users.noreply.github.com>
Date: Tue, 22 Aug 2023 15:14:23 +0800
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
* ggml: support CUDA's half type for aarch64(#1455)
support CUDA's half type for aarch64 in ggml_fp16_t definition
* ggml: use __CUDACC__ to recognise nvcc compiler
---
ggml.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml.h b/ggml.h
index 544ad2d..0ec7ec5 100644
--- a/ggml.h
+++ b/ggml.h
@@ -259,8 +259,9 @@
extern "C" {
#endif
-#ifdef __ARM_NEON
- // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+ typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
--
2.39.2 (Apple Git-143)


@@ -21,7 +21,7 @@ type LLM interface {
Ping(context.Context) error Ping(context.Context) error
} }
func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) { func New(model string, adapters []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil { if _, err := os.Stat(model); err != nil {
return nil, err return nil, err
} }
@@ -32,22 +32,15 @@ func New(workDir, model string, adapters []string, opts api.Options) (LLM, error
} }
defer f.Close() defer f.Close()
ggml, err := DecodeGGML(f) ggml, err := DecodeGGML(f, ModelFamilyLlama)
if err != nil { if err != nil {
return nil, err return nil, err
} }
switch ggml.FileType() { switch ggml.FileType().String() {
case "Q8_0": case "F32", "Q5_0", "Q5_1", "Q8_0":
if ggml.Name() != "gguf" && opts.NumGPU != 0 {
// GGML Q8_0 do not support Metal API and will
// cause the runner to segmentation fault so disable GPU
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
opts.NumGPU = 0
}
case "F32", "Q5_0", "Q5_1":
if opts.NumGPU != 0 { if opts.NumGPU != 0 {
// F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will // F32, F16, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
// cause the runner to segmentation fault so disable GPU // cause the runner to segmentation fault so disable GPU
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0") log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
opts.NumGPU = 0 opts.NumGPU = 0
@@ -56,44 +49,35 @@ func New(workDir, model string, adapters []string, opts api.Options) (LLM, error
totalResidentMemory := memory.TotalMemory() totalResidentMemory := memory.TotalMemory()
switch ggml.ModelType() { switch ggml.ModelType() {
case "3B", "7B": case ModelType3B, ModelType7B:
if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 { if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 16GB of memory") return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
} else if totalResidentMemory < 8*1024*1024 { } else if totalResidentMemory < 8*1024*1024 {
return nil, fmt.Errorf("model requires at least 8GB of memory") return nil, fmt.Errorf("model requires at least 8GB of memory")
} }
case "13B": case ModelType13B:
if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 { if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 32GB of memory") return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
} else if totalResidentMemory < 16*1024*1024 { } else if totalResidentMemory < 16*1024*1024 {
return nil, fmt.Errorf("model requires at least 16GB of memory") return nil, fmt.Errorf("model requires at least 16GB of memory")
} }
case "30B", "34B", "40B": case ModelType30B, ModelType34B:
if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 { if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 64GB of memory") return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
} else if totalResidentMemory < 32*1024*1024 { } else if totalResidentMemory < 32*1024*1024 {
return nil, fmt.Errorf("model requires at least 32GB of memory") return nil, fmt.Errorf("model requires at least 32GB of memory")
} }
case "65B", "70B": case ModelType65B:
if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 { if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 128GB of memory") return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
} else if totalResidentMemory < 64*1024*1024 { } else if totalResidentMemory < 64*1024*1024 {
return nil, fmt.Errorf("model requires at least 64GB of memory") return nil, fmt.Errorf("model requires at least 64GB of memory")
} }
case "180B":
if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
} else if totalResidentMemory < 128*1024*1024 {
return nil, fmt.Errorf("model requires at least 128GB of memory")
}
} }
switch ggml.Name() { switch ggml.ModelFamily() {
case "gguf": case ModelFamilyLlama:
opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions return newLlama(model, adapters, ggmlRunner(), opts)
return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
case "ggml", "ggmf", "ggjt", "ggla":
return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
default: default:
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily()) return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
} }


@@ -1,21 +0,0 @@
#!/bin/sh
set -eu
usage() {
echo "usage: $(basename $0) VERSION"
exit 1
}
[ "$#" -eq 1 ] || usage
export VERSION="$1"
# build universal MacOS binary
sh $(dirname $0)/build_darwin.sh
# # build arm64 and amd64 Linux binaries
sh $(dirname $0)/build_linux.sh
# # build arm64 and amd64 Docker images
sh $(dirname $0)/build_docker.sh


@@ -1,30 +1,29 @@
#!/bin/sh #!/bin/bash
set -eu
export VERSION=${VERSION:-0.0.0}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
mkdir -p dist mkdir -p dist
for TARGETARCH in arm64 amd64; do GO_LDFLAGS="-X github.com/jmorganca/ollama/version.Version=$VERSION"
GOOS=darwin GOARCH=$TARGETARCH go generate ./... GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
done
lipo -create -output dist/ollama dist/ollama-darwin-* # build universal binary
rm -f dist/ollama-darwin-* GOARCH=arm64 go generate ./...
GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
rm -rf llm/llama.cpp/ggml/build/*/bin
GOARCH=amd64 go generate ./...
GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
rm dist/ollama-darwin-amd64 dist/ollama-darwin-arm64
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
chmod +x dist/ollama chmod +x dist/ollama
# build and sign the mac app # build and sign the mac app
npm install --prefix app npm install --prefix app
npm run --prefix app make:sign npm run --prefix app make:sign
cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-${VERSION:-0.0.0}.zip dist/Ollama-darwin.zip
# sign the binary and rename it # sign the binary and rename it
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
ditto -c -k --keepParent dist/ollama dist/temp.zip ditto -c -k --keepParent dist/ollama dist/temp.zip
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
mv dist/ollama dist/ollama-darwin mv dist/ollama dist/ollama-darwin
rm -f dist/temp.zip rm dist/temp.zip


@@ -1,15 +0,0 @@
#!/bin/sh
set -eu
export VERSION=${VERSION:-0.0.0}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
docker buildx build \
--load \
--platform=linux/arm64,linux/amd64 \
--build-arg=VERSION \
--build-arg=GOFLAGS \
-f Dockerfile \
-t ollama \
.


@@ -1,15 +0,0 @@
#!/bin/sh
set -eu
export VERSION=${VERSION:-0.0.0}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
mkdir -p dist
for TARGETARCH in arm64 amd64; do
docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
docker rm builder-$TARGETARCH
done


@@ -1,243 +0,0 @@
#!/bin/sh
# This script installs Ollama on Linux.
# It detects the current operating system architecture and installs the appropriate version of Ollama.
set -eu
status() { echo ">>> $*" >&2; }
error() { echo "ERROR $*"; exit 1; }
warning() { echo "WARNING: $*"; }
TEMP_DIR=$(mktemp -d)
cleanup() { rm -rf $TEMP_DIR; }
trap cleanup EXIT
available() { command -v $1 >/dev/null; }
require() {
local MISSING=''
for TOOL in $*; do
if ! available $TOOL; then
MISSING="$MISSING $TOOL"
fi
done
echo $MISSING
}
[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
case "$(uname -m)" in
x86_64) ARCH="amd64" ;;
aarch64|arm64) ARCH="arm64" ;;
*) error "Unsupported architecture: $ARCH" ;;
esac
SUDO=
if [ "$(id -u)" -ne 0 ]; then
# Running as root, no need for sudo
if ! available sudo; then
error "This script requires superuser permissions. Please re-run as root."
fi
SUDO="sudo"
fi
NEEDS=$(require curl awk grep sed tee xargs)
if [ -n "$NEEDS" ]; then
status "ERROR: The following tools are required but missing:"
for NEED in $NEEDS; do
echo " - $NEED"
done
exit 1
fi
status "Downloading ollama..."
curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.ai/download/ollama-linux-$ARCH"
for BINDIR in /usr/local/bin /usr/bin /bin; do
echo $PATH | grep -q $BINDIR && break || continue
done
status "Installing ollama to $BINDIR..."
$SUDO install -o0 -g0 -m755 -d $BINDIR
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
install_success() { status 'Install complete. Run "ollama" from the command line.'; }
trap install_success EXIT
# Everything from this point onwards is optional.
configure_systemd() {
if ! id ollama >/dev/null 2>&1; then
status "Creating ollama user..."
$SUDO useradd -r -s /bin/false -m -d /usr/share/ollama ollama
fi
status "Creating ollama systemd service..."
cat <<EOF | $SUDO tee /etc/systemd/system/ollama.service >/dev/null
[Unit]
Description=Ollama Service
After=network-online.target
[Service]
ExecStart=$BINDIR/ollama serve
User=ollama
Group=ollama
Restart=always
RestartSec=3
Environment="HOME=/usr/share/ollama"
Environment="PATH=$PATH"
[Install]
WantedBy=default.target
EOF
SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
case $SYSTEMCTL_RUNNING in
running|degraded)
status "Enabling and starting ollama service..."
$SUDO systemctl daemon-reload
$SUDO systemctl enable ollama
start_service() { $SUDO systemctl restart ollama; }
trap start_service EXIT
;;
esac
}
if available systemctl; then
configure_systemd
fi
if ! available lspci && ! available lshw; then
warning "Unable to detect NVIDIA GPU. Install lspci or lshw to automatically detect and install NVIDIA CUDA drivers."
exit 0
fi
check_gpu() {
case $1 in
lspci) available lspci && lspci -d '10de:' | grep -q 'NVIDIA' || return 1 ;;
lshw) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
nvidia-smi) available nvidia-smi || return 1 ;;
esac
}
if check_gpu nvidia-smi; then
status "NVIDIA GPU installed."
exit 0
fi
if ! check_gpu lspci && ! check_gpu lshw; then
warning "No NVIDIA GPU detected. Ollama will run in CPU-only mode."
exit 0
fi
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
install_cuda_driver_yum() {
status 'Installing NVIDIA repository...'
case $PACKAGE_MANAGER in
yum)
$SUDO $PACKAGE_MANAGER -y install yum-utils
$SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
;;
dnf)
$SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
;;
esac
case $1 in
rhel)
status 'Installing EPEL repository...'
# EPEL is required for third-party dependencies such as dkms and libvdpau
$SUDO $PACKAGE_MANAGER -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-$2.noarch.rpm || true
;;
esac
status 'Installing CUDA driver...'
if [ "$1" = 'centos' ] || [ "$1$2" = 'rhel7' ]; then
$SUDO $PACKAGE_MANAGER -y install nvidia-driver-latest-dkms
fi
$SUDO $PACKAGE_MANAGER -y install cuda-drivers
}
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#ubuntu
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
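# install_cuda_driver_apt <distro> <version>: install the NVIDIA cuda-keyring package, then the cuda-drivers package with apt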
install_cuda_driver_apt() {
status 'Installing NVIDIA repository...'
curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
case $1 in
debian)
status 'Enabling contrib sources...'
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | $SUDO tee /etc/apt/sources.list.d/contrib.list > /dev/null
;;
esac
status 'Installing CUDA driver...'
$SUDO dpkg -i $TEMP_DIR/cuda-keyring.deb
$SUDO apt-get update
[ -n "$SUDO" ] && SUDO_E="$SUDO -E" || SUDO_E=
DEBIAN_FRONTEND=noninteractive $SUDO_E apt-get -y install cuda-drivers -q
}
if [ ! -f "/etc/os-release" ]; then
error "Unknown distribution. Skipping CUDA installation."
fi
. /etc/os-release
OS_NAME=$ID
OS_VERSION=$VERSION_ID
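# Pick the first available package manager to choose between the yum/dnf and apt install paths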
PACKAGE_MANAGER=
for PACKAGE_MANAGER in dnf yum apt-get; do
if available $PACKAGE_MANAGER; then
break
fi
done
if [ -z "$PACKAGE_MANAGER" ]; then
error "Unknown package manager. Skipping CUDA installation."
fi
if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $OS_VERSION ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) install_cuda_driver_yum $OS_NAME $OS_VERSION ;;
amzn) install_cuda_driver_yum 'fedora' '35' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
*) exit ;;
esac
fi
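# Build and load the nvidia kernel module with dkms if it is not already loaded; a reboot is required while nouveau is active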
if ! lsmod | grep -q nvidia; then
KERNEL_RELEASE="$(uname -r)"
case $OS_NAME in
centos|rhel|rocky|amzn) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE kernel-headers-$KERNEL_RELEASE ;;
fedora) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE ;;
debian|ubuntu) $SUDO apt-get -y install linux-headers-$KERNEL_RELEASE ;;
*) exit ;;
esac
NVIDIA_CUDA_VERSION=$($SUDO dkms status | awk -F: '/added/ { print $1 }')
if [ -n "$NVIDIA_CUDA_VERSION" ]; then
$SUDO dkms install $NVIDIA_CUDA_VERSION
fi
if lsmod | grep -q nouveau; then
status 'Reboot to complete NVIDIA CUDA driver install.'
exit 0
fi
$SUDO modprobe nvidia
fi
status "NVIDIA CUDA drivers installed."

View File

@@ -14,7 +14,7 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"path/filepath" "path"
"strconv" "strconv"
"strings" "strings"
"time" "time"
@@ -71,7 +71,7 @@ func (r AuthRedirect) URL() (*url.URL, error) {
return redirectURL, nil return redirectURL, nil
} }
func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) { func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *RegistryOptions) (string, error) {
redirectURL, err := redirData.URL() redirectURL, err := redirData.URL()
if err != nil { if err != nil {
return "", err return "", err
@@ -82,7 +82,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
return "", err return "", err
} }
keyPath := filepath.Join(home, ".ollama", "id_ed25519") keyPath := path.Join(home, ".ollama", "id_ed25519")
rawKey, err := os.ReadFile(keyPath) rawKey, err := os.ReadFile(keyPath)
if err != nil { if err != nil {
@@ -103,7 +103,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
headers := make(http.Header) headers := make(http.Header)
headers.Set("Authorization", sig) headers.Set("Authorization", sig)
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, nil) resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, regOpts)
if err != nil { if err != nil {
log.Printf("couldn't get token: %q", err) log.Printf("couldn't get token: %q", err)
} }

View File

@@ -8,7 +8,7 @@ import (
"log" "log"
"net/http" "net/http"
"os" "os"
"path/filepath" "path"
"strconv" "strconv"
"sync" "sync"
"time" "time"
@@ -46,8 +46,8 @@ func downloadBlob(ctx context.Context, opts downloadOpts) error {
// we already have the file, so return // we already have the file, so return
opts.fn(api.ProgressResponse{ opts.fn(api.ProgressResponse{
Digest: opts.digest, Digest: opts.digest,
Total: fi.Size(), Total: int(fi.Size()),
Completed: fi.Size(), Completed: int(fi.Size()),
}) })
return nil return nil
@@ -93,8 +93,8 @@ func monitorDownload(ctx context.Context, opts downloadOpts, f *FileDownload) er
// successful download while monitoring // successful download while monitoring
opts.fn(api.ProgressResponse{ opts.fn(api.ProgressResponse{
Digest: f.Digest, Digest: f.Digest,
Total: fi.Size(), Total: int(fi.Size()),
Completed: fi.Size(), Completed: int(fi.Size()),
}) })
return true, false, nil return true, false, nil
} }
@@ -109,8 +109,8 @@ func monitorDownload(ctx context.Context, opts downloadOpts, f *FileDownload) er
opts.fn(api.ProgressResponse{ opts.fn(api.ProgressResponse{
Status: fmt.Sprintf("downloading %s", f.Digest), Status: fmt.Sprintf("downloading %s", f.Digest),
Digest: f.Digest, Digest: f.Digest,
Total: f.Total, Total: int(f.Total),
Completed: f.Completed, Completed: int(f.Completed),
}) })
return false, false, nil return false, false, nil
}() }()
@@ -129,8 +129,8 @@ func monitorDownload(ctx context.Context, opts downloadOpts, f *FileDownload) er
} }
var ( var (
chunkSize int64 = 1024 * 1024 // 1 MiB in bytes chunkSize = 1024 * 1024 // 1 MiB in bytes
errDownload = fmt.Errorf("download failed") errDownload = fmt.Errorf("download failed")
) )
// doDownload downloads a blob from the registry and stores it in the blobs directory // doDownload downloads a blob from the registry and stores it in the blobs directory
@@ -147,7 +147,7 @@ func doDownload(ctx context.Context, opts downloadOpts, f *FileDownload) error {
default: default:
size = fi.Size() size = fi.Size()
// Ensure the size is divisible by the chunk size by removing excess bytes // Ensure the size is divisible by the chunk size by removing excess bytes
size -= size % chunkSize size -= size % int64(chunkSize)
err := os.Truncate(f.FilePath+"-partial", size) err := os.Truncate(f.FilePath+"-partial", size)
if err != nil { if err != nil {
@@ -173,7 +173,7 @@ func doDownload(ctx context.Context, opts downloadOpts, f *FileDownload) error {
return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body)) return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body))
} }
err = os.MkdirAll(filepath.Dir(f.FilePath), 0o700) err = os.MkdirAll(path.Dir(f.FilePath), 0o700)
if err != nil { if err != nil {
return fmt.Errorf("make blobs directory: %w", err) return fmt.Errorf("make blobs directory: %w", err)
} }
@@ -200,8 +200,8 @@ outerLoop:
opts.fn(api.ProgressResponse{ opts.fn(api.ProgressResponse{
Status: fmt.Sprintf("downloading %s", f.Digest), Status: fmt.Sprintf("downloading %s", f.Digest),
Digest: f.Digest, Digest: f.Digest,
Total: f.Total, Total: int(f.Total),
Completed: f.Completed, Completed: int(f.Completed),
}) })
if f.Completed >= f.Total { if f.Completed >= f.Total {
@@ -213,8 +213,8 @@ outerLoop:
opts.fn(api.ProgressResponse{ opts.fn(api.ProgressResponse{
Status: fmt.Sprintf("error renaming file: %v", err), Status: fmt.Sprintf("error renaming file: %v", err),
Digest: f.Digest, Digest: f.Digest,
Total: f.Total, Total: int(f.Total),
Completed: f.Completed, Completed: int(f.Completed),
}) })
return err return err
} }
@@ -223,7 +223,7 @@ outerLoop:
} }
} }
n, err := io.CopyN(out, resp.Body, chunkSize) n, err := io.CopyN(out, resp.Body, int64(chunkSize))
if err != nil && !errors.Is(err, io.EOF) { if err != nil && !errors.Is(err, io.EOF) {
return fmt.Errorf("%w: %w", errDownload, err) return fmt.Errorf("%w: %w", errDownload, err)
} }

View File

@@ -14,6 +14,7 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"path"
"path/filepath" "path/filepath"
"reflect" "reflect"
"runtime" "runtime"
@@ -54,54 +55,6 @@ type Model struct {
Embeddings []vector.Embedding Embeddings []vector.Embedding
} }
func (m *Model) ChatPrompt(messages []api.Message) (string, error) {
tmpl, err := template.New("").Parse(m.Template)
if err != nil {
return "", err
}
var vars struct {
System string
Prompt string
First bool
}
vars.First = true
var sb strings.Builder
flush := func() {
tmpl.Execute(&sb, vars)
vars.System = ""
vars.Prompt = ""
}
// build the chat history from messages
for _, m := range messages {
if m.Role == "system" {
if vars.System != "" {
flush()
}
vars.System = m.Content
}
if m.Role == "user" {
if vars.Prompt != "" {
flush()
}
vars.Prompt = m.Content
}
if m.Role == "assistant" {
flush()
sb.Write([]byte(m.Content))
}
}
flush()
return sb.String(), nil
}
func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) { func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) {
t := m.Template t := m.Template
if request.Template != "" { if request.Template != "" {
@@ -151,7 +104,7 @@ type ManifestV2 struct {
type Layer struct { type Layer struct {
MediaType string `json:"mediaType"` MediaType string `json:"mediaType"`
Digest string `json:"digest"` Digest string `json:"digest"`
Size int64 `json:"size"` Size int `json:"size"`
From string `json:"from,omitempty"` From string `json:"from,omitempty"`
} }
@@ -161,11 +114,11 @@ type LayerReader struct {
} }
type ConfigV2 struct { type ConfigV2 struct {
ModelFormat string `json:"model_format"` ModelFamily llm.ModelFamily `json:"model_family"`
ModelFamily string `json:"model_family"` ModelType string `json:"model_type"`
ModelType string `json:"model_type"` ModelFormat string `json:"model_format"`
FileType string `json:"file_type"` FileType string `json:"file_type"`
RootFS RootFS `json:"rootfs"` RootFS RootFS `json:"rootfs"`
// required by spec // required by spec
Architecture string `json:"architecture"` Architecture string `json:"architecture"`
@@ -177,11 +130,11 @@ type RootFS struct {
DiffIDs []string `json:"diff_ids"` DiffIDs []string `json:"diff_ids"`
} }
func (m *ManifestV2) GetTotalSize() (total int64) { func (m *ManifestV2) GetTotalSize() int {
var total int
for _, layer := range m.Layers { for _, layer := range m.Layers {
total += layer.Size total += layer.Size
} }
total += m.Config.Size total += m.Config.Size
return total return total
} }
@@ -315,30 +268,7 @@ func filenameWithPath(path, f string) (string, error) {
return f, nil return f, nil
} }
func CreateModel(ctx context.Context, workDir, name string, path string, fn func(resp api.ProgressResponse)) error { func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error {
mp := ParseModelPath(name)
var manifest *ManifestV2
var err error
var noprune string
// build deleteMap to prune unused layers
deleteMap := make(map[string]bool)
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
if manifest != nil {
for _, l := range manifest.Layers {
deleteMap[l.Digest] = true
}
deleteMap[manifest.Config.Digest] = true
}
}
mf, err := os.Open(path) mf, err := os.Open(path)
if err != nil { if err != nil {
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)}) fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)})
@@ -399,15 +329,15 @@ func CreateModel(ctx context.Context, workDir, name string, path string, fn func
} }
defer file.Close() defer file.Close()
ggml, err := llm.DecodeGGML(file) ggml, err := llm.DecodeGGML(file, llm.ModelFamilyLlama)
if err != nil { if err != nil {
return err return err
} }
config.ModelFormat = ggml.Name()
config.ModelFamily = ggml.ModelFamily() config.ModelFamily = ggml.ModelFamily()
config.ModelType = ggml.ModelType() config.ModelType = ggml.ModelType().String()
config.FileType = ggml.FileType() config.ModelFormat = ggml.Name()
config.FileType = ggml.FileType().String()
// reset the file // reset the file
file.Seek(0, io.SeekStart) file.Seek(0, io.SeekStart)
@@ -508,10 +438,8 @@ func CreateModel(ctx context.Context, workDir, name string, path string, fn func
return err return err
} }
if layer.Size > 0 { layer.MediaType = mediaType
layer.MediaType = mediaType layers = append(layers, layer)
layers = append(layers, layer)
}
case "template", "system", "prompt": case "template", "system", "prompt":
fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)}) fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)})
// remove the layer if one exists // remove the layer if one exists
@@ -523,10 +451,8 @@ func CreateModel(ctx context.Context, workDir, name string, path string, fn func
return err return err
} }
if layer.Size > 0 { layer.MediaType = mediaType
layer.MediaType = mediaType layers = append(layers, layer)
layers = append(layers, layer)
}
default: default:
// runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences) // runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences)
params[c.Name] = append(params[c.Name], c.Args) params[c.Name] = append(params[c.Name], c.Args)
@@ -549,12 +475,6 @@ func CreateModel(ctx context.Context, workDir, name string, path string, fn func
} }
} }
if config.ModelType == "65B" {
if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
config.ModelType = "70B"
}
}
bts, err := json.Marshal(formattedParams) bts, err := json.Marshal(formattedParams)
if err != nil { if err != nil {
return err return err
@@ -572,7 +492,7 @@ func CreateModel(ctx context.Context, workDir, name string, path string, fn func
} }
// generate the embedding layers // generate the embedding layers
embeddingLayers, err := embeddingLayers(workDir, embed) embeddingLayers, err := embeddingLayers(embed)
if err != nil { if err != nil {
return err return err
} }
@@ -586,7 +506,6 @@ func CreateModel(ctx context.Context, workDir, name string, path string, fn func
var manifestLayers []*Layer var manifestLayers []*Layer
for _, l := range layers { for _, l := range layers {
manifestLayers = append(manifestLayers, &l.Layer) manifestLayers = append(manifestLayers, &l.Layer)
delete(deleteMap, l.Layer.Digest)
} }
// Create a layer for the config object // Create a layer for the config object
@@ -596,7 +515,6 @@ func CreateModel(ctx context.Context, workDir, name string, path string, fn func
return err return err
} }
layers = append(layers, cfg) layers = append(layers, cfg)
delete(deleteMap, cfg.Layer.Digest)
if err := SaveLayers(layers, fn, false); err != nil { if err := SaveLayers(layers, fn, false); err != nil {
return err return err
@@ -609,14 +527,6 @@ func CreateModel(ctx context.Context, workDir, name string, path string, fn func
return err return err
} }
if noprune == "" {
fn(api.ProgressResponse{Status: "removing any unused layers"})
err = deleteUnusedLayers(nil, deleteMap, false)
if err != nil {
return err
}
}
fn(api.ProgressResponse{Status: "success"}) fn(api.ProgressResponse{Status: "success"})
return nil return nil
} }
@@ -629,7 +539,7 @@ type EmbeddingParams struct {
} }
// embeddingLayers loads the associated LLM and generates the embeddings to be stored from an input file // embeddingLayers loads the associated LLM and generates the embeddings to be stored from an input file
func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error) { func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
layers := []*LayerReader{} layers := []*LayerReader{}
if len(e.files) > 0 { if len(e.files) > 0 {
// check if the model is a file path or a model name // check if the model is a file path or a model name
@@ -642,7 +552,7 @@ func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error)
model = &Model{ModelPath: e.model} model = &Model{ModelPath: e.model}
} }
if err := load(context.Background(), workDir, model, e.opts, defaultSessionDuration); err != nil { if err := load(context.Background(), model, e.opts, defaultSessionDuration); err != nil {
return nil, fmt.Errorf("load model to generate embeddings: %v", err) return nil, fmt.Errorf("load model to generate embeddings: %v", err)
} }
@@ -697,8 +607,8 @@ func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error)
e.fn(api.ProgressResponse{ e.fn(api.ProgressResponse{
Status: fmt.Sprintf("creating embeddings for file %s", filePath), Status: fmt.Sprintf("creating embeddings for file %s", filePath),
Digest: fileDigest, Digest: fileDigest,
Total: int64(len(data) - 1), Total: len(data) - 1,
Completed: int64(i), Completed: i,
}) })
if len(existing[d]) > 0 { if len(existing[d]) > 0 {
// already have an embedding for this line // already have an embedding for this line
@@ -723,7 +633,7 @@ func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error)
Layer: Layer{ Layer: Layer{
MediaType: "application/vnd.ollama.image.embed", MediaType: "application/vnd.ollama.image.embed",
Digest: digest, Digest: digest,
Size: r.Size(), Size: r.Len(),
}, },
Reader: r, Reader: r,
} }
@@ -872,14 +782,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
return nil, fmt.Errorf("invalid float value %s", vals) return nil, fmt.Errorf("invalid float value %s", vals)
} }
out[key] = float32(floatVal) out[key] = floatVal
case reflect.Int: case reflect.Int:
intVal, err := strconv.ParseInt(vals[0], 10, 64) intVal, err := strconv.ParseInt(vals[0], 10, 0)
if err != nil { if err != nil {
return nil, fmt.Errorf("invalid int value %s", vals) return nil, fmt.Errorf("invalid int value %s", vals)
} }
out[key] = int(intVal) out[key] = intVal
case reflect.Bool: case reflect.Bool:
boolVal, err := strconv.ParseBool(vals[0]) boolVal, err := strconv.ParseBool(vals[0])
if err != nil { if err != nil {
@@ -959,7 +869,18 @@ func CopyModel(src, dest string) error {
return nil return nil
} }
func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dryRun bool) error { func DeleteModel(name string) error {
mp := ParseModelPath(name)
manifest, _, err := GetManifest(mp)
if err != nil {
return err
}
deleteMap := make(map[string]bool)
for _, layer := range manifest.Layers {
deleteMap[layer.Digest] = true
}
deleteMap[manifest.Config.Digest] = true
fp, err := GetManifestPath() fp, err := GetManifestPath()
if err != nil { if err != nil {
return err return err
@@ -976,13 +897,14 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dry
fmp := ParseModelPath(tag) fmp := ParseModelPath(tag)
// skip the manifest we're trying to delete // skip the manifest we're trying to delete
if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() { if mp.GetFullTagname() == fmp.GetFullTagname() {
return nil return nil
} }
// save (i.e. delete from the deleteMap) any files used in other manifests // save (i.e. delete from the deleteMap) any files used in other manifests
manifest, _, err := GetManifest(fmp) manifest, _, err := GetManifest(fmp)
if err != nil { if err != nil {
log.Printf("skipping file: %s", fp)
return nil return nil
} }
@@ -1006,105 +928,14 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dry
log.Printf("couldn't get file path for '%s': %v", k, err) log.Printf("couldn't get file path for '%s': %v", k, err)
continue continue
} }
if !dryRun { if err := os.Remove(fp); err != nil {
if err := os.Remove(fp); err != nil { log.Printf("couldn't remove file '%s': %v", fp, err)
log.Printf("couldn't remove file '%s': %v", fp, err) continue
continue
}
} else {
log.Printf("wanted to remove: %s", fp)
} }
} }
} }
return nil fp, err = mp.GetManifestPath(false)
}
func PruneLayers() error {
deleteMap := make(map[string]bool)
p, err := GetBlobsPath("")
if err != nil {
return err
}
blobs, err := os.ReadDir(p)
if err != nil {
log.Printf("couldn't read dir '%s': %v", p, err)
return err
}
for _, blob := range blobs {
name := blob.Name()
if runtime.GOOS == "windows" {
name = strings.ReplaceAll(name, "-", ":")
}
deleteMap[name] = true
}
log.Printf("total blobs: %d", len(deleteMap))
err = deleteUnusedLayers(nil, deleteMap, false)
if err != nil {
return err
}
log.Printf("total unused blobs removed: %d", len(deleteMap))
return nil
}
func PruneDirectory(path string) error {
info, err := os.Lstat(path)
if err != nil {
return err
}
if info.IsDir() && info.Mode()&os.ModeSymlink == 0 {
entries, err := os.ReadDir(path)
if err != nil {
return err
}
for _, entry := range entries {
if err := PruneDirectory(filepath.Join(path, entry.Name())); err != nil {
return err
}
}
entries, err = os.ReadDir(path)
if err != nil {
return err
}
if len(entries) > 0 {
return nil
}
return os.Remove(path)
}
return nil
}
func DeleteModel(name string) error {
mp := ParseModelPath(name)
manifest, _, err := GetManifest(mp)
if err != nil {
return err
}
deleteMap := make(map[string]bool)
for _, layer := range manifest.Layers {
deleteMap[layer.Digest] = true
}
deleteMap[manifest.Config.Digest] = true
err = deleteUnusedLayers(&mp, deleteMap, false)
if err != nil {
return err
}
fp, err := mp.GetManifestPath(false)
if err != nil { if err != nil {
return err return err
} }
@@ -1235,14 +1066,14 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
Total: layer.Size, Total: layer.Size,
}) })
location, chunkSize, err := startUpload(ctx, mp, layer, regOpts) location, err := startUpload(ctx, mp, layer, regOpts)
if err != nil { if err != nil {
log.Printf("couldn't start upload: %v", err) log.Printf("couldn't start upload: %v", err)
return err return err
} }
if strings.HasPrefix(filepath.Base(location.Path), "sha256:") { if strings.HasPrefix(path.Base(location.Path), "sha256:") {
layer.Digest = filepath.Base(location.Path) layer.Digest = path.Base(location.Path)
fn(api.ProgressResponse{ fn(api.ProgressResponse{
Status: "using existing layer", Status: "using existing layer",
Digest: layer.Digest, Digest: layer.Digest,
@@ -1252,7 +1083,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
continue continue
} }
if err := uploadBlob(ctx, location, layer, chunkSize, regOpts, fn); err != nil { if err := uploadBlobChunked(ctx, location, layer, regOpts, fn); err != nil {
log.Printf("error uploading blob: %v", err) log.Printf("error uploading blob: %v", err)
return err return err
} }
@@ -1283,34 +1114,13 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error { func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name) mp := ParseModelPath(name)
var manifest *ManifestV2
var err error
var noprune string
// build deleteMap to prune unused layers
deleteMap := make(map[string]bool)
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
if manifest != nil {
for _, l := range manifest.Layers {
deleteMap[l.Digest] = true
}
deleteMap[manifest.Config.Digest] = true
}
}
if mp.ProtocolScheme == "http" && !regOpts.Insecure { if mp.ProtocolScheme == "http" && !regOpts.Insecure {
return fmt.Errorf("insecure protocol http") return fmt.Errorf("insecure protocol http")
} }
fn(api.ProgressResponse{Status: "pulling manifest"}) fn(api.ProgressResponse{Status: "pulling manifest"})
manifest, err = pullModelManifest(ctx, mp, regOpts) manifest, err := pullModelManifest(ctx, mp, regOpts)
if err != nil { if err != nil {
return fmt.Errorf("pull model manifest: %s", err) return fmt.Errorf("pull model manifest: %s", err)
} }
@@ -1330,9 +1140,7 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
}); err != nil { }); err != nil {
return err return err
} }
delete(deleteMap, layer.Digest)
} }
delete(deleteMap, manifest.Config.Digest)
fn(api.ProgressResponse{Status: "verifying sha256 digest"}) fn(api.ProgressResponse{Status: "verifying sha256 digest"})
for _, layer := range layers { for _, layer := range layers {
@@ -1370,14 +1178,6 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return err return err
} }
if noprune == "" {
fn(api.ProgressResponse{Status: "removing any unused layers"})
err = deleteUnusedLayers(nil, deleteMap, false)
if err != nil {
return err
}
}
fn(api.ProgressResponse{Status: "success"}) fn(api.ProgressResponse{Status: "success"})
return nil return nil
@@ -1437,14 +1237,14 @@ func createConfigLayer(config ConfigV2, layers []string) (*LayerReader, error) {
} }
// GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer
func GetSHA256Digest(r io.Reader) (string, int64) { func GetSHA256Digest(r io.Reader) (string, int) {
h := sha256.New() h := sha256.New()
n, err := io.Copy(h, r) n, err := io.Copy(h, r)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n return fmt.Sprintf("sha256:%x", h.Sum(nil)), int(n)
} }
// Function to check if a blob already exists in the Docker registry // Function to check if a blob already exists in the Docker registry
@@ -1478,7 +1278,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
case resp.StatusCode == http.StatusUnauthorized: case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate") auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth) authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir) token, err := getAuthToken(ctx, authRedir, regOpts)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -1503,7 +1303,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
} }
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) { func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure { if requestURL.Scheme != "http" && regOpts.Insecure {
requestURL.Scheme = "http" requestURL.Scheme = "http"
} }
@@ -1516,25 +1316,14 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
req.Header = headers req.Header = headers
} }
if regOpts != nil { if regOpts.Token != "" {
if regOpts.Token != "" { req.Header.Set("Authorization", "Bearer "+regOpts.Token)
req.Header.Set("Authorization", "Bearer "+regOpts.Token) } else if regOpts.Username != "" && regOpts.Password != "" {
} else if regOpts.Username != "" && regOpts.Password != "" { req.SetBasicAuth(regOpts.Username, regOpts.Password)
req.SetBasicAuth(regOpts.Username, regOpts.Password)
}
} }
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version())) req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
if s := req.Header.Get("Content-Length"); s != "" {
contentLength, err := strconv.ParseInt(s, 10, 64)
if err != nil {
return nil, err
}
req.ContentLength = contentLength
}
client := &http.Client{ client := &http.Client{
CheckRedirect: func(req *http.Request, via []*http.Request) error { CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 { if len(via) >= 10 {

View File

@@ -133,12 +133,7 @@ func GetBlobsPath(digest string) (string, error) {
} }
path := filepath.Join(home, ".ollama", "models", "blobs", digest) path := filepath.Join(home, ".ollama", "models", "blobs", digest)
dirPath := filepath.Dir(path) if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
if digest == "" {
dirPath = path
}
if err := os.MkdirAll(dirPath, 0o755); err != nil {
return "", err return "", err
} }

View File

@@ -4,9 +4,9 @@ import "testing"
func TestParseModelPath(t *testing.T) { func TestParseModelPath(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
arg string arg string
want ModelPath want ModelPath
}{ }{
{ {
"full path https", "full path https",

View File

@@ -12,7 +12,6 @@ import (
"os/signal" "os/signal"
"path/filepath" "path/filepath"
"reflect" "reflect"
"runtime"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -58,7 +57,7 @@ var loaded struct {
var defaultSessionDuration = 5 * time.Minute var defaultSessionDuration = 5 * time.Minute
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error { func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
opts := api.DefaultOptions() opts := api.DefaultOptions()
if err := opts.FromMap(model.Options); err != nil { if err := opts.FromMap(model.Options); err != nil {
log.Printf("could not load model options: %v", err) log.Printf("could not load model options: %v", err)
@@ -94,7 +93,7 @@ func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]
loaded.Embeddings = model.Embeddings loaded.Embeddings = model.Embeddings
} }
llmModel, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, opts) llmModel, err := llm.New(model.ModelPath, model.AdapterPaths, opts)
if err != nil { if err != nil {
return err return err
} }
@@ -130,7 +129,6 @@ func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]
llmModel.SetOptions(opts) llmModel.SetOptions(opts)
} }
} }
loaded.expireAt = time.Now().Add(sessionDuration) loaded.expireAt = time.Now().Add(sessionDuration)
if loaded.expireTimer == nil { if loaded.expireTimer == nil {
@@ -151,59 +149,10 @@ func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]
loaded.digest = "" loaded.digest = ""
}) })
} }
loaded.expireTimer.Reset(sessionDuration) loaded.expireTimer.Reset(sessionDuration)
return nil return nil
} }
func ChatModelHandler(c *gin.Context) {
loaded.mu.Lock()
defer loaded.mu.Unlock()
var req api.ChatRequest
if err := c.ShouldBindJSON(&req); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
model, err := GetModel(req.Model)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
prompt, err := model.ChatPrompt(req.Messages)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
var response string
fn := func(r api.GenerateResponse) {
response += r.Response
}
workDir := c.GetString("workDir")
if err := load(c.Request.Context(), workDir, model, nil, defaultSessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
fmt.Println(prompt)
if err := loaded.llm.Predict(c.Request.Context(), []int{}, prompt, fn); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
}
c.JSON(http.StatusOK, api.ChatResponse{
Message: api.Message{
Role: "assistant",
Content: response,
},
CreatedAt: time.Now().UTC(),
})
}
func GenerateHandler(c *gin.Context) { func GenerateHandler(c *gin.Context) {
loaded.mu.Lock() loaded.mu.Lock()
defer loaded.mu.Unlock() defer loaded.mu.Unlock()
@@ -222,11 +171,8 @@ func GenerateHandler(c *gin.Context) {
return return
} }
workDir := c.GetString("workDir") sessionDuration := defaultSessionDuration // TODO: set this duration from the request if specified
if err := load(c.Request.Context(), model, req.Options, sessionDuration); err != nil {
// TODO: set this duration from the request if specified
sessionDuration := defaultSessionDuration
if err := load(c.Request.Context(), workDir, model, req.Options, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return return
} }
@@ -271,13 +217,8 @@ func GenerateHandler(c *gin.Context) {
ch <- r ch <- r
} }
// an empty request loads the model if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
if req.Prompt == "" && req.Template == "" && req.System == "" { ch <- gin.H{"error": err.Error()}
ch <- api.GenerateResponse{Model: req.Model, Done: true}
} else {
if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
} }
}() }()
@@ -299,9 +240,7 @@ func EmbeddingHandler(c *gin.Context) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return return
} }
if err := load(c.Request.Context(), model, req.Options, 5*time.Minute); err != nil {
workDir := c.GetString("workDir")
if err := load(c.Request.Context(), workDir, model, req.Options, 5*time.Minute); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return return
} }
@@ -340,6 +279,8 @@ func PullModelHandler(c *gin.Context) {
regOpts := &RegistryOptions{ regOpts := &RegistryOptions{
Insecure: req.Insecure, Insecure: req.Insecure,
Username: req.Username,
Password: req.Password,
} }
ctx, cancel := context.WithCancel(c.Request.Context()) ctx, cancel := context.WithCancel(c.Request.Context())
@@ -369,6 +310,8 @@ func PushModelHandler(c *gin.Context) {
regOpts := &RegistryOptions{ regOpts := &RegistryOptions{
Insecure: req.Insecure, Insecure: req.Insecure,
Username: req.Username,
Password: req.Password,
} }
ctx := context.Background() ctx := context.Background()
@@ -387,8 +330,6 @@ func CreateModelHandler(c *gin.Context) {
return return
} }
workDir := c.GetString("workDir")
ch := make(chan any) ch := make(chan any)
go func() { go func() {
defer close(ch) defer close(ch)
@@ -399,7 +340,7 @@ func CreateModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context()) ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel() defer cancel()
if err := CreateModel(ctx, workDir, req.Name, req.Path, fn); err != nil { if err := CreateModel(ctx, req.Name, req.Path, fn); err != nil {
ch <- gin.H{"error": err.Error()} ch <- gin.H{"error": err.Error()}
} }
}() }()
@@ -422,19 +363,6 @@ func DeleteModelHandler(c *gin.Context) {
} }
return return
} }
manifestsPath, err := GetManifestPath()
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if err := PruneDirectory(manifestsPath); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusOK, nil)
} }
func ShowModelHandler(c *gin.Context) { func ShowModelHandler(c *gin.Context) {
@@ -565,59 +493,44 @@ func CopyModelHandler(c *gin.Context) {
} }
} }
var defaultAllowOrigins = []string{ func Serve(ln net.Listener, origins []string) error {
"localhost",
"127.0.0.1",
"0.0.0.0",
}
func Serve(ln net.Listener, allowOrigins []string) error {
config := cors.DefaultConfig() config := cors.DefaultConfig()
config.AllowWildcard = true config.AllowWildcard = true
config.AllowOrigins = append(origins, []string{
config.AllowOrigins = allowOrigins "http://localhost",
for _, allowOrigin := range defaultAllowOrigins { "http://localhost:*",
config.AllowOrigins = append(config.AllowOrigins, "https://localhost",
fmt.Sprintf("http://%s", allowOrigin), "https://localhost:*",
fmt.Sprintf("https://%s", allowOrigin), "http://127.0.0.1",
fmt.Sprintf("http://%s:*", allowOrigin), "http://127.0.0.1:*",
fmt.Sprintf("https://%s:*", allowOrigin), "https://127.0.0.1",
) "https://127.0.0.1:*",
} "http://0.0.0.0",
"http://0.0.0.0:*",
workDir, err := os.MkdirTemp("", "ollama") "https://0.0.0.0",
if err != nil { "https://0.0.0.0:*",
return err }...)
}
defer os.RemoveAll(workDir)
r := gin.Default() r := gin.Default()
r.Use( r.Use(cors.New(config))
cors.New(config),
func(c *gin.Context) { r.GET("/", func(c *gin.Context) {
c.Set("workDir", workDir) c.String(http.StatusOK, "Ollama is running")
c.Next() })
}, r.HEAD("/", func(c *gin.Context) {
) c.Status(http.StatusOK)
})
r.POST("/api/chat", ChatModelHandler)
r.POST("/api/pull", PullModelHandler) r.POST("/api/pull", PullModelHandler)
r.POST("/api/generate", GenerateHandler) r.POST("/api/generate", GenerateHandler)
r.POST("/api/embeddings", EmbeddingHandler) r.POST("/api/embeddings", EmbeddingHandler)
r.POST("/api/create", CreateModelHandler) r.POST("/api/create", CreateModelHandler)
r.POST("/api/push", PushModelHandler) r.POST("/api/push", PushModelHandler)
r.POST("/api/copy", CopyModelHandler) r.POST("/api/copy", CopyModelHandler)
r.GET("/api/tags", ListModelsHandler)
r.DELETE("/api/delete", DeleteModelHandler) r.DELETE("/api/delete", DeleteModelHandler)
r.POST("/api/show", ShowModelHandler) r.POST("/api/show", ShowModelHandler)
for _, method := range []string{http.MethodGet, http.MethodHead} {
r.Handle(method, "/", func(c *gin.Context) {
c.String(http.StatusOK, "Ollama is running")
})
r.Handle(method, "/api/tags", ListModelsHandler)
}
log.Printf("Listening on %s", ln.Addr()) log.Printf("Listening on %s", ln.Addr())
s := &http.Server{ s := &http.Server{
Handler: r, Handler: r,
@@ -625,23 +538,15 @@ func Serve(ln net.Listener, allowOrigins []string) error {
// listen for a ctrl+c and stop any loaded llm // listen for a ctrl+c and stop any loaded llm
signals := make(chan os.Signal, 1) signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM) signal.Notify(signals, syscall.SIGINT)
go func() { go func() {
<-signals <-signals
if loaded.llm != nil { if loaded.llm != nil {
loaded.llm.Close() loaded.llm.Close()
} }
os.RemoveAll(workDir)
os.Exit(0) os.Exit(0)
}() }()
if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Printf("Warning: GPU support may not enabled, check you have installed install GPU drivers: %v", err)
}
}
return s.Serve(ln) return s.Serve(ln)
} }
@@ -659,7 +564,6 @@ func streamResponse(c *gin.Context, ch chan any) {
return false return false
} }
// Delineate chunks with new-line delimiter
bts = append(bts, '\n') bts = append(bts, '\n')
if _, err := w.Write(bts); err != nil { if _, err := w.Write(bts); err != nil {
log.Printf("streamResponse: w.Write failed with %s", err) log.Printf("streamResponse: w.Write failed with %s", err)

View File

@@ -14,12 +14,7 @@ import (
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
) )
const ( func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, error) {
redirectChunkSize int64 = 1024 * 1024 * 1024
regularChunkSize int64 = 95 * 1024 * 1024
)
func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, int64, error) {
requestURL := mp.BaseURL() requestURL := mp.BaseURL()
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/") requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/")
if layer.From != "" { if layer.From != "" {
@@ -32,26 +27,20 @@ func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *Regis
resp, err := makeRequestWithRetry(ctx, "POST", requestURL, nil, nil, regOpts) resp, err := makeRequestWithRetry(ctx, "POST", requestURL, nil, nil, regOpts)
if err != nil { if err != nil {
log.Printf("couldn't start upload: %v", err) log.Printf("couldn't start upload: %v", err)
return nil, 0, err return nil, err
} }
defer resp.Body.Close() defer resp.Body.Close()
location := resp.Header.Get("Docker-Upload-Location") // Extract UUID location from header
chunkSize := redirectChunkSize location := resp.Header.Get("Location")
if location == "" { if location == "" {
location = resp.Header.Get("Location") return nil, fmt.Errorf("location header is missing in response")
chunkSize = regularChunkSize
} }
locationURL, err := url.Parse(location) return url.Parse(location)
if err != nil {
return nil, 0, err
}
return locationURL, chunkSize, nil
} }
func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSize int64, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error { func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
// TODO allow resumability // TODO allow resumability
// TODO allow canceling uploads via DELETE // TODO allow canceling uploads via DELETE
@@ -66,40 +55,89 @@ func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSiz
} }
defer f.Close() defer f.Close()
pw := ProgressWriter{ // 95MB chunk size
status: fmt.Sprintf("uploading %s", layer.Digest), chunkSize := 95 * 1024 * 1024
digest: layer.Digest,
total: layer.Size,
fn: fn,
}
for offset := int64(0); offset < layer.Size; { for offset := int64(0); offset < int64(layer.Size); {
chunk := layer.Size - offset chunk := int64(layer.Size) - offset
if chunk > chunkSize { if chunk > int64(chunkSize) {
chunk = chunkSize chunk = int64(chunkSize)
} }
resp, err := uploadBlobChunk(ctx, http.MethodPatch, requestURL, f, offset, chunk, regOpts, &pw) sectionReader := io.NewSectionReader(f, int64(offset), chunk)
if err != nil { for try := 0; try < MaxRetries; try++ {
fn(api.ProgressResponse{ r, w := io.Pipe()
Status: fmt.Sprintf("error uploading chunk: %v", err), defer r.Close()
Digest: layer.Digest, go func() {
Total: layer.Size, defer w.Close()
Completed: offset,
})
return err for chunked := int64(0); chunked < chunk; {
} n, err := io.CopyN(w, sectionReader, 1024*1024)
if err != nil && !errors.Is(err, io.EOF) {
fn(api.ProgressResponse{
Status: fmt.Sprintf("error reading chunk: %v", err),
Digest: layer.Digest,
Total: layer.Size,
Completed: int(offset),
})
offset += chunk return
location := resp.Header.Get("Docker-Upload-Location") }
if location == "" {
location = resp.Header.Get("Location")
}
requestURL, err = url.Parse(location) chunked += n
if err != nil { fn(api.ProgressResponse{
return err Status: fmt.Sprintf("uploading %s", layer.Digest),
Digest: layer.Digest,
Total: layer.Size,
Completed: int(offset) + int(chunked),
})
}
}()
headers := make(http.Header)
headers.Set("Content-Type", "application/octet-stream")
headers.Set("Content-Length", strconv.Itoa(int(chunk)))
headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
resp, err := makeRequest(ctx, "PATCH", requestURL, headers, r, regOpts)
if err != nil && !errors.Is(err, io.EOF) {
fn(api.ProgressResponse{
Status: fmt.Sprintf("error uploading chunk: %v", err),
Digest: layer.Digest,
Total: layer.Size,
Completed: int(offset),
})
return err
}
defer resp.Body.Close()
switch {
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir, regOpts)
if err != nil {
return err
}
regOpts.Token = token
if _, err := sectionReader.Seek(0, io.SeekStart); err != nil {
return err
}
continue
case resp.StatusCode >= http.StatusBadRequest:
body, _ := io.ReadAll(resp.Body)
return fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
}
offset += sectionReader.Size()
requestURL, err = url.Parse(resp.Header.Get("Location"))
if err != nil {
return err
}
break
} }
} }
@@ -125,90 +163,3 @@ func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSiz
} }
return nil return nil
} }
func uploadBlobChunk(ctx context.Context, method string, requestURL *url.URL, r io.ReaderAt, offset, limit int64, opts *RegistryOptions, pw *ProgressWriter) (*http.Response, error) {
sectionReader := io.NewSectionReader(r, offset, limit)
headers := make(http.Header)
headers.Set("Content-Type", "application/octet-stream")
headers.Set("Content-Length", strconv.Itoa(int(limit)))
headers.Set("X-Redirect-Uploads", "1")
if method == http.MethodPatch {
headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
}
for try := 0; try < MaxRetries; try++ {
resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sectionReader, pw), opts)
if err != nil && !errors.Is(err, io.EOF) {
return nil, err
}
defer resp.Body.Close()
switch {
case resp.StatusCode == http.StatusTemporaryRedirect:
location, err := resp.Location()
if err != nil {
return nil, err
}
pw.completed = offset
if _, err := uploadBlobChunk(ctx, http.MethodPut, location, r, offset, limit, nil, pw); err != nil {
// retry
log.Printf("retrying redirected upload: %v", err)
continue
}
return resp, nil
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir)
if err != nil {
return nil, err
}
opts.Token = token
pw.completed = offset
sectionReader = io.NewSectionReader(r, offset, limit)
continue
case resp.StatusCode >= http.StatusBadRequest:
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
}
return resp, nil
}
return nil, fmt.Errorf("max retries exceeded")
}
type ProgressWriter struct {
status string
digest string
bucket int64
completed int64
total int64
fn func(api.ProgressResponse)
}
func (pw *ProgressWriter) Write(b []byte) (int, error) {
n := len(b)
pw.bucket += int64(n)
// throttle status updates to not spam the client
if pw.bucket >= 1024*1024 || pw.completed+pw.bucket >= pw.total {
pw.completed += pw.bucket
pw.fn(api.ProgressResponse{
Status: pw.status,
Digest: pw.digest,
Total: pw.total,
Completed: pw.completed,
})
pw.bucket = 0
}
return n, nil
}