Compare commits


105 Commits

Author SHA1 Message Date
Jeffrey Morgan
5306b0269d Update linux.md 2023-09-25 16:10:32 -07:00
Michael Yang
7de0c8345d Merge pull request #595 from jmorganca/mxyng/install.sh
ignore systemctl is-system-running exit code
2023-09-25 15:49:47 -07:00
Michael Yang
1b9dcab3ab ignore systemctl is-system-running exit code 2023-09-25 15:47:45 -07:00
Bruce MacDonald
86279f4ae3 unbound max num gpu layers (#591)
---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-25 18:36:46 -04:00
Michael Yang
b934bf23e6 exit on unknown distro (#594) 2023-09-25 15:30:58 -07:00
Michael Yang
2b8ef455ad Merge pull request #593 from jmorganca/mxyng/install.sh
update install.sh
2023-09-25 14:09:40 -07:00
Michael Yang
0c5f47177c update install.sh 2023-09-25 14:01:44 -07:00
Michael Yang
1210db2924 Merge pull request #592 from jmorganca/mxyng/install.sh
fix dkms on debian
2023-09-25 12:59:01 -07:00
Michael Yang
d0854bf1e6 fix dkms on debian 2023-09-25 12:57:25 -07:00
Michael Yang
8396463255 Merge pull request #590 from jmorganca/mxyng/install.sh
fix dkms install
2023-09-25 12:17:31 -07:00
Michael Yang
a027bbf4d7 fix dkms install 2023-09-25 12:16:41 -07:00
Michael Yang
ed94a3dd02 Merge pull request #589 from jmorganca/mxyng/install.sh
update install.sh
2023-09-25 11:08:25 -07:00
Michael Yang
f14f62ab3b update install.sh 2023-09-25 11:05:38 -07:00
Jeffrey Morgan
0fb5268496 Update linux.md 2023-09-25 10:06:23 -07:00
Bruce MacDonald
c65edb1506 fix linux installer warning logs (#588) 2023-09-25 11:22:56 -04:00
Twan L
1605af32ec Added a new community project (#574) 2023-09-25 10:40:59 -04:00
Jeffrey Morgan
ee3032ad89 improvements to docs/linux.md 2023-09-24 21:50:07 -07:00
Jeffrey Morgan
5b7a27281d improvements to docs/linux.md 2023-09-24 21:38:23 -07:00
Jeffrey Morgan
d2a784e33e add docs/linux.md 2023-09-24 21:34:44 -07:00
Jeffrey Morgan
413a2e4f91 set DEBIAN_FRONTEND=noninteractive correctly 2023-09-24 20:35:42 -07:00
Patrick Devine
b5614f3ebc fix end-of-line issue with the new prompt (#582) 2023-09-23 17:20:30 -07:00
Jeffrey Morgan
8b2ba9cab8 minor improvements to install.sh 2023-09-23 11:20:39 -04:00
Jeffrey Morgan
e29662ab5c fix minor install script issues on debian 2023-09-23 10:25:47 -04:00
Bruce MacDonald
cbc40aa996 debian installer support (#579)
* debian installer support

- normalize os name to lowercase
- check needed commands are available
- dont check sudo when root user
- share common install commands
- support debian cuda install
- skip aarm cuda install
- system user shared home dir

* refactor and add other platforms (#580)

---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-23 09:46:47 -04:00
Jeffrey Morgan
5cb82540c9 install.sh: update install url 2023-09-23 09:35:14 -04:00
Jeffrey Morgan
d7849a1dc9 add .env to .dockerignore 2023-09-23 00:53:48 -04:00
Jeffrey Morgan
01c44d687e add multi line strings to final prompt 2023-09-23 00:27:24 -04:00
Jeffrey Morgan
9b12a511ca check other request fields before load short circuit in /api/generate 2023-09-22 23:50:55 -04:00
Jeffrey Morgan
e20362e0d5 fix multi line input in ollama run 2023-09-22 23:49:35 -04:00
Patrick Devine
c928ceb927 add word wrapping for lines which are longer than the terminal width (#553) 2023-09-22 13:36:08 -07:00
Michael Yang
e1a0846483 Merge pull request #571 from jmorganca/mxyng/update-dockerfile
update dockerfile.cuda
2023-09-22 12:34:41 -07:00
Jeffrey Morgan
f997e29e45 Add Dockerfile.build for building linux binaries (#558)
Add `Dockerfile.build` for building linux binaries

---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-22 15:20:12 -04:00
Patrick Devine
87d9efb364 switch to forked readline lib which doesn't wreck the repl prompt (#578) 2023-09-22 12:17:45 -07:00
Michael Yang
93d3a2568d replace dockerfile 2023-09-22 11:57:38 -07:00
Michael Yang
5a81390b24 update dockerfile.cuda 2023-09-22 11:57:38 -07:00
Michael Yang
a89ef99aed Merge pull request #575 from jmorganca/mxyng/fix-ipv6-only
fix ipv6 parse ip
2023-09-22 11:47:11 -07:00
Bruce MacDonald
dc0c725ceb ubuntu cuda drivers (#576) 2023-09-22 19:43:14 +01:00
Bruce MacDonald
5d71bda478 close llm on interrupt (#577) 2023-09-22 19:41:52 +01:00
Michael Yang
88897a90e4 fix ipv6 parse ip 2023-09-22 10:41:32 -07:00
Bruce MacDonald
9df31c3518 linux installer script (#534)
Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-22 17:01:03 +01:00
Michael Yang
2044f9d4da Merge pull request #570 from jmorganca/mxyng/head-request
fix HEAD request
2023-09-21 16:56:17 -07:00
Michael Yang
0d186f3b33 Merge pull request #569 from jmorganca/mxyng/update-submodules
silence warm up log
2023-09-21 16:52:42 -07:00
Michael Yang
82f5b66c01 register HEAD /api/tags 2023-09-21 16:38:03 -07:00
Michael Yang
c986694367 fix HEAD / request
HEAD request should respond like their GET counterparts except without a
response body.
2023-09-21 16:35:58 -07:00
Michael Yang
058d0cd04b silence warm up log 2023-09-21 14:53:33 -07:00
Michael Yang
ee1c994d15 update submodule (#567) 2023-09-21 16:22:23 -04:00
Bruce MacDonald
4cba75efc5 remove tmp directories created by previous servers (#559)
* remove tmp directories created by previous servers

* clean up on server stop

* Update routes.go

* Update server/routes.go

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

* create top-level temp ollama dir

* check file exists before creating

---------

Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>
Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-21 20:38:49 +01:00
Michael Yang
8c83701e9f Merge pull request #566 from jmorganca/mxyng/api-check-model-exists
Use API to check if model exists and pull if necessary
2023-09-21 10:35:14 -07:00
Michael Yang
6137b12799 validate existence and pull model using api 2023-09-21 09:55:34 -07:00
Michael Yang
1fabba474b refactor default allow origins
this should be less error prone
2023-09-21 09:42:25 -07:00
Michael Yang
765770efdb Merge pull request #562 from jmorganca/mxyng/fix-ollama-host
fix OLLAMA_HOST parsing for ip6
2023-09-20 19:54:47 -07:00
Michael Yang
9297ff8330 fix OLLAMA_HOST parsing for ip6 2023-09-20 18:52:57 -07:00
Michael Yang
ee4fd16f2c Merge pull request #556 from jmorganca/pack-cuda
pack in cuda libs
2023-09-20 15:02:36 -07:00
Michael Yang
a9ed7cc6aa rename generate.go 2023-09-20 14:42:17 -07:00
Michael Yang
6c6a31a1e8 embed libraries using cmake 2023-09-20 14:41:57 -07:00
Bruce MacDonald
fc6ec356fc remove libcuda.so 2023-09-20 20:36:14 +01:00
Bruce MacDonald
1255bc9b45 only package 11.8 runner 2023-09-20 20:00:41 +01:00
Michael Yang
084e4c782a Merge pull request #557 from jmorganca/mxyng/cleanup
fix impossible condition
2023-09-20 11:51:01 -07:00
Michael Yang
58ffa03d8b fix impossible condition 2023-09-20 11:27:44 -07:00
Michael Yang
637f8bc6a5 Merge pull request #536 from jmorganca/mxyng/redirect-uploads
explicitly follow upload redirects
2023-09-20 11:27:03 -07:00
Michael Yang
499e9007a5 pick chunksize based on location 2023-09-20 11:10:24 -07:00
Bruce MacDonald
b9bb5ca288 use cuda_version 2023-09-20 17:58:16 +01:00
Bruce MacDonald
4e8be787c7 pack in cuda libs 2023-09-20 17:40:42 +01:00
Michael Yang
aa45d7c1df draft: explicitly follow upload redirects 2023-09-19 13:36:58 -07:00
Michael Yang
e35565c567 Merge pull request #555 from jmorganca/mxyng/fix-windows-startup
fix build
2023-09-19 10:51:58 -07:00
Michael Yang
a5520bfb42 fix build 2023-09-19 10:42:24 -07:00
Michael Yang
2627c464ba Merge pull request #554 from jmorganca/mxyng/fix-windows-startup
fix mkdir on windows
2023-09-19 09:42:12 -07:00
Michael Yang
b58d5d16b0 fix mkdir on windows 2023-09-19 09:41:13 -07:00
Patrick Devine
24580df958 only add a layer if there is actual data (#535) 2023-09-18 13:47:45 -07:00
Patrick Devine
80dd44e80a Cmd changes (#541) 2023-09-18 12:26:56 -07:00
James Braza
94e1d96b29 Updated README section on community projects for table (#550) 2023-09-18 15:22:50 -04:00
Bruce MacDonald
66003e1d05 subprocess improvements (#524)
* subprocess improvements

- increase start-up timeout
- when runner fails to start fail rather than timing out
- try runners in order rather than choosing 1 runner
- embed metal runner in metal dir rather than gpu
- refactor logging and error messages

* Update llama.go

* Update llama.go

* simplify by using glob
2023-09-18 15:16:32 -04:00
Michael Yang
c345053a8b Merge pull request #537 from jmorganca/mxyng/upload
fix error on upload chunk
2023-09-15 17:48:39 -07:00
Michael Yang
08d7c2a944 fix error on upload chunk 2023-09-15 15:59:30 -07:00
Michael Yang
bc9573dcb1 Merge pull request #530 from jmorganca/mxyng/progresswriter
implement ProgressWriter
2023-09-15 12:43:46 -07:00
Michael Yang
e53bc57d4d split uploadBlobChunked 2023-09-14 17:22:05 -07:00
Michael Yang
f0b398d17f implement ProgressWriter 2023-09-14 17:22:04 -07:00
Patrick Devine
8efbc5df55 DRAFT: add a simple python client to access ollama (#522) 2023-09-14 16:37:38 -07:00
Michael Yang
ccc3e9ac6d Merge pull request #531 from jmorganca/mxyng/content-length
set request.ContentLength
2023-09-14 13:33:11 -07:00
Michael Yang
daa4f096f9 set request.ContentLength
This informs the HTTP client the content length is known and disables
chunked Transfer-Encoding
2023-09-14 13:32:44 -07:00
Michael Yang
3ee85f1c6c Merge pull request #526 from jmorganca/mxyng/cleanup
remove unused
2023-09-14 13:10:59 -07:00
Bruce MacDonald
2540c9181c support for packaging in multiple cuda runners (#509)
* enable packaging multiple cuda versions
* use nvcc cuda version if available

---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-14 15:08:13 -04:00
Michael Yang
83ffb154bc Merge pull request #507 from jmorganca/mxyng/build
update docker image
2023-09-14 11:25:59 -07:00
Michael Yang
9aa192c812 update cuda docker image 2023-09-14 11:25:20 -07:00
Matt Williams
fc8707686f Update API docs (#527)
* Update API docs

Signed-off-by: Matt Williams <m@technovangelist.com>

* strange TOC was getting auto generated

Signed-off-by: Matt Williams <m@technovangelist.com>

* Update docs/api.md

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update docs/api.md

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update docs/api.md

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update api.md

---------

Signed-off-by: Matt Williams <m@technovangelist.com>
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
Co-authored-by: Michael Chiang <mchiang0610@users.noreply.github.com>
2023-09-14 08:51:26 -07:00
Michael Yang
f89c23764b Merge pull request #525 from jmorganca/mxyng/falcon-decode
fix: add falcon.go
2023-09-13 15:08:47 -07:00
Michael Yang
e6881cabd0 remove unused 2023-09-13 14:48:33 -07:00
Michael Yang
d028853879 fix: add falcon.go 2023-09-13 14:47:37 -07:00
Michael Yang
949553db23 Merge pull request #519 from jmorganca/mxyng/decode
Mxyng/decode
2023-09-13 12:43:57 -07:00
Michael Yang
0c5a454361 fix model type for 70b 2023-09-12 15:12:59 -07:00
Bruce MacDonald
f59c4d03f7 fix ggml arm64 cuda build (#520) 2023-09-12 17:06:48 -04:00
Michael Yang
7dee25a07f fix falcon decode
get model and file type from bin file
2023-09-12 12:34:53 -07:00
Bruce MacDonald
f221637053 first pass at linux gpu support (#454)
* linux gpu support
* handle multiple gpus
* add cuda docker image (#488)
---------

Co-authored-by: Michael Yang <mxyng@pm.me>
2023-09-12 11:04:35 -04:00
Patrick Devine
45ac07cd02 create the blobs directory correctly (#508) 2023-09-11 14:54:52 -07:00
Jeffrey Morgan
7d749cc787 fix darwin build script 2023-09-11 16:31:46 -04:00
Patrick Devine
e7e91cd71c add autoprune to remove unused layers (#491) 2023-09-11 11:46:35 -07:00
Jeffrey Morgan
3920e15386 add model format to config layer (#497) 2023-09-09 17:53:44 -04:00
Michael Yang
41e976edde Merge pull request #492 from jmorganca/mxyng/nil-pointer
fix nil pointer dereference
2023-09-07 17:25:23 -07:00
Michael Yang
de227b620f fix nil pointer dereference 2023-09-07 17:24:31 -07:00
Michael Yang
63def6ca49 Merge pull request #487 from jmorganca/mxyng/dockerignore
update dockerignore
2023-09-07 14:16:17 -07:00
Michael Yang
738fe9c4aa Merge pull request #486 from jmorganca/mxyng/fix-push
fix: retry push on expired token
2023-09-07 13:58:34 -07:00
Michael Yang
a8da0bacbe update dockerignore 2023-09-07 13:36:25 -07:00
Michael Yang
bf146fb072 fix retry on unauthorized chunk 2023-09-07 12:02:04 -07:00
Michael Yang
f0f4943577 fix get auth token 2023-09-07 12:01:56 -07:00
Bruce MacDonald
09dd2aeff9 GGUF support (#441) 2023-09-07 13:55:37 -04:00
41 changed files with 2213 additions and 688 deletions

@@ -1,4 +1,8 @@
 .vscode
 ollama
 app
+dist
+scripts
 llm/llama.cpp/ggml
+llm/llama.cpp/gguf
+.env

.gitmodules (vendored, 12 changed lines)

@@ -1,4 +1,10 @@
 [submodule "llm/llama.cpp/ggml"]
 	path = llm/llama.cpp/ggml
 	url = https://github.com/ggerganov/llama.cpp.git
 	ignore = dirty
+	shallow = true
+[submodule "llm/llama.cpp/gguf"]
+	path = llm/llama.cpp/gguf
+	url = https://github.com/ggerganov/llama.cpp.git
+	ignore = dirty
+	shallow = true

@@ -1,18 +1,28 @@
-FROM golang:alpine
+ARG CUDA_VERSION=12.2.0
+FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04
+ARG TARGETARCH
+ARG VERSION=0.0.0
 WORKDIR /go/src/github.com/jmorganca/ollama
-RUN apk add --no-cache git build-base cmake
+RUN apt-get update && apt-get install -y git build-essential cmake
+ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
 COPY . .
-RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .
+ENV GOARCH=$TARGETARCH
+RUN /usr/local/go/bin/go generate ./... \
+    && /usr/local/go/bin/go build -ldflags "-linkmode=external -extldflags='-static' -X=github.com/jmorganca/ollama/version.Version=$VERSION -X=github.com/jmorganca/ollama/server.mode=release" .

-FROM alpine
+FROM ubuntu:22.04
 ENV OLLAMA_HOST 0.0.0.0
-RUN apk add --no-cache libstdc++
+RUN apt-get update && apt-get install -y ca-certificates
 ARG USER=ollama
 ARG GROUP=ollama
-RUN addgroup $GROUP && adduser -D -G $GROUP $USER
+RUN groupadd $GROUP && useradd -m -g $GROUP $USER
 COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

Dockerfile.build (new file, 29 lines)

@@ -0,0 +1,29 @@
ARG VERSION=0.0.0
# centos7 amd64 dependencies
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-devel-centos7 AS base-amd64
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
yum update -y && \
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
# centos8 arm64 dependencies
FROM --platform=linux/arm64 nvidia/cuda:11.4.3-devel-centos8 AS base-arm64
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y git cmake
FROM base-${TARGETARCH}
ARG TARGETARCH
# install go
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
ENV GOARCH=$TARGETARCH
RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build -ldflags "-X=github.com/jmorganca/ollama/version.Version=$VERSION -X=github.com/jmorganca/ollama/server.mode=release" .

@@ -206,10 +206,17 @@ curl -X POST http://localhost:11434/api/generate -d '{
 ## Community Projects using Ollama

-- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with a question-answering [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa).
-- [Continue](https://github.com/continuedev/continue) - embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline.
-- [LiteLLM](https://github.com/BerriAI/litellm) a lightweight python package to simplify LLM API calls
-- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) - interact with Ollama as a chatbot on Discord.
-- [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) - Raycast extension to use Ollama for local llama inference on Raycast.
-- [Simple HTML UI for Ollama](https://github.com/rtcfirefly/ollama-ui)
-- [Emacs client](https://github.com/zweifisch/ollama) for Ollama
+| Project | Description |
+| -------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [LangChain][1] and [LangChain.js][2] | Also, there is a question-answering [example][3]. |
+| [Continue](https://github.com/continuedev/continue) | Embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline. |
+| [LiteLLM](https://github.com/BerriAI/litellm) | Lightweight Python package to simplify LLM API calls. |
+| [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) | Interact with Ollama as a chatbot on Discord. |
+| [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) | Raycast extension to use Ollama for local llama inference on Raycast. |
+| [Simple HTML UI](https://github.com/rtcfirefly/ollama-ui) | Also, there is a Chrome extension. |
+| [Ollama-GUI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file) | 🖥️ Mac Chat Interface ⚡️ |
+| [Emacs client](https://github.com/zweifisch/ollama) | |
+
+[1]: https://python.langchain.com/docs/integrations/llms/ollama
+[2]: https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama
+[3]: https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa

api/client.py (new file, 225 lines)

@@ -0,0 +1,225 @@
import os
import json
import requests

BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')


# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
# The final response object will include statistics and additional data from the request. Use the callback function to override
# the default handler.
def generate(model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
    try:
        url = f"{BASE_URL}/api/generate"
        payload = {
            "model": model_name,
            "prompt": prompt,
            "system": system,
            "template": template,
            "context": context,
            "options": options
        }

        # Remove keys with None values
        payload = {k: v for k, v in payload.items() if v is not None}

        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Creating a variable to hold the context history of the final chunk
            final_context = None

            # Variable to hold concatenated response strings if no callback is provided
            full_response = ""

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # If this is not the last chunk, add the "response" field value to full_response and print it
                        if not chunk.get("done"):
                            response_piece = chunk.get("response", "")
                            full_response += response_piece
                            print(response_piece, end="", flush=True)

                    # Check if it's the last chunk (done is true)
                    if chunk.get("done"):
                        final_context = chunk.get("context")

            # Return the full response and the final context
            return full_response, final_context
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None, None


# Create a model from a Modelfile. Use the callback function to override the default handler.
def create(model_name, model_path, callback=None):
    try:
        url = f"{BASE_URL}/api/create"
        payload = {"name": model_name, "path": model_path}

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the status
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the status
                    chunk = json.loads(line)

                    if callback:
                        callback(chunk)
                    else:
                        print(f"Status: {chunk.get('status')}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")


# Pull a model from the model registry. Cancelled pulls are resumed from where they left off, and multiple
# calls will share the same download progress. Use the callback function to override the default handler.
def pull(model_name, insecure=False, callback=None):
    try:
        url = f"{BASE_URL}/api/pull"
        payload = {
            "name": model_name,
            "insecure": insecure
        }

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # Print the status message directly to the console
                        print(chunk.get('status', ''), end='', flush=True)

                        # If there's layer data, you might also want to print that (adjust as necessary)
                        if 'digest' in chunk:
                            print(f" - Digest: {chunk['digest']}", end='', flush=True)
                            print(f" - Total: {chunk['total']}", end='', flush=True)
                            print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
                        else:
                            print()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")


# Push a model to the model registry. Use the callback function to override the default handler.
def push(model_name, insecure=False, callback=None):
    try:
        url = f"{BASE_URL}/api/push"
        payload = {
            "name": model_name,
            "insecure": insecure
        }

        # Making a POST request with the stream parameter set to True to handle streaming responses
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            # Iterating over the response line by line and displaying the details
            for line in response.iter_lines():
                if line:
                    # Parsing each line (JSON chunk) and extracting the details
                    chunk = json.loads(line)

                    # If a callback function is provided, call it with the chunk
                    if callback:
                        callback(chunk)
                    else:
                        # Print the status message directly to the console
                        print(chunk.get('status', ''), end='', flush=True)

                        # If there's layer data, you might also want to print that (adjust as necessary)
                        if 'digest' in chunk:
                            print(f" - Digest: {chunk['digest']}", end='', flush=True)
                            print(f" - Total: {chunk['total']}", end='', flush=True)
                            print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
                        else:
                            print()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")


# List models that are available locally.
def list():
    try:
        response = requests.get(f"{BASE_URL}/api/tags")
        response.raise_for_status()
        data = response.json()
        models = data.get('models', [])
        return models
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


# Copy a model. Creates a model with another name from an existing model.
def copy(source, destination):
    try:
        # Create the JSON payload
        payload = {
            "source": source,
            "destination": destination
        }
        response = requests.post(f"{BASE_URL}/api/copy", json=payload)
        response.raise_for_status()

        # If the request was successful, return a message indicating that the copy was successful
        return "Copy successful"
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


# Delete a model and its data.
def delete(model_name):
    try:
        url = f"{BASE_URL}/api/delete"
        payload = {"name": model_name}
        response = requests.delete(url, json=payload)
        response.raise_for_status()
        return "Delete successful"
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


# Show info about a model.
def show(model_name):
    try:
        url = f"{BASE_URL}/api/show"
        payload = {"name": model_name}
        response = requests.post(url, json=payload)
        response.raise_for_status()

        # Parse the JSON response and return it
        data = response.json()
        return data
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


def heartbeat():
    try:
        url = f"{BASE_URL}/"
        response = requests.head(url)
        response.raise_for_status()
        return "Ollama is running"
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return "Ollama is not running"

@@ -291,7 +291,7 @@ func DefaultOptions() Options {
 		NumCtx:   2048,
 		NumKeep:  -1,
 		NumBatch: 512,
-		NumGPU:   1,
+		NumGPU:   -1, // -1 here indicates that NumGPU should be set dynamically
 		NumGQA:   1,
 		LowVRAM:  false,
 		F16KV:    true,


@@ -11,20 +11,19 @@ import (
 	"io"
 	"log"
 	"net"
-	"net/http"
 	"os"
 	"os/exec"
-	"path"
 	"path/filepath"
 	"runtime"
 	"strings"
 	"time"

-	"github.com/chzyer/readline"
 	"github.com/dustin/go-humanize"
 	"github.com/olekukonko/tablewriter"
+	"github.com/pdevine/readline"
 	"github.com/spf13/cobra"
 	"golang.org/x/crypto/ssh"
+	"golang.org/x/term"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/format"
@@ -33,6 +32,26 @@ import (
 	"github.com/jmorganca/ollama/version"
 )

+type Painter struct {
+	IsMultiLine bool
+}
+
+func (p Painter) Paint(line []rune, _ int) []rune {
+	termType := os.Getenv("TERM")
+	if termType == "xterm-256color" && len(line) == 0 {
+		var prompt string
+		if p.IsMultiLine {
+			prompt = "Use \"\"\" to end multi-line input"
+		} else {
+			prompt = "Send a message (/? for help)"
+		}
+		return []rune(fmt.Sprintf("\033[38;5;245m%s\033[%dD\033[0m", prompt, len(prompt)))
+	}
+	// add a space and a backspace to prevent the cursor from walking up the screen
+	line = append(line, []rune(" \b")...)
+	return line
+}
+
 func CreateHandler(cmd *cobra.Command, args []string) error {
 	filename, _ := cmd.Flags().GetString("file")
 	filename, err := filepath.Abs(filename)
@@ -98,39 +117,28 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 }

 func RunHandler(cmd *cobra.Command, args []string) error {
-	insecure, err := cmd.Flags().GetBool("insecure")
+	client, err := api.FromEnv()
 	if err != nil {
 		return err
 	}

-	mp := server.ParseModelPath(args[0])
+	models, err := client.List(context.Background())
 	if err != nil {
 		return err
 	}

-	if mp.ProtocolScheme == "http" && !insecure {
-		return fmt.Errorf("insecure protocol http")
-	}
-
-	fp, err := mp.GetManifestPath(false)
-	if err != nil {
-		return err
-	}
-
-	_, err = os.Stat(fp)
-	switch {
-	case errors.Is(err, os.ErrNotExist):
-		if err := pull(args[0], insecure); err != nil {
-			var apiStatusError api.StatusError
-			if !errors.As(err, &apiStatusError) {
-				return err
-			}
-
-			if apiStatusError.StatusCode != http.StatusBadGateway {
-				return err
-			}
-		}
-	case err != nil:
+	modelName, modelTag, ok := strings.Cut(args[0], ":")
+	if !ok {
+		modelTag = "latest"
+	}
+
+	for _, model := range models.Models {
+		if model.Name == strings.Join([]string{modelName, modelTag}, ":") {
+			return RunGenerate(cmd, args)
+		}
+	}
+
+	if err := PullHandler(cmd, args); err != nil {
 		return err
 	}
@@ -387,71 +395,117 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 type generateContextKey string

 func generate(cmd *cobra.Command, model, prompt string) error {
-	if len(strings.TrimSpace(prompt)) > 0 {
-		client, err := api.FromEnv()
-		if err != nil {
-			return err
-		}
-
-		spinner := NewSpinner("")
-		go spinner.Spin(60 * time.Millisecond)
-
-		var latest api.GenerateResponse
-
-		generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
-		if !ok {
-			generateContext = []int{}
-		}
-
-		request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
-		fn := func(response api.GenerateResponse) error {
-			if !spinner.IsFinished() {
-				spinner.Finish()
-			}
-
-			latest = response
-
-			fmt.Print(response.Response)
-			return nil
-		}
-
-		if err := client.Generate(context.Background(), &request, fn); err != nil {
-			if strings.Contains(err.Error(), "failed to load model") {
-				// tell the user to check the server log, if it exists locally
-				home, nestedErr := os.UserHomeDir()
-				if nestedErr != nil {
-					// return the original error
-					return err
-				}
-				logPath := filepath.Join(home, ".ollama", "logs", "server.log")
-				if _, nestedErr := os.Stat(logPath); nestedErr == nil {
-					err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
-				}
-			}
-			return err
-		}
-
-		fmt.Println()
-		fmt.Println()
-
-		if !latest.Done {
-			return errors.New("unexpected end of response")
-		}
-
-		verbose, err := cmd.Flags().GetBool("verbose")
-		if err != nil {
-			return err
-		}
-
-		if verbose {
-			latest.Summary()
-		}
-
-		ctx := cmd.Context()
-		ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
-		cmd.SetContext(ctx)
-	}
+	client, err := api.FromEnv()
+	if err != nil {
+		return err
+	}
+
+	spinner := NewSpinner("")
+	go spinner.Spin(60 * time.Millisecond)
+
+	var latest api.GenerateResponse
+
+	generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
+	if !ok {
+		generateContext = []int{}
+	}
+
+	var wrapTerm bool
+	termType := os.Getenv("TERM")
+	if termType == "xterm-256color" {
+		wrapTerm = true
+	}
+
+	termWidth, _, err := term.GetSize(int(0))
+	if err != nil {
+		wrapTerm = false
+	}
+
+	// override wrapping if the user turned it off
+	nowrap, err := cmd.Flags().GetBool("nowordwrap")
+	if err != nil {
+		return err
+	}
+	if nowrap {
+		wrapTerm = false
+	}
+
+	var currentLineLength int
+	var wordBuffer string
+
+	request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
+	fn := func(response api.GenerateResponse) error {
+		if !spinner.IsFinished() {
+			spinner.Finish()
+		}
+
+		latest = response
+
+		if wrapTerm {
+			for _, ch := range response.Response {
+				if currentLineLength+1 > termWidth-5 {
+					// backtrack the length of the last word and clear to the end of the line
+					fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
+					fmt.Printf("%s%c", wordBuffer, ch)
+					currentLineLength = len(wordBuffer) + 1
+				} else {
+					fmt.Print(string(ch))
+					currentLineLength += 1
+
+					switch ch {
+					case ' ':
+						wordBuffer = ""
+					case '\n':
+						currentLineLength = 0
+					default:
+						wordBuffer += string(ch)
+					}
+				}
+			}
+		} else {
+			fmt.Print(response.Response)
+		}
+
+		return nil
+	}
+
+	if err := client.Generate(context.Background(), &request, fn); err != nil {
+		if strings.Contains(err.Error(), "failed to load model") {
+			// tell the user to check the server log, if it exists locally
+			home, nestedErr := os.UserHomeDir()
+			if nestedErr != nil {
+				// return the original error
+				return err
+			}
+			logPath := filepath.Join(home, ".ollama", "logs", "server.log")
+			if _, nestedErr := os.Stat(logPath); nestedErr == nil {
+				err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
+			}
+		}
+		return err
+	}
+
+	if prompt != "" {
+		fmt.Println()
+		fmt.Println()
+	}
+
+	if !latest.Done {
+		return errors.New("unexpected end of response")
+	}
+
+	verbose, err := cmd.Flags().GetBool("verbose")
+	if err != nil {
+		return err
+	}
+
+	if verbose {
+		latest.Summary()
+	}
+
+	ctx := cmd.Context()
+	ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
+	cmd.SetContext(ctx)

 	return nil
 }
@@ -461,19 +515,21 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		return err
 	}

-	// load the model
-	if err := generate(cmd, model, ""); err != nil {
-		return err
-	}
-
 	completer := readline.NewPrefixCompleter(
 		readline.PcItem("/help"),
 		readline.PcItem("/list"),
 		readline.PcItem("/set",
 			readline.PcItem("history"),
 			readline.PcItem("nohistory"),
+			readline.PcItem("wordwrap"),
+			readline.PcItem("nowordwrap"),
 			readline.PcItem("verbose"),
 			readline.PcItem("quiet"),
+			readline.PcItem("mode",
+				readline.PcItem("vim"),
+				readline.PcItem("emacs"),
+				readline.PcItem("default"),
+			),
 		),
 		readline.PcItem("/show",
 			readline.PcItem("license"),
@@ -491,7 +547,10 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		fmt.Fprintln(os.Stderr, completer.Tree(" "))
 	}

+	var painter Painter
+
 	config := readline.Config{
+		Painter:      &painter,
 		Prompt:       ">>> ",
 		HistoryFile:  filepath.Join(home, ".ollama", "history"),
 		AutoComplete: completer,
@@ -527,6 +586,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		case isMultiLine:
 			if strings.HasSuffix(line, `"""`) {
 				isMultiLine = false
+				painter.IsMultiLine = isMultiLine
 				multiLineBuffer += strings.TrimSuffix(line, `"""`)
 				line = multiLineBuffer
 				multiLineBuffer = ""
@@ -537,6 +597,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 			}
 		case strings.HasPrefix(line, `"""`):
 			isMultiLine = true
+			painter.IsMultiLine = isMultiLine
 			multiLineBuffer = strings.TrimPrefix(line, `"""`) + " "
 			scanner.SetPrompt("... ")
 			continue
@@ -545,45 +606,42 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 			if err := ListHandler(cmd, args[1:]); err != nil {
 				return err
 			}
-			continue
 		case strings.HasPrefix(line, "/set"):
 			args := strings.Fields(line)
 			if len(args) > 1 {
 				switch args[1] {
 				case "history":
 					scanner.HistoryEnable()
-					continue
 				case "nohistory":
 					scanner.HistoryDisable()
-					continue
+				case "wordwrap":
+					cmd.Flags().Set("nowordwrap", "false")
+					fmt.Println("Set 'wordwrap' mode.")
+				case "nowordwrap":
+					cmd.Flags().Set("nowordwrap", "true")
+					fmt.Println("Set 'nowordwrap' mode.")
 				case "verbose":
 					cmd.Flags().Set("verbose", "true")
-					continue
+					fmt.Println("Set 'verbose' mode.")
 				case "quiet":
 					cmd.Flags().Set("verbose", "false")
-					continue
+					fmt.Println("Set 'quiet' mode.")
 				case "mode":
 					if len(args) > 2 {
 						switch args[2] {
 						case "vim":
 							scanner.SetVimMode(true)
-							continue
 						case "emacs", "default":
 							scanner.SetVimMode(false)
-							continue
 						default:
 							usage()
-							continue
 						}
 					} else {
 						usage()
-						continue
 					}
 				}
 			} else {
 				usage()
-				continue
 			}
 		case strings.HasPrefix(line, "/show"):
 			args := strings.Fields(line)
@@ -591,7 +649,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				resp, err := server.GetModelInfo(model)
 				if err != nil {
 					fmt.Println("error: couldn't get model")
-					continue
 				}

 				switch args[1] {
@@ -608,21 +665,22 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				default:
 					fmt.Println("error: unknown command")
 				}
-				continue
 			} else {
 				usage()
-				continue
 			}
 		case line == "/help", line == "/?":
 			usage()
-			continue
 		case line == "/exit", line == "/bye":
 			return nil
+		case strings.HasPrefix(line, "/"):
+			args := strings.Fields(line)
+			fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
 		}

-		if err := generate(cmd, model, line); err != nil {
-			return err
+		if len(line) > 0 && line[0] != '/' {
+			if err := generate(cmd, model, line); err != nil {
+				return err
+			}
 		}
 	}
 }
@@ -641,28 +699,19 @@ func generateBatch(cmd *cobra.Command, model string) error {
 }

 func RunServer(cmd *cobra.Command, _ []string) error {
-	host, port := "127.0.0.1", "11434"
-
-	parts := strings.Split(os.Getenv("OLLAMA_HOST"), ":")
-	if ip := net.ParseIP(parts[0]); ip != nil {
-		host = ip.String()
-	}
-
-	if len(parts) > 1 {
-		port = parts[1]
-	}
-
-	// deprecated: include port in OLLAMA_HOST
-	if p := os.Getenv("OLLAMA_PORT"); p != "" {
-		port = p
-	}
-
-	err := initializeKeypair()
+	host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
 	if err != nil {
+		host, port = "127.0.0.1", "11434"
+		if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
+			host = ip.String()
+		}
+	}
+
+	if err := initializeKeypair(); err != nil {
 		return err
 	}

-	ln, err := net.Listen("tcp", fmt.Sprintf("%s:%s", host, port))
+	ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
 	if err != nil {
 		return err
 	}
@@ -672,6 +721,12 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 		origins = strings.Split(o, ",")
 	}

+	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+		if err := server.PruneLayers(); err != nil {
+			return err
+		}
+	}
+
 	return server.Serve(ln, origins)
 }
@@ -697,7 +752,7 @@ func initializeKeypair() error {
 		return err
 	}

-	err = os.MkdirAll(path.Dir(privKeyPath), 0o700)
+	err = os.MkdirAll(filepath.Dir(privKeyPath), 0o755)
 	if err != nil {
 		return fmt.Errorf("could not create directory %w", err)
 	}
@@ -825,6 +880,7 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("verbose", false, "Show timings for response")
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
+	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")

 	serveCmd := &cobra.Command{
 		Use:     "serve",


@@ -3,18 +3,21 @@
 ## Endpoints

 - [Generate a completion](#generate-a-completion)
-- [Create a model](#create-a-model)
-- [List local models](#list-local-models)
-- [Copy a model](#copy-a-model)
-- [Delete a model](#delete-a-model)
-- [Pull a model](#pull-a-model)
-- [Generate embeddings](#generate-embeddings)
+- [Create a Model](#create-a-model)
+- [List Local Models](#list-local-models)
+- [Show Model Information](#show-model-information)
+- [Copy a Model](#copy-a-model)
+- [Delete a Model](#delete-a-model)
+- [Pull a Model](#pull-a-model)
+- [Push a Model](#push-a-model)
+- [Generate Embeddings](#generate-embeddings)

 ## Conventions

 ### Model names

-Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and if not provided will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations

@@ -22,7 +25,7 @@ All durations are returned in nanoseconds.
 ## Generate a completion

-```
+```shell
 POST /api/generate
 ```

@@ -42,7 +45,7 @@ Advanced parameters:
 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/generate -d '{
   "model": "llama2:7b",
   "prompt": "Why is the sky blue?"

@@ -95,7 +98,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 ## Create a Model

-```
+```shell
 POST /api/create
 ```

@@ -108,7 +111,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/create -d '{
   "name": "mario",
   "path": "~/Modelfile"

@@ -117,7 +120,7 @@ curl -X POST http://localhost:11434/api/create -d '{
 ### Response

-A stream of JSON objects. When finished, `status` is `success`
+A stream of JSON objects. When finished, `status` is `success`.

 ```json
 {

@@ -127,7 +130,7 @@ A stream of JSON objects. When finished, `status` is `success`
 ## List Local Models

-```
+```shell
 GET /api/tags
 ```

@@ -135,7 +138,7 @@ List models that are available locally.
 ### Request

-```
+```shell
 curl http://localhost:11434/api/tags
 ```

@@ -158,9 +161,40 @@
   }
 }
 ```

+## Show Model Information
+
+```shell
+POST /api/show
+```
+
+Show details about a model including modelfile, template, parameters, license, and system prompt.
+
+### Parameters
+
+- `name`: name of the model to show
+
+### Request
+
+```shell
+curl http://localhost:11434/api/show -d '{
+  "name": "llama2:7b"
+}'
+```
+
+### Response
+
+```json
+{
+  "license": "<contents of license block>",
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
+  "parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
+  "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
+}
+```
+
 ## Copy a Model

-```
+```shell
 POST /api/copy
 ```

@@ -168,7 +202,7 @@ Copy a model. Creates a model with another name from an existing model.
 ### Request

-```
+```shell
 curl http://localhost:11434/api/copy -d '{
   "source": "llama2:7b",
   "destination": "llama2-backup"

@@ -177,7 +211,7 @@ curl http://localhost:11434/api/copy -d '{
 ## Delete a Model

-```
+```shell
 DELETE /api/delete
 ```

@@ -189,7 +223,7 @@ Delete a model and its data.
 ### Request

-```
+```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
   "name": "llama2:13b"
 }'

@@ -197,19 +231,20 @@ curl -X DELETE http://localhost:11434/api/delete -d '{
 ## Pull a Model

-```
+```shell
 POST /api/pull
 ```

-Download a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple calls to will share the same download progress.
+Download a model from the ollama library. Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress.

 ### Parameters

 - `name`: name of the model to pull
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.

 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/pull -d '{
   "name": "llama2:7b"
 }'

@@ -225,9 +260,63 @@ curl -X POST http://localhost:11434/api/pull -d '{
   }
 }
 ```

+## Push a Model
+
+```shell
+POST /api/push
+```
+
+Upload a model to a model library. Requires registering for ollama.ai and adding a public key first.
+
+### Parameters
+
+- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
+
+### Request
+
+```shell
+curl -X POST http://localhost:11434/api/push -d '{
+  "name": "mattw/pygmalion:latest"
+}'
+```
+
+### Response
+
+Streaming response that starts with:
+
+```json
+{"status":"retrieving manifest"}
+```
+
+and then:
+
+```json
+{
+  "status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total":1928429856
+}
+```
+
+Then there is a series of uploading responses:
+
+```json
+{
+  "status":"starting upload",
+  "digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total":1928429856}
+```
+
+Finally, when the upload is complete:
+
+```json
+{"status":"pushing manifest"}
+{"status":"success"}
+```
+
 ## Generate Embeddings

-```
+```shell
 POST /api/embeddings
 ```

@@ -244,7 +333,7 @@ Advanced parameters:
 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/embeddings -d '{
   "model": "llama2:7b",
   "prompt": "Here is an article about llamas..."

@@ -259,5 +348,4 @@ curl -X POST http://localhost:11434/api/embeddings -d '{
     0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
     0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
   ]
-}
-```
+}```


@@ -6,6 +6,10 @@
 Install required tools:

+- cmake version 3.24 or higher
+- go version 1.20 or higher
+- gcc version 11.4.0 or higher
+
 ```
 brew install go cmake gcc
 ```

@@ -27,3 +31,9 @@ Now you can run `ollama`:
 ```
 ./ollama
 ```
+
+## Building on Linux with GPU support
+
+- Install cmake and nvidia-cuda-toolkit
+- run `go generate ./...`
+- run `go build .`

docs/linux.md (new file, 83 lines)

@@ -0,0 +1,83 @@
# Installing Ollama on Linux
> Note: A one-line installer for Ollama is available by running:
>
> ```
> curl https://ollama.ai/install.sh | sh
> ```
## Download the `ollama` binary
Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
```
sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
```
## Start Ollama
Start Ollama by running `ollama serve`:
```
ollama serve
```
Once Ollama is running, run a model in another terminal session:
```
ollama run llama2
```
## Install CUDA drivers (optional for Nvidia GPUs)
[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
Verify that the drivers are installed by running the following command, which should print details about your GPU:
```
nvidia-smi
```
## Adding Ollama as a startup service (optional)
Create a user for Ollama:
```
sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
```
Create a service file in `/etc/systemd/system/ollama.service`:
```ini
[Unit]
Description=Ollama Service
After=network-online.target
[Service]
ExecStart=/usr/bin/ollama serve
User=ollama
Group=ollama
Restart=always
RestartSec=3
Environment="HOME=/usr/share/ollama"
[Install]
WantedBy=default.target
```
Then start the service:
```
sudo systemctl daemon-reload
sudo systemctl enable ollama
```
### Viewing logs
To view logs of Ollama running as a startup service, run:
```
journalctl -u ollama
```
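
To confirm the service configured above is actually reachable, a small check mirroring the heartbeat helper in api/client.py can be used (a sketch, not part of the diff; it assumes the server is listening on the default localhost port):

```python
# Quick reachability check for a locally running Ollama server.
# Assumes the default address; set OLLAMA_HOST (with scheme) if it binds elsewhere.
import os
import requests

base_url = os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434")

try:
    response = requests.head(base_url, timeout=5)
    response.raise_for_status()
    print("Ollama is running")
except requests.exceptions.RequestException as exc:
    print(f"Ollama is not reachable: {exc}")
```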

go.mod (2 changed lines)

@@ -8,6 +8,7 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
 	github.com/olekukonko/tablewriter v0.0.5
+	github.com/pdevine/readline v1.5.2
 	github.com/spf13/cobra v1.7.0
 )

@@ -16,7 +17,6 @@ require github.com/rivo/uniseg v0.2.0 // indirect
 require (
 	github.com/bytedance/sonic v1.9.1 // indirect
 	github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
-	github.com/chzyer/readline v1.5.1
 	github.com/gabriel-vasile/mimetype v1.4.2 // indirect
 	github.com/gin-contrib/cors v1.4.0
 	github.com/gin-contrib/sse v0.1.0 // indirect

go.sum (5 changed lines)

@@ -6,8 +6,6 @@ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhD
 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
 github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM=
 github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ=
-github.com/chzyer/readline v1.5.1 h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=
-github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk=
 github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
 github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=

@@ -80,6 +78,8 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
+github.com/pdevine/readline v1.5.2 h1:oz6Y5GdTmhPG+08hhxcAvtHitSANWuA2100Sppb38xI=
+github.com/pdevine/readline v1.5.2/go.mod h1:na/LbuE5PYwxI7GyopWdIs3U8HVe89lYlNTFTXH3wOw=
 github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
 github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
 github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=

@@ -120,7 +120,6 @@ golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
 golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
-golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
 golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=

llm/falcon.go (new file, 22 lines)

@@ -0,0 +1,22 @@
package llm

const ModelFamilyFalcon = "falcon"

const (
	falconModelType7B   = 32
	falconModelType40B  = 60
	falconModelType180B = 80
)

func falconModelType(numLayer uint32) string {
	switch numLayer {
	case 32:
		return "7B"
	case 60:
		return "40B"
	case 80:
		return "180B"
	default:
		return "Unknown"
	}
}


@@ -3,72 +3,96 @@ package llm
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )

-type ModelFamily string
-
-type ModelType uint32
-
-const (
-	ModelType3B  ModelType = 26
-	ModelType7B  ModelType = 32
-	ModelType13B ModelType = 40
-	ModelType34B ModelType = 48
-	ModelType30B ModelType = 60
-	ModelType65B ModelType = 80
-)
-
-func (mt ModelType) String() string {
-	switch mt {
-	case ModelType3B:
-		return "3B"
-	case ModelType7B:
-		return "7B"
-	case ModelType13B:
-		return "13B"
-	case ModelType34B:
-		return "34B"
-	case ModelType30B:
-		return "30B"
-	case ModelType65B:
-		return "65B"
-	default:
-		return "Unknown"
-	}
-}
-
-type FileType interface {
-	String() string
-}
-
 type GGML struct {
 	magic uint32
 	container
 	model
 }

+const (
+	fileTypeF32 uint32 = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ8_0 uint32 = iota + 2
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+)
+
+func fileType(fileType uint32) string {
+	switch fileType {
+	case fileTypeF32:
+		return "F32"
+	case fileTypeF16:
+		return "F16"
+	case fileTypeQ4_0:
+		return "Q4_0"
+	case fileTypeQ4_1:
+		return "Q4_1"
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
+		return "Q8_0"
+	case fileTypeQ5_0:
+		return "Q5_0"
+	case fileTypeQ5_1:
+		return "Q5_1"
+	case fileTypeQ2_K:
+		return "Q2_K"
+	case fileTypeQ3_K_S:
+		return "Q3_K_S"
+	case fileTypeQ3_K_M:
+		return "Q3_K_M"
+	case fileTypeQ3_K_L:
+		return "Q3_K_L"
+	case fileTypeQ4_K_S:
+		return "Q4_K_S"
+	case fileTypeQ4_K_M:
+		return "Q4_K_M"
+	case fileTypeQ5_K_S:
+		return "Q5_K_S"
+	case fileTypeQ5_K_M:
+		return "Q5_K_M"
+	case fileTypeQ6_K:
+		return "Q6_K"
+	default:
+		return "Unknown"
+	}
+}
+
 type model interface {
-	ModelFamily() ModelFamily
-	ModelType() ModelType
-	FileType() FileType
+	ModelFamily() string
+	ModelType() string
+	FileType() string
+	NumLayers() int64
 }

 type container interface {
 	Name() string
-	Decode(io.Reader) error
+	Decode(io.Reader) (model, error)
 }

-type containerGGML struct {
-}
+type containerGGML struct{}

 func (c *containerGGML) Name() string {
 	return "ggml"
 }

-func (c *containerGGML) Decode(r io.Reader) error {
-	return nil
+func (c *containerGGML) Decode(r io.Reader) (model, error) {
+	return nil, nil
 }

 type containerGGMF struct {

@@ -79,18 +103,18 @@ func (c *containerGGMF) Name() string {
 	return "ggmf"
 }

-func (c *containerGGMF) Decode(r io.Reader) error {
+func (c *containerGGMF) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}

 	c.version = version
-	return nil
+	return nil, nil
 }

 type containerGGJT struct {

@@ -101,18 +125,22 @@ func (c *containerGGJT) Name() string {
 	return "ggjt"
 }

-func (c *containerGGJT) Decode(r io.Reader) error {
+func (c *containerGGJT) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1, 2, 3:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}

 	c.version = version
-	return nil
+
+	// different model types may have different layouts for hyperparameters
+	var llama llamaModel
+	binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
+	return &llama, nil
 }

 type containerLORA struct {

@@ -123,32 +151,34 @@ func (c *containerLORA) Name() string {
 	return "ggla"
 }

-func (c *containerLORA) Decode(r io.Reader) error {
+func (c *containerLORA) Decode(r io.Reader) (model, error) {
 	var version uint32
 	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1:
 	default:
-		return errors.New("invalid version")
+		return nil, errors.New("invalid version")
 	}

 	c.version = version
-	return nil
+	return nil, nil
 }

 const (
-	// / Magic constant for `ggml` files (unversioned).
+	// Magic constant for `ggml` files (unversioned).
 	FILE_MAGIC_GGML = 0x67676d6c
-	// / Magic constant for `ggml` files (versioned, ggmf).
+	// Magic constant for `ggml` files (versioned, ggmf).
 	FILE_MAGIC_GGMF = 0x67676d66
-	// / Magic constant for `ggml` files (versioned, ggjt).
+	// Magic constant for `ggml` files (versioned, ggjt).
 	FILE_MAGIC_GGJT = 0x67676a74
-	// / Magic constant for `ggla` files (LoRA adapter).
+	// Magic constant for `ggla` files (LoRA adapter).
 	FILE_MAGIC_GGLA = 0x67676C61
+	// Magic constant for `gguf` files (versioned, gguf)
+	FILE_MAGIC_GGUF = 0x46554747
 )

-func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
+func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	var ggml GGML
 	binary.Read(r, binary.LittleEndian, &ggml.magic)

@@ -161,24 +191,18 @@ func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
 		ggml.container = &containerGGJT{}
 	case FILE_MAGIC_GGLA:
 		ggml.container = &containerLORA{}
+	case FILE_MAGIC_GGUF:
ggml.container = &containerGGUF{}
default: default:
return nil, errors.New("invalid file magic") return nil, errors.New("invalid file magic")
} }
if err := ggml.Decode(r); err != nil { model, err := ggml.Decode(r)
if err != nil {
return nil, err return nil, err
} }
// different model types may have different layouts for hyperparameters ggml.model = model
switch hint {
case ModelFamilyLlama:
var llama llamaModel
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
ggml.model = &llama
// TODO: sanity check hyperparameters
default:
return nil, fmt.Errorf("unsupported model type: %s", hint)
}
// final model type // final model type
return &ggml, nil return &ggml, nil
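With the family hint gone, DecodeGGML sniffs the container from the file magic alone and each container decodes its own model metadata, so a caller only needs the file. A minimal sketch of the new surface, assuming the module path shown elsewhere in the diff and the exported methods above (Name, ModelFamily, ModelType, FileType and NumLayers are promoted from the embedded container and model):

package main

import (
    "fmt"
    "log"
    "os"

    "github.com/jmorganca/ollama/llm" // assumed package path from this diff
)

func main() {
    f, err := os.Open(os.Args[1]) // path to a ggml/ggmf/ggjt/ggla/gguf model file
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    // DecodeGGML reads the magic, picks the matching container and decodes
    // its metadata; no model-family hint is needed anymore.
    ggml, err := llm.DecodeGGML(f)
    if err != nil {
        log.Fatal(err)
    }

    fmt.Println("format:", ggml.Name())        // e.g. "gguf"
    fmt.Println("family:", ggml.ModelFamily()) // e.g. "llama"
    fmt.Println("size:", ggml.ModelType())     // e.g. "7B"
    fmt.Println("quant:", ggml.FileType())     // e.g. "Q4_0"
    fmt.Println("layers:", ggml.NumLayers())
}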

379
llm/gguf.go Normal file
View File

@@ -0,0 +1,379 @@
package llm
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
)
type containerGGUF struct {
Version uint32
V1 struct {
NumTensor uint32
NumKV uint32
}
V2 struct {
NumTensor uint64
NumKV uint64
}
}
func (c *containerGGUF) Name() string {
return "gguf"
}
func (c *containerGGUF) Decode(r io.Reader) (model, error) {
binary.Read(r, binary.LittleEndian, &c.Version)
switch c.Version {
case 1:
binary.Read(r, binary.LittleEndian, &c.V1)
case 2:
binary.Read(r, binary.LittleEndian, &c.V2)
default:
return nil, errors.New("invalid version")
}
model := newGGUFModel(c)
if err := model.Decode(r); err != nil {
return nil, err
}
return model, nil
}
const (
ggufTypeUint8 uint32 = iota
ggufTypeInt8
ggufTypeUint16
ggufTypeInt16
ggufTypeUint32
ggufTypeInt32
ggufTypeFloat32
ggufTypeBool
ggufTypeString
ggufTypeArray
ggufTypeUint64
ggufTypeInt64
ggufTypeFloat64
)
type kv map[string]any
type ggufModel struct {
*containerGGUF
kv
}
func newGGUFModel(container *containerGGUF) *ggufModel {
return &ggufModel{
containerGGUF: container,
kv: make(kv),
}
}
func (llm *ggufModel) NumKV() uint64 {
if llm.Version == 1 {
return uint64(llm.V1.NumKV)
}
return llm.V2.NumKV
}
func (llm *ggufModel) ModelFamily() string {
t, ok := llm.kv["general.architecture"].(string)
if ok {
return t
}
return "unknown"
}
func (llm *ggufModel) ModelType() string {
switch llm.ModelFamily() {
case "llama":
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
heads, headsOK := llm.kv["llama.head_count"].(uint32)
headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
if headsOK && headsKVsOK && heads/headKVs == 8 {
return "70B"
}
return llamaModelType(blocks)
}
case "falcon":
if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
return falconModelType(blocks)
}
}
return "Unknown"
}
func (llm *ggufModel) FileType() string {
t, ok := llm.kv["general.file_type"].(uint32)
if ok {
return fileType(t)
}
return "Unknown"
}
func (llm *ggufModel) Decode(r io.Reader) error {
read := llm.readString
if llm.Version == 1 {
read = llm.readStringV1
}
for i := 0; uint64(i) < llm.NumKV(); i++ {
k, err := read(r)
if err != nil {
return err
}
vtype := llm.readU32(r)
var v any
switch vtype {
case ggufTypeUint8:
v = llm.readU8(r)
case ggufTypeInt8:
v = llm.readI8(r)
case ggufTypeUint16:
v = llm.readU16(r)
case ggufTypeInt16:
v = llm.readI16(r)
case ggufTypeUint32:
v = llm.readU32(r)
case ggufTypeInt32:
v = llm.readI32(r)
case ggufTypeUint64:
v = llm.readU64(r)
case ggufTypeInt64:
v = llm.readI64(r)
case ggufTypeFloat32:
v = llm.readF32(r)
case ggufTypeFloat64:
v = llm.readF64(r)
case ggufTypeBool:
v = llm.readBool(r)
case ggufTypeString:
fn := llm.readString
if llm.Version == 1 {
fn = llm.readStringV1
}
s, err := fn(r)
if err != nil {
return err
}
v = s
case ggufTypeArray:
fn := llm.readArray
if llm.Version == 1 {
fn = llm.readArrayV1
}
a, err := fn(r)
if err != nil {
return err
}
v = a
default:
return fmt.Errorf("invalid type: %d", vtype)
}
llm.kv[k] = v
}
return nil
}
func (llm *ggufModel) NumLayers() int64 {
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
if !exists {
return 0
}
v := value.(uint32)
return int64(v)
}
func (ggufModel) readU8(r io.Reader) uint8 {
var u8 uint8
binary.Read(r, binary.LittleEndian, &u8)
return u8
}
func (ggufModel) readI8(r io.Reader) int8 {
var i8 int8
binary.Read(r, binary.LittleEndian, &i8)
return i8
}
func (ggufModel) readU16(r io.Reader) uint16 {
var u16 uint16
binary.Read(r, binary.LittleEndian, &u16)
return u16
}
func (ggufModel) readI16(r io.Reader) int16 {
var i16 int16
binary.Read(r, binary.LittleEndian, &i16)
return i16
}
func (ggufModel) readU32(r io.Reader) uint32 {
var u32 uint32
binary.Read(r, binary.LittleEndian, &u32)
return u32
}
func (ggufModel) readI32(r io.Reader) int32 {
var i32 int32
binary.Read(r, binary.LittleEndian, &i32)
return i32
}
func (ggufModel) readU64(r io.Reader) uint64 {
var u64 uint64
binary.Read(r, binary.LittleEndian, &u64)
return u64
}
func (ggufModel) readI64(r io.Reader) int64 {
var i64 int64
binary.Read(r, binary.LittleEndian, &i64)
return i64
}
func (ggufModel) readF32(r io.Reader) float32 {
var f32 float32
binary.Read(r, binary.LittleEndian, &f32)
return f32
}
func (ggufModel) readF64(r io.Reader) float64 {
var f64 float64
binary.Read(r, binary.LittleEndian, &f64)
return f64
}
func (ggufModel) readBool(r io.Reader) bool {
var b bool
binary.Read(r, binary.LittleEndian, &b)
return b
}
func (ggufModel) readStringV1(r io.Reader) (string, error) {
var nameLength uint32
binary.Read(r, binary.LittleEndian, &nameLength)
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
return "", err
}
// gguf v1 strings are null-terminated
b.Truncate(b.Len() - 1)
return b.String(), nil
}
func (llm ggufModel) readString(r io.Reader) (string, error) {
var nameLength uint64
binary.Read(r, binary.LittleEndian, &nameLength)
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
return "", err
}
return b.String(), nil
}
func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
atype := llm.readU32(r)
n := llm.readU32(r)
for i := 0; uint32(i) < n; i++ {
switch atype {
case ggufTypeUint8:
arr = append(arr, llm.readU8(r))
case ggufTypeInt8:
arr = append(arr, llm.readU8(r))
case ggufTypeUint16:
arr = append(arr, llm.readU16(r))
case ggufTypeInt16:
arr = append(arr, llm.readI16(r))
case ggufTypeUint32:
arr = append(arr, llm.readU32(r))
case ggufTypeInt32:
arr = append(arr, llm.readI32(r))
case ggufTypeFloat32:
arr = append(arr, llm.readF32(r))
case ggufTypeBool:
arr = append(arr, llm.readBool(r))
case ggufTypeString:
s, err := llm.readStringV1(r)
if err != nil {
return nil, err
}
arr = append(arr, s)
default:
return nil, fmt.Errorf("invalid array type: %d", atype)
}
}
return
}
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
atype := llm.readU32(r)
n := llm.readU64(r)
for i := 0; uint64(i) < n; i++ {
switch atype {
case ggufTypeUint8:
arr = append(arr, llm.readU8(r))
case ggufTypeInt8:
arr = append(arr, llm.readU8(r))
case ggufTypeUint16:
arr = append(arr, llm.readU16(r))
case ggufTypeInt16:
arr = append(arr, llm.readI16(r))
case ggufTypeUint32:
arr = append(arr, llm.readU32(r))
case ggufTypeInt32:
arr = append(arr, llm.readI32(r))
case ggufTypeUint64:
arr = append(arr, llm.readU64(r))
case ggufTypeInt64:
arr = append(arr, llm.readI64(r))
case ggufTypeFloat32:
arr = append(arr, llm.readF32(r))
case ggufTypeFloat64:
arr = append(arr, llm.readF64(r))
case ggufTypeBool:
arr = append(arr, llm.readBool(r))
case ggufTypeString:
s, err := llm.readString(r)
if err != nil {
return nil, err
}
arr = append(arr, s)
default:
return nil, fmt.Errorf("invalid array type: %d", atype)
}
}
return
}
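ggufModel walks the GGUF key/value section after containerGGUF reads the fixed header: a 4-byte magic (read little-endian as 0x46554747, i.e. "GGUF"), a uint32 version, then tensor and KV counts — uint32 pairs in v1, uint64 in v2 — followed by NumKV typed records. Version 1 strings carry a trailing NUL byte (hence the Truncate in readStringV1), while version 2 strings use uint64 lengths. A self-contained sketch that reads just the fixed header, for illustration only:

package main

import (
    "encoding/binary"
    "fmt"
    "log"
    "os"
)

func main() {
    f, err := os.Open(os.Args[1]) // path to a .gguf file
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    var magic, version uint32
    binary.Read(f, binary.LittleEndian, &magic)
    if magic != 0x46554747 { // "GGUF" read as a little-endian uint32
        log.Fatalf("not a gguf file: magic %#x", magic)
    }
    binary.Read(f, binary.LittleEndian, &version)

    var numTensor, numKV uint64
    switch version {
    case 1:
        var t, k uint32
        binary.Read(f, binary.LittleEndian, &t)
        binary.Read(f, binary.LittleEndian, &k)
        numTensor, numKV = uint64(t), uint64(k)
    case 2:
        binary.Read(f, binary.LittleEndian, &numTensor)
        binary.Read(f, binary.LittleEndian, &numKV)
    default:
        log.Fatalf("unsupported gguf version %d", version)
    }

    fmt.Printf("gguf v%d: %d tensors, %d metadata keys\n", version, numTensor, numKV)
}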

View File

@@ -1,13 +0,0 @@
//go:build !darwin
// +build !darwin
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release

View File

@@ -1,10 +1,16 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch //go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release //go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/cpu --target server --config Release

View File

@@ -1,10 +1,16 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch //go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/gpu --target server --config Release //go:generate cmake --build ggml/build/metal --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/metal --target server --config Release

View File

@@ -0,0 +1,22 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cuda --target server --config Release

View File

@@ -0,0 +1,14 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cpu --target server --config Release
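Each llm_*.go file is a per-platform build recipe: go generate checks out the ggml and gguf submodules, applies the local patches, and builds the llama.cpp server target into build/<variant>/bin (cpu, metal, or cuda). Those paths are what the //go:embed directive in llm/llama.go packs into the binary, and chooseRunners later globs them back out of the embedded filesystem. A small sketch of that glob pattern using an in-memory stand-in for the embedded FS (paths are illustrative):

package main

import (
    "fmt"
    "io/fs"
    "log"
    "testing/fstest"
)

func main() {
    // Stand-in for the filesystem produced by
    // //go:embed llama.cpp/*/build/*/bin/* in llm/llama.go.
    runners := fstest.MapFS{
        "llama.cpp/gguf/build/metal/bin/server": {Data: []byte("binary")},
        "llama.cpp/gguf/build/cpu/bin/server":   {Data: []byte("binary")},
        "llama.cpp/ggml/build/cpu/bin/server":   {Data: []byte("binary")},
    }

    // chooseRunners globs each candidate's bin directory like this, in
    // priority order, and copies whatever it finds into the work directory.
    matches, err := fs.Glob(runners, "llama.cpp/gguf/build/*/bin/*")
    if err != nil {
        log.Fatal(err)
    }
    for _, m := range matches {
        fmt.Println(m)
    }
}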

View File

@@ -1,32 +0,0 @@
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Tue, 5 Sep 2023 16:05:08 -0400
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
---
ggml-metal.metal | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
--
2.39.2 (Apple Git-143)

1
llm/llama.cpp/gguf Submodule

Submodule llm/llama.cpp/gguf added at bc9d3e3971

View File

@@ -0,0 +1,27 @@
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries
---
CMakeLists.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
--
2.42.0

View File

@@ -0,0 +1,25 @@
From 07993bdc35345b67b27aa649a7c099ad42d80c4c Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 21 Sep 2023 14:43:21 -0700
Subject: [PATCH] remove warm up logging
---
common/common.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 2597ba0..b56549b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -780,8 +780,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
{
- LOG("warming up the model with an empty run\n");
-
const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
llama_reset_timings(lctx);
--
2.42.0

View File

@@ -0,0 +1,32 @@
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
From: Kylin <56434533+KyL0N@users.noreply.github.com>
Date: Tue, 22 Aug 2023 15:14:23 +0800
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
* ggml: support CUDA's half type for aarch64(#1455)
support CUDA's half type for aarch64 in ggml_fp16_t definition
* ggml: use __CUDACC__ to recognise nvcc compiler
---
ggml.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml.h b/ggml.h
index 544ad2d..0ec7ec5 100644
--- a/ggml.h
+++ b/ggml.h
@@ -259,8 +259,9 @@
extern "C" {
#endif
-#ifdef __ARM_NEON
- // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+ typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
--
2.39.2 (Apple Git-143)

View File

@@ -20,127 +20,140 @@ import (
"runtime" "runtime"
"strconv" "strconv"
"strings" "strings"
"sync"
"time" "time"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
) )
const ModelFamilyLlama ModelFamily = "llama" //go:embed llama.cpp/*/build/*/bin/*
//go:embed llama.cpp/ggml/build/*/bin/*
var llamaCppEmbed embed.FS var llamaCppEmbed embed.FS
var (
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
)
var (
ggmlInit sync.Once
ggmlRunnerPath string
)
func osPath(llamaPath string) string {
if runtime.GOOS == "windows" {
return path.Join(llamaPath, "Release")
}
return llamaPath
}
func initGGML() {
ggmlInit.Do(func() {
tmpDir, err := os.MkdirTemp("", "llama-*")
if err != nil {
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
}
llamaPath := osPath(ggmlGPU)
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
llamaPath = osPath(ggmlCPU)
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
log.Fatalf("llama.cpp executable not found")
}
}
files := []string{"server"}
switch runtime.GOOS {
case "windows":
files = []string{"server.exe"}
case "darwin":
if llamaPath == osPath(ggmlGPU) {
files = append(files, "ggml-metal.metal")
}
}
for _, f := range files {
srcPath := path.Join(llamaPath, f)
destPath := filepath.Join(tmpDir, f)
srcFile, err := llamaCppEmbed.Open(srcPath)
if err != nil {
log.Fatalf("read llama.cpp %s: %v", f, err)
}
defer srcFile.Close()
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
log.Fatalf("write llama.cpp %s: %v", f, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
log.Fatalf("copy llama.cpp %s: %v", f, err)
}
}
ggmlRunnerPath = filepath.Join(tmpDir, "server")
if runtime.GOOS == "windows" {
ggmlRunnerPath = filepath.Join(tmpDir, "server.exe")
}
})
}
type ModelRunner struct { type ModelRunner struct {
Path string // path to the model runner executable Path string // path to the model runner executable
} }
func ggmlRunner() ModelRunner { func chooseRunners(workDir, runnerType string) []ModelRunner {
initGGML() buildPath := path.Join("llama.cpp", runnerType, "build")
return ModelRunner{Path: ggmlRunnerPath} var runners []string
// set the runners based on the OS
// IMPORTANT: the order of the runners in the array is the priority order
switch runtime.GOOS {
case "darwin":
runners = []string{
path.Join(buildPath, "metal", "bin", "server"),
path.Join(buildPath, "cpu", "bin", "server"),
}
case "linux":
runners = []string{
path.Join(buildPath, "cuda", "bin", "server"),
path.Join(buildPath, "cpu", "bin", "server"),
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []string{
path.Join(buildPath, "cpu", "bin", "Release", "server.exe"),
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []string{
path.Join(buildPath, "cpu", "bin", "server"),
}
}
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
for _, r := range runners {
// find all the files in the runner's bin directory
files, err := fs.Glob(llamaCppEmbed, filepath.Join(filepath.Dir(r), "*"))
if err != nil {
// this is expected, ollama may be compiled without all runners packed in
log.Printf("%s runner not found: %v", r, err)
continue
}
runnerAvailable = true
for _, f := range files {
srcFile, err := llamaCppEmbed.Open(f)
if err != nil {
log.Fatalf("read llama runner %s: %v", f, err)
}
defer srcFile.Close()
// create the directory in case it does not exist
destPath := filepath.Join(workDir, filepath.Dir(f))
if err := os.MkdirAll(destPath, 0o755); err != nil {
log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
}
destFile := filepath.Join(destPath, filepath.Base(f))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
log.Fatalf("write llama runner %s: %v", f, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
log.Fatalf("copy llama runner %s: %v", f, err)
}
case err != nil:
log.Fatalf("stat llama runner %s: %v", f, err)
}
}
}
if !runnerAvailable {
log.Fatalf("%s runner not found", runnerType)
}
// return the runners to try in priority order
localRunnersByPriority := []ModelRunner{}
for _, r := range runners {
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: path.Join(workDir, r)})
}
return localRunnersByPriority
} }
type llamaModel struct { type llamaModel struct {
hyperparameters llamaHyperparameters hyperparameters llamaHyperparameters
} }
func (llm *llamaModel) ModelFamily() ModelFamily { func (llm *llamaModel) ModelFamily() string {
return ModelFamilyLlama return "llama"
} }
func (llm *llamaModel) ModelType() ModelType { func llamaModelType(numLayer uint32) string {
switch llm.hyperparameters.NumLayer { switch numLayer {
case 26: case 26:
return ModelType3B return "3B"
case 32: case 32:
return ModelType7B return "7B"
case 40: case 40:
return ModelType13B return "13B"
case 48: case 48:
return ModelType34B return "34B"
case 60: case 60:
return ModelType30B return "30B"
case 80: case 80:
return ModelType65B return "65B"
default:
return "Unknown"
} }
// TODO: find a better default
return ModelType7B
} }
func (llm *llamaModel) FileType() FileType { func (llm *llamaModel) ModelType() string {
return llm.hyperparameters.FileType return llamaModelType(llm.hyperparameters.NumLayer)
}
func (llm *llamaModel) FileType() string {
return fileType(llm.hyperparameters.FileType)
}
func (llm *llamaModel) NumLayers() int64 {
return int64(llm.hyperparameters.NumLayer)
} }
type llamaHyperparameters struct { type llamaHyperparameters struct {
@@ -157,70 +170,7 @@ type llamaHyperparameters struct {
NumRot uint32 NumRot uint32
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc. // FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
FileType llamaFileType FileType uint32
}
type llamaFileType uint32
const (
llamaFileTypeF32 llamaFileType = iota
llamaFileTypeF16
llamaFileTypeQ4_0
llamaFileTypeQ4_1
llamaFileTypeQ4_1_F16
llamaFileTypeQ8_0 llamaFileType = iota + 2
llamaFileTypeQ5_0
llamaFileTypeQ5_1
llamaFileTypeQ2_K
llamaFileTypeQ3_K_S
llamaFileTypeQ3_K_M
llamaFileTypeQ3_K_L
llamaFileTypeQ4_K_S
llamaFileTypeQ4_K_M
llamaFileTypeQ5_K_S
llamaFileTypeQ5_K_M
llamaFileTypeQ6_K
)
func (ft llamaFileType) String() string {
switch ft {
case llamaFileTypeF32:
return "F32"
case llamaFileTypeF16:
return "F16"
case llamaFileTypeQ4_0:
return "Q4_0"
case llamaFileTypeQ4_1:
return "Q4_1"
case llamaFileTypeQ4_1_F16:
return "Q4_1_F16"
case llamaFileTypeQ8_0:
return "Q8_0"
case llamaFileTypeQ5_0:
return "Q5_0"
case llamaFileTypeQ5_1:
return "Q5_1"
case llamaFileTypeQ2_K:
return "Q2_K"
case llamaFileTypeQ3_K_S:
return "Q3_K_S"
case llamaFileTypeQ3_K_M:
return "Q3_K_M"
case llamaFileTypeQ3_K_L:
return "Q3_K_L"
case llamaFileTypeQ4_K_S:
return "Q4_K_S"
case llamaFileTypeQ4_K_M:
return "Q4_K_M"
case llamaFileTypeQ5_K_S:
return "Q5_K_S"
case llamaFileTypeQ5_K_M:
return "Q5_K_M"
case llamaFileTypeQ6_K:
return "Q6_K"
default:
return "Unknown"
}
} }
type Running struct { type Running struct {
@@ -234,12 +184,66 @@ type llama struct {
Running Running
} }
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) { var errNoGPU = errors.New("nvidia-smi command failed")
if _, err := os.Stat(model); err != nil {
return nil, err // CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
func CheckVRAM() (int, error) {
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoGPU
} }
if _, err := os.Stat(runner.Path); err != nil { var total int
scanner := bufio.NewScanner(&stdout)
for scanner.Scan() {
line := scanner.Text()
vram, err := strconv.Atoi(line)
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
total += vram
}
return total, nil
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
n := 1 // default to enable metal on macOS
if runtime.GOOS == "linux" {
vramMib, err := CheckVRAM()
if err != nil {
if err.Error() != "nvidia-smi command failed" {
log.Print(err.Error())
}
// nvidia driver not installed or no nvidia GPU found
return 0
}
totalVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
// Calculate bytes per layer
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
bytesPerLayer := fileSizeBytes / numLayer
// set n to the max number of layers we can fit in VRAM
return int(totalVramBytes / bytesPerLayer)
log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)
}
// default to enable metal on macOS
return 1
}
func newLlama(model string, adapters []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
fileInfo, err := os.Stat(model)
if err != nil {
return nil, err return nil, err
} }
@@ -250,14 +254,17 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
params := []string{ params := []string{
"--model", model, "--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx), "--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--gqa", fmt.Sprintf("%d", opts.NumGQA),
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase), "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale), "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU), "--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
"--embedding", "--embedding",
} }
if opts.NumGQA > 0 {
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
}
if len(adapters) > 0 { if len(adapters) > 0 {
// TODO: applying multiple adapters is not supported by the llama.cpp server yet // TODO: applying multiple adapters is not supported by the llama.cpp server yet
params = append(params, "--lora", adapters[0]) params = append(params, "--lora", adapters[0])
@@ -281,7 +288,12 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
} }
// start the llama.cpp server with a retry in case the port is already in use // start the llama.cpp server with a retry in case the port is already in use
for try := 0; try < 3; try++ { for _, runner := range runners {
if _, err := os.Stat(runner.Path); err != nil {
log.Printf("llama runner not found: %v", err)
continue
}
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext( cmd := exec.CommandContext(
@@ -289,67 +301,70 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
runner.Path, runner.Path,
append(params, "--port", strconv.Itoa(port))..., append(params, "--port", strconv.Itoa(port))...,
) )
cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
cmd.Stdout = os.Stderr cmd.Stdout = os.Stderr
cmd.Stderr = os.Stderr cmd.Stderr = os.Stderr
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}} llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
log.Print("starting llama runner")
if err := llm.Cmd.Start(); err != nil {
log.Printf("error starting the external llama runner: %v", err)
continue
}
// monitor the command, it is blocking, so if it exits we need to capture that
go func() {
err := llm.Cmd.Wait() // this will block until the command exits
if err != nil {
log.Printf("llama runner exited with error: %v", err)
} else {
log.Printf("llama runner exited")
}
}()
if err := waitForServer(llm); err != nil { if err := waitForServer(llm); err != nil {
log.Printf("error starting llama.cpp server: %v", err) log.Printf("error starting llama runner: %v", err)
llm.Close() llm.Close()
// try again // try again
continue continue
} }
// server started successfully // server started successfully
return llm, nil return llm, nil
} }
return nil, fmt.Errorf("max retry exceeded starting llama.cpp") return nil, fmt.Errorf("failed to start a llama runner")
} }
func waitForServer(llm *llama) error { func waitForServer(llm *llama) error {
log.Print("starting llama.cpp server")
var stderr bytes.Buffer
llm.Cmd.Stderr = &stderr
err := llm.Cmd.Start()
if err != nil {
return fmt.Errorf("error starting the external llama.cpp server: %w", err)
}
exitChan := make(chan error, 1)
// the server is a long running process, watch for it exiting to keep track of something going wrong
go func() {
err := llm.Cmd.Wait()
log.Print(stderr.String())
exitChan <- err
}()
// wait for the server to start responding // wait for the server to start responding
start := time.Now() start := time.Now()
expiresAt := time.Now().Add(30 * time.Second) expiresAt := time.Now().Add(2 * time.Minute) // be generous with timeout, large models can take a while to load
ticker := time.NewTicker(100 * time.Millisecond) ticker := time.NewTicker(200 * time.Millisecond)
log.Print("waiting for llama.cpp server to start responding") log.Print("waiting for llama runner to start responding")
for range ticker.C {
if time.Now().After(expiresAt) {
return fmt.Errorf("llama runner did not start within alloted time, retrying")
}
for { // check if the server process has terminated
select { if llm.Cmd.ProcessState != nil && llm.Cmd.ProcessState.Exited() {
case <-ticker.C: return fmt.Errorf("llama runner process has terminated")
if time.Now().After(expiresAt) { }
return fmt.Errorf("llama.cpp server did not start responding within 30 seconds, retrying")
} if err := llm.Ping(context.Background()); err == nil {
if err := llm.Ping(context.Background()); err == nil { break
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
return nil
}
case err := <-exitChan:
return fmt.Errorf("llama.cpp server exited unexpectedly: %w", err)
} }
} }
log.Printf("llama runner started in %f seconds", time.Since(start).Seconds())
return nil
} }
func (llm *llama) Close() { func (llm *llama) Close() {
llm.Running.Cmd.Cancel() llm.Cancel()
} }
func (llm *llama) SetOptions(opts api.Options) { func (llm *llama) SetOptions(opts api.Options) {
@@ -676,7 +691,7 @@ func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error
// Ping checks that the server subprocess is still running and responding to requests // Ping checks that the server subprocess is still running and responding to requests
func (llm *llama) Ping(ctx context.Context) error { func (llm *llama) Ping(ctx context.Context) error {
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Running.Port)) resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port))
if err != nil { if err != nil {
return fmt.Errorf("ping resp: %w", err) return fmt.Errorf("ping resp: %w", err)
} }
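The NumGPU heuristic introduced here estimates how many layers can be offloaded by treating fileSize/numLayers as the per-layer memory cost and dividing the VRAM reported by nvidia-smi by that figure. A standalone sketch of just the arithmetic, with made-up numbers (the real function also special-cases macOS, where it returns 1 to enable Metal, and returns 0 when nvidia-smi is unavailable):

package main

import "fmt"

// layersThatFit mirrors the heuristic in NumGPU: approximate one layer's
// memory cost as fileSize/numLayers and offload as many layers as the
// reported VRAM can hold.
func layersThatFit(numLayers, fileSizeBytes, vramMiB int64) int {
    if numLayers <= 0 || fileSizeBytes <= 0 {
        return 0
    }
    totalVRAMBytes := vramMiB * 1024 * 1024 // 1 MiB = 1024^2 bytes
    bytesPerLayer := fileSizeBytes / numLayers
    return int(totalVRAMBytes / bytesPerLayer)
}

func main() {
    // e.g. a ~3.6 GiB 7B Q4_0 file with 32 layers on an 8 GiB GPU
    fmt.Println(layersThatFit(32, 3825819904, 8192)) // 71: more than the 32 layers in the file, so all of them fit
}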

View File

@@ -21,7 +21,7 @@ type LLM interface {
Ping(context.Context) error Ping(context.Context) error
} }
func New(model string, adapters []string, opts api.Options) (LLM, error) { func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil { if _, err := os.Stat(model); err != nil {
return nil, err return nil, err
} }
@@ -32,15 +32,22 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
} }
defer f.Close() defer f.Close()
ggml, err := DecodeGGML(f, ModelFamilyLlama) ggml, err := DecodeGGML(f)
if err != nil { if err != nil {
return nil, err return nil, err
} }
switch ggml.FileType().String() { switch ggml.FileType() {
case "F32", "Q5_0", "Q5_1", "Q8_0": case "Q8_0":
if ggml.Name() != "gguf" && opts.NumGPU != 0 {
// GGML Q8_0 do not support Metal API and will
// cause the runner to segmentation fault so disable GPU
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
opts.NumGPU = 0
}
case "F32", "Q5_0", "Q5_1":
if opts.NumGPU != 0 { if opts.NumGPU != 0 {
// F32, F16, Q5_0, Q5_1, and Q8_0 do not support Metal API and will // F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
// cause the runner to segmentation fault so disable GPU // cause the runner to segmentation fault so disable GPU
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0") log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
opts.NumGPU = 0 opts.NumGPU = 0
@@ -49,35 +56,44 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
totalResidentMemory := memory.TotalMemory() totalResidentMemory := memory.TotalMemory()
switch ggml.ModelType() { switch ggml.ModelType() {
case ModelType3B, ModelType7B: case "3B", "7B":
if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 { if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 16GB of memory") return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
} else if totalResidentMemory < 8*1024*1024 { } else if totalResidentMemory < 8*1024*1024 {
return nil, fmt.Errorf("model requires at least 8GB of memory") return nil, fmt.Errorf("model requires at least 8GB of memory")
} }
case ModelType13B: case "13B":
if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 { if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 32GB of memory") return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
} else if totalResidentMemory < 16*1024*1024 { } else if totalResidentMemory < 16*1024*1024 {
return nil, fmt.Errorf("model requires at least 16GB of memory") return nil, fmt.Errorf("model requires at least 16GB of memory")
} }
case ModelType30B, ModelType34B: case "30B", "34B", "40B":
if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 { if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 64GB of memory") return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
} else if totalResidentMemory < 32*1024*1024 { } else if totalResidentMemory < 32*1024*1024 {
return nil, fmt.Errorf("model requires at least 32GB of memory") return nil, fmt.Errorf("model requires at least 32GB of memory")
} }
case ModelType65B: case "65B", "70B":
if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 { if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 128GB of memory") return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
} else if totalResidentMemory < 64*1024*1024 { } else if totalResidentMemory < 64*1024*1024 {
return nil, fmt.Errorf("model requires at least 64GB of memory") return nil, fmt.Errorf("model requires at least 64GB of memory")
} }
case "180B":
if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
} else if totalResidentMemory < 128*1024*1024 {
return nil, fmt.Errorf("model requires at least 128GB of memory")
}
} }
switch ggml.ModelFamily() { switch ggml.Name() {
case ModelFamilyLlama: case "gguf":
return newLlama(model, adapters, ggmlRunner(), opts) opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
case "ggml", "ggmf", "ggjt", "ggla":
return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
default: default:
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily()) return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
} }
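The memory guardrails in New boil down to a lookup keyed by the reported model size, with F16 weights needing roughly double the quantized requirement. A summary of the thresholds named in the error messages above, encoded as data purely for reference:

package main

import "fmt"

// Minimum host memory called out by llm.New's error messages, keyed by the
// reported model size: {quantized GB, F16 GB}.
var minMemoryGB = map[string][2]int{
    "3B":   {8, 16},
    "7B":   {8, 16},
    "13B":  {16, 32},
    "30B":  {32, 64},
    "34B":  {32, 64},
    "40B":  {32, 64},
    "65B":  {64, 128},
    "70B":  {64, 128},
    "180B": {128, 512},
}

func main() {
    req := minMemoryGB["13B"]
    fmt.Printf("13B: %dGB quantized, %dGB F16\n", req[0], req[1])
}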

View File

@@ -8,7 +8,7 @@ GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
# build universal binary # build universal binary
GOARCH=arm64 go generate ./... GOARCH=arm64 go generate ./...
GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64 GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
rm -rf llm/llama.cpp/ggml/build/*/bin rm -rf llm/llama.cpp/*/build/*/bin
GOARCH=amd64 go generate ./... GOARCH=amd64 go generate ./...
GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64 GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64 lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64

12
scripts/build_linux.sh Executable file
View File

@@ -0,0 +1,12 @@
#!/bin/bash
set -e
mkdir -p dist
for ARCH in arm64 amd64; do
docker buildx build --platform=linux/$ARCH -f Dockerfile.build . -t builder:$ARCH --load
docker create --platform linux/$ARCH --name builder builder:$ARCH
docker cp builder:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$ARCH
docker rm builder
done

227
scripts/install.sh Normal file
View File

@@ -0,0 +1,227 @@
#!/bin/sh
# This script installs Ollama on Linux.
# It detects the current operating system architecture and installs the appropriate version of Ollama.
set -eu
status() { echo ">>> $*" >&2; }
error() { echo "ERROR $*"; exit 1; }
warning() { echo "WARNING: $*"; }
TEMP_DIR=$(mktemp -d)
cleanup() { rm -rf $TEMP_DIR; }
trap cleanup EXIT
available() { command -v $1 >/dev/null; }
require() {
local MISSING=''
for TOOL in $*; do
if ! available $TOOL; then
MISSING="$MISSING $TOOL"
fi
done
echo $MISSING
}
[ "$(uname -s)" = "Linux" ] || error 'This script is intended to run on Linux only.'
case "$(uname -m)" in
x86_64) ARCH="amd64" ;;
aarch64|arm64) ARCH="arm64" ;;
*) error "Unsupported architecture: $ARCH" ;;
esac
SUDO=
if [ "$(id -u)" -ne 0 ]; then
# Not running as root; sudo is required to install
if ! available sudo; then
error "This script requires superuser permissions. Please re-run as root."
fi
SUDO="sudo"
fi
NEEDS=$(require curl awk grep sed tee xargs)
if [ -n "$NEEDS" ]; then
status "ERROR: The following tools are required but missing:"
for NEED in $NEEDS; do
echo " - $NEED"
done
exit 1
fi
status "Downloading ollama..."
$SUDO curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.ai/download/ollama-linux-$ARCH"
status "Installing ollama to /usr/bin..."
$SUDO install -o0 -g0 -m755 -d /usr/bin
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama /usr/bin/ollama
install_success() { status 'Install complete. Run "ollama" from the command line.'; }
trap install_success EXIT
# Everything from this point onwards is optional.
configure_systemd() {
if ! id ollama >/dev/null 2>&1; then
status "Creating ollama user..."
$SUDO useradd -r -s /bin/false -m -d /usr/share/ollama ollama
fi
status "Creating ollama systemd service..."
cat <<EOF | $SUDO tee /etc/systemd/system/ollama.service >/dev/null
[Unit]
Description=Ollama Service
After=network-online.target
[Service]
ExecStart=/usr/bin/ollama serve
User=ollama
Group=ollama
Restart=always
RestartSec=3
Environment="HOME=/usr/share/ollama"
[Install]
WantedBy=default.target
EOF
SYSTEMCTL_RUNNING="$(systemctl is-system-running || true)"
case $SYSTEMCTL_RUNNING in
running|degraded)
status "Enabling and starting ollama service..."
$SUDO systemctl daemon-reload
$SUDO systemctl enable ollama
$SUDO systemctl restart ollama
;;
esac
}
if available systemctl; then
configure_systemd
fi
if ! available lspci && ! available lshw; then
warning "Unable to detect NVIDIA GPU. Install lspci or lshw to automatically detect and install NVIDIA CUDA drivers."
exit 0
fi
check_gpu() {
case $1 in
lspci) available lspci && lspci -d '10de:' | grep -q 'NVIDIA' || return 1 ;;
lshw) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
nvidia-smi) available nvidia-smi || return 1 ;;
esac
}
if ! check_gpu lspci && ! check_gpu lshw; then
warning "No NVIDIA GPU detected. Ollama will run in CPU-only mode."
exit 0
fi
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
install_cuda_driver_yum() {
status 'Installing NVIDIA repository...'
case $PACKAGE_MANAGER in
yum)
$SUDO $PACKAGE_MANAGER -y install yum-utils
$SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
;;
dnf)
$SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
;;
esac
case $1 in
rhel)
status 'Installing EPEL repository...'
# EPEL is required for third-party dependencies such as dkms and libvdpau
$SUDO $PACKAGE_MANAGER -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-$2.noarch.rpm || true
;;
esac
status 'Installing CUDA driver...'
if [ "$1" = 'centos' ] || [ "$1$2" = 'rhel7' ]; then
$SUDO $PACKAGE_MANAGER -y install nvidia-driver-latest-dkms
fi
$SUDO $PACKAGE_MANAGER -y install cuda-drivers
}
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#ubuntu
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
install_cuda_driver_apt() {
status 'Installing NVIDIA repository...'
curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
case $1 in
debian)
status 'Enabling contrib sources...'
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | sudo tee /etc/apt/sources.list.d/contrib.list > /dev/null
;;
esac
status 'Installing CUDA driver...'
$SUDO dpkg -i $TEMP_DIR/cuda-keyring.deb
$SUDO apt-get update
[ -n "$SUDO" ] && SUDO_E="$SUDO -E" || SUDO_E=
DEBIAN_FRONTEND=noninteractive $SUDO_E apt-get -y install cuda-drivers -q
}
if [ ! -f "/etc/os-release" ]; then
error "Unknown distribution. Skipping CUDA installation."
fi
. /etc/os-release
OS_NAME=$ID
OS_VERSION=$VERSION_ID
PACKAGE_MANAGER=
for PACKAGE_MANAGER in dnf yum apt-get; do
if available $PACKAGE_MANAGER; then
break
fi
done
if [ -z "$PACKAGE_MANAGER" ]; then
error "Unknown package manager. Skipping CUDA installation."
fi
if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $OS_VERSION ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) install_cuda_driver_yum $OS_NAME $OS_VERSION ;;
amzn) install_cuda_driver_yum 'fedora' '35' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
*) exit ;;
esac
fi
if ! lsmod | grep -q nvidia; then
KERNEL_RELEASE="$(uname -r)"
case $OS_NAME in
centos|rhel|rocky|fedora|amzn) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE kernel-headers-$KERNEL_RELEASE ;;
debian|ubuntu) $SUDO apt-get -y install linux-headers-$KERNEL_RELEASE ;;
*) exit ;;
esac
NVIDIA_CUDA_VERSION=$($SUDO dkms status | awk -F: '/added/ { print $1 }')
if [ -n "$NVIDIA_CUDA_VERSION" ]; then
$SUDO dkms install $NVIDIA_CUDA_VERSION
fi
if lsmod | grep -q nouveau; then
status "Removing nouveau..."
$SUDO rmmod nouveau
fi
$SUDO modprobe nvidia
fi

View File

@@ -14,7 +14,7 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"path" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"time" "time"
@@ -71,7 +71,7 @@ func (r AuthRedirect) URL() (*url.URL, error) {
return redirectURL, nil return redirectURL, nil
} }
func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *RegistryOptions) (string, error) { func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
redirectURL, err := redirData.URL() redirectURL, err := redirData.URL()
if err != nil { if err != nil {
return "", err return "", err
@@ -82,7 +82,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
return "", err return "", err
} }
keyPath := path.Join(home, ".ollama", "id_ed25519") keyPath := filepath.Join(home, ".ollama", "id_ed25519")
rawKey, err := os.ReadFile(keyPath) rawKey, err := os.ReadFile(keyPath)
if err != nil { if err != nil {
@@ -103,7 +103,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
headers := make(http.Header) headers := make(http.Header)
headers.Set("Authorization", sig) headers.Set("Authorization", sig)
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, regOpts) resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, nil)
if err != nil { if err != nil {
log.Printf("couldn't get token: %q", err) log.Printf("couldn't get token: %q", err)
} }
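Swapping path.Join for filepath.Join matters on Windows: package path always joins with forward slashes, while path/filepath uses the operating system's separator, so the key path under the user's home directory resolves correctly on every platform. A two-line illustration:

package main

import (
    "fmt"
    "path"
    "path/filepath"
)

func main() {
    // package path joins with '/' regardless of OS; filepath uses the
    // platform separator ('\' on Windows).
    fmt.Println(path.Join("home", ".ollama", "id_ed25519"))     // home/.ollama/id_ed25519 everywhere
    fmt.Println(filepath.Join("home", ".ollama", "id_ed25519")) // home\.ollama\id_ed25519 on Windows
}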

View File

@@ -8,7 +8,7 @@ import (
"log" "log"
"net/http" "net/http"
"os" "os"
"path" "path/filepath"
"strconv" "strconv"
"sync" "sync"
"time" "time"
@@ -173,7 +173,7 @@ func doDownload(ctx context.Context, opts downloadOpts, f *FileDownload) error {
return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body)) return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body))
} }
err = os.MkdirAll(path.Dir(f.FilePath), 0o700) err = os.MkdirAll(filepath.Dir(f.FilePath), 0o700)
if err != nil { if err != nil {
return fmt.Errorf("make blobs directory: %w", err) return fmt.Errorf("make blobs directory: %w", err)
} }

View File

@@ -14,7 +14,6 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"path"
"path/filepath" "path/filepath"
"reflect" "reflect"
"runtime" "runtime"
@@ -114,10 +113,11 @@ type LayerReader struct {
} }
type ConfigV2 struct { type ConfigV2 struct {
ModelFamily llm.ModelFamily `json:"model_family"` ModelFormat string `json:"model_format"`
ModelType string `json:"model_type"` ModelFamily string `json:"model_family"`
FileType string `json:"file_type"` ModelType string `json:"model_type"`
RootFS RootFS `json:"rootfs"` FileType string `json:"file_type"`
RootFS RootFS `json:"rootfs"`
// required by spec // required by spec
Architecture string `json:"architecture"` Architecture string `json:"architecture"`
@@ -267,7 +267,30 @@ func filenameWithPath(path, f string) (string, error) {
return f, nil return f, nil
} }
func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error { func CreateModel(ctx context.Context, workDir, name string, path string, fn func(resp api.ProgressResponse)) error {
mp := ParseModelPath(name)
var manifest *ManifestV2
var err error
var noprune string
// build deleteMap to prune unused layers
deleteMap := make(map[string]bool)
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
if manifest != nil {
for _, l := range manifest.Layers {
deleteMap[l.Digest] = true
}
deleteMap[manifest.Config.Digest] = true
}
}
mf, err := os.Open(path) mf, err := os.Open(path)
if err != nil { if err != nil {
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)}) fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)})
@@ -328,14 +351,15 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
} }
defer file.Close() defer file.Close()
ggml, err := llm.DecodeGGML(file, llm.ModelFamilyLlama) ggml, err := llm.DecodeGGML(file)
if err != nil { if err != nil {
return err return err
} }
config.ModelFormat = ggml.Name()
config.ModelFamily = ggml.ModelFamily() config.ModelFamily = ggml.ModelFamily()
config.ModelType = ggml.ModelType().String() config.ModelType = ggml.ModelType()
config.FileType = ggml.FileType().String() config.FileType = ggml.FileType()
// reset the file // reset the file
file.Seek(0, io.SeekStart) file.Seek(0, io.SeekStart)
@@ -366,9 +390,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err return err
} }
// copie the model metadata // copy the model metadata
config.ModelFamily = source.ModelFamily config.ModelFamily = source.ModelFamily
config.ModelType = source.ModelType config.ModelType = source.ModelType
config.ModelFormat = source.ModelFormat
config.FileType = source.FileType config.FileType = source.FileType
for _, l := range mf.Layers { for _, l := range mf.Layers {
@@ -435,8 +460,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err return err
} }
layer.MediaType = mediaType if layer.Size > 0 {
layers = append(layers, layer) layer.MediaType = mediaType
layers = append(layers, layer)
}
case "template", "system", "prompt": case "template", "system", "prompt":
fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)}) fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)})
// remove the layer if one exists // remove the layer if one exists
@@ -448,8 +475,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err return err
} }
layer.MediaType = mediaType if layer.Size > 0 {
layers = append(layers, layer) layer.MediaType = mediaType
layers = append(layers, layer)
}
default: default:
// runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences) // runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences)
params[c.Name] = append(params[c.Name], c.Args) params[c.Name] = append(params[c.Name], c.Args)
@@ -472,6 +501,12 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
} }
} }
if config.ModelType == "65B" {
if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
config.ModelType = "70B"
}
}
bts, err := json.Marshal(formattedParams) bts, err := json.Marshal(formattedParams)
if err != nil { if err != nil {
return err return err
@@ -489,7 +524,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
} }
// generate the embedding layers // generate the embedding layers
embeddingLayers, err := embeddingLayers(embed) embeddingLayers, err := embeddingLayers(workDir, embed)
if err != nil { if err != nil {
return err return err
} }
@@ -503,6 +538,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
var manifestLayers []*Layer var manifestLayers []*Layer
for _, l := range layers { for _, l := range layers {
manifestLayers = append(manifestLayers, &l.Layer) manifestLayers = append(manifestLayers, &l.Layer)
delete(deleteMap, l.Layer.Digest)
} }
// Create a layer for the config object // Create a layer for the config object
@@ -512,6 +548,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err
}
layers = append(layers, cfg)
+ delete(deleteMap, cfg.Layer.Digest)
if err := SaveLayers(layers, fn, false); err != nil {
return err
@@ -524,6 +561,14 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err
}
+ if noprune == "" {
+ fn(api.ProgressResponse{Status: "removing any unused layers"})
+ err = deleteUnusedLayers(nil, deleteMap, false)
+ if err != nil {
+ return err
+ }
+ }
fn(api.ProgressResponse{Status: "success"})
return nil
}
@@ -536,7 +581,7 @@ type EmbeddingParams struct {
}
// embeddingLayers loads the associated LLM and generates the embeddings to be stored from an input file
- func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
+ func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error) {
layers := []*LayerReader{}
if len(e.files) > 0 {
// check if the model is a file path or a model name
@@ -549,7 +594,7 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
model = &Model{ModelPath: e.model}
}
- if err := load(context.Background(), model, e.opts, defaultSessionDuration); err != nil {
+ if err := load(context.Background(), workDir, model, e.opts, defaultSessionDuration); err != nil {
return nil, fmt.Errorf("load model to generate embeddings: %v", err)
}
@@ -779,14 +824,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
return nil, fmt.Errorf("invalid float value %s", vals) return nil, fmt.Errorf("invalid float value %s", vals)
} }
out[key] = floatVal out[key] = float32(floatVal)
case reflect.Int: case reflect.Int:
intVal, err := strconv.ParseInt(vals[0], 10, 0) intVal, err := strconv.ParseInt(vals[0], 10, 64)
if err != nil { if err != nil {
return nil, fmt.Errorf("invalid int value %s", vals) return nil, fmt.Errorf("invalid int value %s", vals)
} }
out[key] = intVal out[key] = int(intVal)
case reflect.Bool: case reflect.Bool:
boolVal, err := strconv.ParseBool(vals[0]) boolVal, err := strconv.ParseBool(vals[0])
if err != nil { if err != nil {
@@ -866,18 +911,7 @@ func CopyModel(src, dest string) error {
return nil
}
- func DeleteModel(name string) error {
- mp := ParseModelPath(name)
- manifest, _, err := GetManifest(mp)
- if err != nil {
- return err
- }
- deleteMap := make(map[string]bool)
- for _, layer := range manifest.Layers {
- deleteMap[layer.Digest] = true
- }
- deleteMap[manifest.Config.Digest] = true
+ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dryRun bool) error {
fp, err := GetManifestPath()
if err != nil {
return err
@@ -894,14 +928,13 @@ func DeleteModel(name string) error {
fmp := ParseModelPath(tag)
// skip the manifest we're trying to delete
- if mp.GetFullTagname() == fmp.GetFullTagname() {
+ if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
return nil
}
// save (i.e. delete from the deleteMap) any files used in other manifests
manifest, _, err := GetManifest(fmp)
if err != nil {
- log.Printf("skipping file: %s", fp)
return nil
}
@@ -925,14 +958,72 @@ func DeleteModel(name string) error {
log.Printf("couldn't get file path for '%s': %v", k, err) log.Printf("couldn't get file path for '%s': %v", k, err)
continue continue
} }
if err := os.Remove(fp); err != nil { if !dryRun {
log.Printf("couldn't remove file '%s': %v", fp, err) if err := os.Remove(fp); err != nil {
continue log.Printf("couldn't remove file '%s': %v", fp, err)
continue
}
} else {
log.Printf("wanted to remove: %s", fp)
} }
} }
} }
fp, err = mp.GetManifestPath(false) return nil
}
func PruneLayers() error {
deleteMap := make(map[string]bool)
p, err := GetBlobsPath("")
if err != nil {
return err
}
blobs, err := os.ReadDir(p)
if err != nil {
log.Printf("couldn't read dir '%s': %v", p, err)
return err
}
for _, blob := range blobs {
name := blob.Name()
if runtime.GOOS == "windows" {
name = strings.ReplaceAll(name, "-", ":")
}
deleteMap[name] = true
}
log.Printf("total blobs: %d", len(deleteMap))
err = deleteUnusedLayers(nil, deleteMap, false)
if err != nil {
return err
}
log.Printf("total unused blobs removed: %d", len(deleteMap))
return nil
}
func DeleteModel(name string) error {
mp := ParseModelPath(name)
manifest, _, err := GetManifest(mp)
if err != nil {
return err
}
deleteMap := make(map[string]bool)
for _, layer := range manifest.Layers {
deleteMap[layer.Digest] = true
}
deleteMap[manifest.Config.Digest] = true
err = deleteUnusedLayers(&mp, deleteMap, false)
if err != nil {
return err
}
fp, err := mp.GetManifestPath(false)
if err != nil { if err != nil {
return err return err
} }
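For context, a hedged sketch of how the refactored pieces above could be driven from a hypothetical caller (not part of this diff): DeleteModel now only seeds the deleteMap for a single model and hands the filesystem walk to deleteUnusedLayers, while PruneLayers seeds it from every blob on disk. Assuming these exported functions live in the repository's server package:

package main

import (
    "log"
    "os"

    "github.com/jmorganca/ollama/server"
)

func main() {
    // Unless pruning is disabled, remove any blobs no manifest references.
    if os.Getenv("OLLAMA_NOPRUNE") == "" {
        if err := server.PruneLayers(); err != nil {
            log.Printf("prune failed: %v", err)
        }
    }

    // Deleting a model reuses the same walk via deleteUnusedLayers.
    if err := server.DeleteModel("llama2:latest"); err != nil { // model name is illustrative
        log.Printf("delete failed: %v", err)
    }
}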
@@ -1063,14 +1154,14 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
Total: layer.Size,
})
- location, err := startUpload(ctx, mp, layer, regOpts)
+ location, chunkSize, err := startUpload(ctx, mp, layer, regOpts)
if err != nil {
log.Printf("couldn't start upload: %v", err)
return err
}
- if strings.HasPrefix(path.Base(location.Path), "sha256:") {
- layer.Digest = path.Base(location.Path)
+ if strings.HasPrefix(filepath.Base(location.Path), "sha256:") {
+ layer.Digest = filepath.Base(location.Path)
fn(api.ProgressResponse{
Status: "using existing layer",
Digest: layer.Digest,
@@ -1080,7 +1171,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
continue
}
- if err := uploadBlobChunked(ctx, location, layer, regOpts, fn); err != nil {
+ if err := uploadBlob(ctx, location, layer, chunkSize, regOpts, fn); err != nil {
log.Printf("error uploading blob: %v", err)
return err
}
@@ -1111,13 +1202,34 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name)
+ var manifest *ManifestV2
+ var err error
+ var noprune string
+ // build deleteMap to prune unused layers
+ deleteMap := make(map[string]bool)
+ if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+ manifest, _, err = GetManifest(mp)
+ if err != nil && !errors.Is(err, os.ErrNotExist) {
+ return err
+ }
+ if manifest != nil {
+ for _, l := range manifest.Layers {
+ deleteMap[l.Digest] = true
+ }
+ deleteMap[manifest.Config.Digest] = true
+ }
+ }
if mp.ProtocolScheme == "http" && !regOpts.Insecure {
return fmt.Errorf("insecure protocol http")
}
fn(api.ProgressResponse{Status: "pulling manifest"})
- manifest, err := pullModelManifest(ctx, mp, regOpts)
+ manifest, err = pullModelManifest(ctx, mp, regOpts)
if err != nil {
return fmt.Errorf("pull model manifest: %s", err)
}
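The block above records the digests of any already-installed manifest before pulling; whatever is still left in deleteMap after the new layers are accounted for is what the prune pass removes. A self-contained sketch of that set arithmetic (simplified types and illustrative digests, not the repository's code):

package main

import "fmt"

type manifest struct {
    Layers []string // layer digests
    Config string   // config digest
}

// prunable returns the digests referenced by the previous manifest but not by
// the newly pulled one.
func prunable(old, pulled manifest) []string {
    deleteMap := map[string]bool{}
    for _, d := range old.Layers {
        deleteMap[d] = true
    }
    deleteMap[old.Config] = true

    // anything the new manifest still uses is kept
    for _, d := range pulled.Layers {
        delete(deleteMap, d)
    }
    delete(deleteMap, pulled.Config)

    var out []string
    for d := range deleteMap {
        out = append(out, d)
    }
    return out
}

func main() {
    old := manifest{Layers: []string{"sha256:a", "sha256:b"}, Config: "sha256:c"}
    pulled := manifest{Layers: []string{"sha256:a", "sha256:d"}, Config: "sha256:e"}
    fmt.Println(prunable(old, pulled)) // [sha256:b sha256:c], order may vary
}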
@@ -1137,7 +1249,9 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
}); err != nil {
return err
}
+ delete(deleteMap, layer.Digest)
}
+ delete(deleteMap, manifest.Config.Digest)
fn(api.ProgressResponse{Status: "verifying sha256 digest"})
for _, layer := range layers {
@@ -1175,6 +1289,14 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
return err
}
+ if noprune == "" {
+ fn(api.ProgressResponse{Status: "removing any unused layers"})
+ err = deleteUnusedLayers(nil, deleteMap, false)
+ if err != nil {
+ return err
+ }
+ }
fn(api.ProgressResponse{Status: "success"})
return nil
@@ -1275,7 +1397,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
case resp.StatusCode == http.StatusUnauthorized:
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
- token, err := getAuthToken(ctx, authRedir, regOpts)
+ token, err := getAuthToken(ctx, authRedir)
if err != nil {
return nil, err
}
@@ -1300,7 +1422,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
}
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
- if requestURL.Scheme != "http" && regOpts.Insecure {
+ if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
requestURL.Scheme = "http"
}
@@ -1313,14 +1435,25 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
req.Header = headers
}
- if regOpts.Token != "" {
- req.Header.Set("Authorization", "Bearer "+regOpts.Token)
- } else if regOpts.Username != "" && regOpts.Password != "" {
- req.SetBasicAuth(regOpts.Username, regOpts.Password)
+ if regOpts != nil {
+ if regOpts.Token != "" {
+ req.Header.Set("Authorization", "Bearer "+regOpts.Token)
+ } else if regOpts.Username != "" && regOpts.Password != "" {
+ req.SetBasicAuth(regOpts.Username, regOpts.Password)
+ }
}
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
+ if s := req.Header.Get("Content-Length"); s != "" {
+ contentLength, err := strconv.ParseInt(s, 10, 64)
+ if err != nil {
+ return nil, err
+ }
+ req.ContentLength = contentLength
+ }
client := &http.Client{
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
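One reason for the Content-Length handling above: net/http only infers ContentLength for a few concrete body types (*bytes.Buffer, *bytes.Reader, *strings.Reader), so a section reader used for chunked uploads would otherwise be sent with chunked transfer encoding. A minimal standalone sketch of mirroring the header into the request (illustrative URL and sizes):

package main

import (
    "fmt"
    "io"
    "net/http"
    "strconv"
    "strings"
)

func main() {
    // An io.SectionReader is not one of the types net/http special-cases,
    // so req.ContentLength stays 0 unless it is set explicitly.
    body := io.NewSectionReader(strings.NewReader("hello"), 0, 5)
    req, err := http.NewRequest(http.MethodPatch, "http://registry.example/upload", body)
    if err != nil {
        panic(err)
    }
    req.Header.Set("Content-Length", strconv.Itoa(5))

    // Copy the header into the request, as makeRequest does above.
    if s := req.Header.Get("Content-Length"); s != "" {
        n, err := strconv.ParseInt(s, 10, 64)
        if err != nil {
            panic(err)
        }
        req.ContentLength = n
    }
    fmt.Println("ContentLength:", req.ContentLength)
}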

View File

@@ -133,7 +133,12 @@ func GetBlobsPath(digest string) (string, error) {
}
path := filepath.Join(home, ".ollama", "models", "blobs", digest)
- if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+ dirPath := filepath.Dir(path)
+ if digest == "" {
+ dirPath = path
+ }
+ if err := os.MkdirAll(dirPath, 0o755); err != nil {
return "", err
}

View File

@@ -12,6 +12,7 @@ import (
"os/signal" "os/signal"
"path/filepath" "path/filepath"
"reflect" "reflect"
"runtime"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -57,7 +58,7 @@ var loaded struct {
var defaultSessionDuration = 5 * time.Minute
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
- func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
+ func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
opts := api.DefaultOptions()
if err := opts.FromMap(model.Options); err != nil {
log.Printf("could not load model options: %v", err)
@@ -93,7 +94,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
loaded.Embeddings = model.Embeddings
}
- llmModel, err := llm.New(model.ModelPath, model.AdapterPaths, opts)
+ llmModel, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, opts)
if err != nil {
return err
}
@@ -129,6 +130,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
llmModel.SetOptions(opts)
}
}
loaded.expireAt = time.Now().Add(sessionDuration)
if loaded.expireTimer == nil {
@@ -149,6 +151,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
loaded.digest = ""
})
}
loaded.expireTimer.Reset(sessionDuration)
return nil
}
@@ -171,8 +174,11 @@ func GenerateHandler(c *gin.Context) {
return
}
- sessionDuration := defaultSessionDuration // TODO: set this duration from the request if specified
- if err := load(c.Request.Context(), model, req.Options, sessionDuration); err != nil {
+ workDir := c.GetString("workDir")
+ // TODO: set this duration from the request if specified
+ sessionDuration := defaultSessionDuration
+ if err := load(c.Request.Context(), workDir, model, req.Options, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -217,8 +223,13 @@ func GenerateHandler(c *gin.Context) {
ch <- r
}
- if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
- ch <- gin.H{"error": err.Error()}
+ // an empty request loads the model
+ if req.Prompt == "" && req.Template == "" && req.System == "" {
+ ch <- api.GenerateResponse{Model: req.Model, Done: true}
+ } else {
+ if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
+ ch <- gin.H{"error": err.Error()}
+ }
}
}()
@@ -240,7 +251,9 @@ func EmbeddingHandler(c *gin.Context) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
- if err := load(c.Request.Context(), model, req.Options, 5*time.Minute); err != nil {
+ workDir := c.GetString("workDir")
+ if err := load(c.Request.Context(), workDir, model, req.Options, 5*time.Minute); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
@@ -330,6 +343,8 @@ func CreateModelHandler(c *gin.Context) {
return
}
+ workDir := c.GetString("workDir")
ch := make(chan any)
go func() {
defer close(ch)
@@ -340,7 +355,7 @@ func CreateModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
- if err := CreateModel(ctx, req.Name, req.Path, fn); err != nil {
+ if err := CreateModel(ctx, workDir, req.Name, req.Path, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -363,6 +378,7 @@ func DeleteModelHandler(c *gin.Context) {
}
return
}
+ c.JSON(http.StatusOK, nil)
}
func ShowModelHandler(c *gin.Context) {
@@ -493,33 +509,40 @@ func CopyModelHandler(c *gin.Context) {
}
}
- func Serve(ln net.Listener, origins []string) error {
+ var defaultAllowOrigins = []string{
+ "localhost",
+ "127.0.0.1",
+ "0.0.0.0",
+ }
+ func Serve(ln net.Listener, allowOrigins []string) error {
config := cors.DefaultConfig()
config.AllowWildcard = true
- config.AllowOrigins = append(origins, []string{
- "http://localhost",
- "http://localhost:*",
- "https://localhost",
- "https://localhost:*",
- "http://127.0.0.1",
- "http://127.0.0.1:*",
- "https://127.0.0.1",
- "https://127.0.0.1:*",
- "http://0.0.0.0",
- "http://0.0.0.0:*",
- "https://0.0.0.0",
- "https://0.0.0.0:*",
- }...)
+ config.AllowOrigins = allowOrigins
+ for _, allowOrigin := range defaultAllowOrigins {
+ config.AllowOrigins = append(config.AllowOrigins,
+ fmt.Sprintf("http://%s", allowOrigin),
+ fmt.Sprintf("https://%s", allowOrigin),
+ fmt.Sprintf("http://%s:*", allowOrigin),
+ fmt.Sprintf("https://%s:*", allowOrigin),
+ )
+ }
+ workDir, err := os.MkdirTemp("", "ollama")
+ if err != nil {
+ return err
+ }
+ defer os.RemoveAll(workDir)
r := gin.Default()
- r.Use(cors.New(config))
- r.GET("/", func(c *gin.Context) {
- c.String(http.StatusOK, "Ollama is running")
- })
- r.HEAD("/", func(c *gin.Context) {
- c.Status(http.StatusOK)
- })
+ r.Use(
+ cors.New(config),
+ func(c *gin.Context) {
+ c.Set("workDir", workDir)
+ c.Next()
+ },
+ )
r.POST("/api/pull", PullModelHandler)
r.POST("/api/generate", GenerateHandler)
@@ -527,10 +550,17 @@ func Serve(ln net.Listener, origins []string) error {
r.POST("/api/create", CreateModelHandler) r.POST("/api/create", CreateModelHandler)
r.POST("/api/push", PushModelHandler) r.POST("/api/push", PushModelHandler)
r.POST("/api/copy", CopyModelHandler) r.POST("/api/copy", CopyModelHandler)
r.GET("/api/tags", ListModelsHandler)
r.DELETE("/api/delete", DeleteModelHandler) r.DELETE("/api/delete", DeleteModelHandler)
r.POST("/api/show", ShowModelHandler) r.POST("/api/show", ShowModelHandler)
for _, method := range []string{http.MethodGet, http.MethodHead} {
r.Handle(method, "/", func(c *gin.Context) {
c.String(http.StatusOK, "Ollama is running")
})
r.Handle(method, "/api/tags", ListModelsHandler)
}
log.Printf("Listening on %s", ln.Addr()) log.Printf("Listening on %s", ln.Addr())
s := &http.Server{ s := &http.Server{
Handler: r, Handler: r,
@@ -538,15 +568,23 @@ func Serve(ln net.Listener, origins []string) error {
// listen for a ctrl+c and stop any loaded llm
signals := make(chan os.Signal, 1)
- signal.Notify(signals, syscall.SIGINT)
+ signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-signals
if loaded.llm != nil {
loaded.llm.Close()
}
+ os.RemoveAll(workDir)
os.Exit(0)
}()
+ if runtime.GOOS == "linux" {
+ // check compatibility to log warnings
+ if _, err := llm.CheckVRAM(); err != nil {
+ log.Printf("Warning: GPU support may not be enabled, check that you have installed GPU drivers: %v", err)
+ }
+ }
return s.Serve(ln)
}

View File

@@ -14,7 +14,12 @@ import (
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
) )
func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, error) { const (
redirectChunkSize = 1024 * 1024 * 1024
regularChunkSize = 95 * 1024 * 1024
)
func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, int64, error) {
requestURL := mp.BaseURL() requestURL := mp.BaseURL()
requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/") requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/")
if layer.From != "" { if layer.From != "" {
@@ -27,20 +32,26 @@ func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *Regis
resp, err := makeRequestWithRetry(ctx, "POST", requestURL, nil, nil, regOpts)
if err != nil {
log.Printf("couldn't start upload: %v", err)
- return nil, err
+ return nil, 0, err
}
defer resp.Body.Close()
- // Extract UUID location from header
- location := resp.Header.Get("Location")
+ location := resp.Header.Get("Docker-Upload-Location")
+ chunkSize := redirectChunkSize
if location == "" {
- return nil, fmt.Errorf("location header is missing in response")
+ location = resp.Header.Get("Location")
+ chunkSize = regularChunkSize
}
- return url.Parse(location)
+ locationURL, err := url.Parse(location)
+ if err != nil {
+ return nil, 0, err
+ }
+ return locationURL, int64(chunkSize), nil
}
- func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
+ func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSize int64, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
// TODO allow resumability
// TODO allow canceling uploads via DELETE
@@ -55,8 +66,12 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
}
defer f.Close()
- // 95MB chunk size
- chunkSize := 95 * 1024 * 1024
+ pw := ProgressWriter{
+ status: fmt.Sprintf("uploading %s", layer.Digest),
+ digest: layer.Digest,
+ total: layer.Size,
+ fn: fn,
+ }
for offset := int64(0); offset < int64(layer.Size); {
chunk := int64(layer.Size) - offset
@@ -64,80 +79,27 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
chunk = int64(chunkSize)
}
- sectionReader := io.NewSectionReader(f, int64(offset), chunk)
- for try := 0; try < MaxRetries; try++ {
- r, w := io.Pipe()
- defer r.Close()
- go func() {
- defer w.Close()
- for chunked := int64(0); chunked < chunk; {
- n, err := io.CopyN(w, sectionReader, 1024*1024)
- if err != nil && !errors.Is(err, io.EOF) {
- fn(api.ProgressResponse{
- Status: fmt.Sprintf("error reading chunk: %v", err),
- Digest: layer.Digest,
- Total: layer.Size,
- Completed: int(offset),
- })
- return
- }
- chunked += n
- fn(api.ProgressResponse{
- Status: fmt.Sprintf("uploading %s", layer.Digest),
- Digest: layer.Digest,
- Total: layer.Size,
- Completed: int(offset) + int(chunked),
- })
- }
- }()
- headers := make(http.Header)
- headers.Set("Content-Type", "application/octet-stream")
- headers.Set("Content-Length", strconv.Itoa(int(chunk)))
- headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
- resp, err := makeRequest(ctx, "PATCH", requestURL, headers, r, regOpts)
- if err != nil && !errors.Is(err, io.EOF) {
- fn(api.ProgressResponse{
- Status: fmt.Sprintf("error uploading chunk: %v", err),
- Digest: layer.Digest,
- Total: layer.Size,
- Completed: int(offset),
- })
- return err
- }
- defer resp.Body.Close()
- switch {
- case resp.StatusCode == http.StatusUnauthorized:
- auth := resp.Header.Get("www-authenticate")
- authRedir := ParseAuthRedirectString(auth)
- token, err := getAuthToken(ctx, authRedir, regOpts)
- if err != nil {
- return err
- }
- regOpts.Token = token
- if _, err := sectionReader.Seek(0, io.SeekStart); err != nil {
- return err
- }
- continue
- case resp.StatusCode >= http.StatusBadRequest:
- body, _ := io.ReadAll(resp.Body)
- return fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
- }
- offset += sectionReader.Size()
- requestURL, err = url.Parse(resp.Header.Get("Location"))
- if err != nil {
- return err
- }
- break
+ resp, err := uploadBlobChunk(ctx, http.MethodPatch, requestURL, f, offset, chunk, regOpts, &pw)
+ if err != nil {
+ fn(api.ProgressResponse{
+ Status: fmt.Sprintf("error uploading chunk: %v", err),
+ Digest: layer.Digest,
+ Total: layer.Size,
+ Completed: int(offset),
+ })
+ return err
+ }
+ offset += chunk
+ location := resp.Header.Get("Docker-Upload-Location")
+ if location == "" {
+ location = resp.Header.Get("Location")
+ }
+ requestURL, err = url.Parse(location)
+ if err != nil {
+ return err
+ }
}
}
@@ -163,3 +125,90 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
}
return nil
}
+ func uploadBlobChunk(ctx context.Context, method string, requestURL *url.URL, r io.ReaderAt, offset, limit int64, opts *RegistryOptions, pw *ProgressWriter) (*http.Response, error) {
+ sectionReader := io.NewSectionReader(r, int64(offset), limit)
+ headers := make(http.Header)
+ headers.Set("Content-Type", "application/octet-stream")
+ headers.Set("Content-Length", strconv.Itoa(int(limit)))
+ headers.Set("X-Redirect-Uploads", "1")
+ if method == http.MethodPatch {
+ headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
+ }
+ for try := 0; try < MaxRetries; try++ {
+ resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sectionReader, pw), opts)
+ if err != nil && !errors.Is(err, io.EOF) {
+ return nil, err
+ }
+ defer resp.Body.Close()
+ switch {
+ case resp.StatusCode == http.StatusTemporaryRedirect:
+ location, err := resp.Location()
+ if err != nil {
+ return nil, err
+ }
+ pw.completed = int(offset)
+ if _, err := uploadBlobChunk(ctx, http.MethodPut, location, r, offset, limit, nil, pw); err != nil {
+ // retry
+ log.Printf("retrying redirected upload: %v", err)
+ continue
+ }
+ return resp, nil
+ case resp.StatusCode == http.StatusUnauthorized:
+ auth := resp.Header.Get("www-authenticate")
+ authRedir := ParseAuthRedirectString(auth)
+ token, err := getAuthToken(ctx, authRedir)
+ if err != nil {
+ return nil, err
+ }
+ opts.Token = token
+ pw.completed = int(offset)
+ sectionReader = io.NewSectionReader(r, offset, limit)
+ continue
+ case resp.StatusCode >= http.StatusBadRequest:
+ body, _ := io.ReadAll(resp.Body)
+ return nil, fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
+ }
+ return resp, nil
+ }
+ return nil, fmt.Errorf("max retries exceeded")
+ }
+ type ProgressWriter struct {
+ status string
+ digest string
+ bucket int
+ completed int
+ total int
+ fn func(api.ProgressResponse)
+ }
+ func (pw *ProgressWriter) Write(b []byte) (int, error) {
+ n := len(b)
+ pw.bucket += n
+ pw.completed += n
+ // throttle status updates to not spam the client
+ if pw.bucket >= 1024*1024 || pw.completed >= pw.total {
+ pw.fn(api.ProgressResponse{
+ Status: pw.status,
+ Digest: pw.digest,
+ Total: pw.total,
+ Completed: pw.completed,
+ })
+ pw.bucket = 0
+ }
+ return n, nil
+ }
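To round out the upload changes: ProgressWriter is designed to sit behind io.TeeReader, so every byte read from the request body also passes through Write, and a status callback fires at most roughly once per mebibyte plus once at completion. A self-contained sketch of that pattern (simplified fields, not the repository's code):

package main

import (
    "bytes"
    "fmt"
    "io"
)

type progressWriter struct {
    bucket, completed, total int
    fn                       func(completed, total int)
}

func (pw *progressWriter) Write(b []byte) (int, error) {
    n := len(b)
    pw.bucket += n
    pw.completed += n
    // throttle updates to roughly one per MiB, plus the final one
    if pw.bucket >= 1024*1024 || pw.completed >= pw.total {
        pw.fn(pw.completed, pw.total)
        pw.bucket = 0
    }
    return n, nil
}

func main() {
    payload := bytes.Repeat([]byte("x"), 3*1024*1024) // stand-in for a blob
    pw := &progressWriter{total: len(payload), fn: func(done, total int) {
        fmt.Printf("uploaded %d/%d bytes\n", done, total)
    }}

    // io.TeeReader reports progress as the "upload" consumes the body.
    body := io.TeeReader(bytes.NewReader(payload), pw)
    if _, err := io.Copy(io.Discard, body); err != nil { // stands in for the HTTP request
        panic(err)
    }
}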