add word wrapping for lines which are longer than the terminal width (#553 )

Merge pull request #571 from jmorganca/mxyng/update-dockerfile
update dockerfile.cuda
2023-09-22 13:36:08 -07:00 · 2023-09-22 12:34:41 -07:00 · 2023-09-22 15:20:12 -04:00 · 2023-09-22 12:17:45 -07:00 · 2023-09-22 11:57:38 -07:00 · 2023-09-22 11:57:38 -07:00
38 changed files with 1459 additions and 645 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,5 +1,7 @@
 .vscode
 ollama
 app
+dist
+scripts
 llm/llama.cpp/ggml
 llm/llama.cpp/gguf
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,4 +6,5 @@
 [submodule "llm/llama.cpp/gguf"]
    path = llm/llama.cpp/gguf
    url = https://github.com/ggerganov/llama.cpp.git
+    ignore = dirty
    shallow = true
--- a/22
+++ b/22
@@ -1,18 +1,28 @@
-FROM golang:alpine
+ARG CUDA_VERSION=12.2.0
+
+FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04
+
+ARG TARGETARCH
+ARG VERSION=0.0.0

 WORKDIR /go/src/github.com/jmorganca/ollama
-RUN apk add --no-cache git build-base cmake
+RUN apt-get update && apt-get install -y git build-essential cmake
+ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz

 COPY . .
-RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .
+ENV GOARCH=$TARGETARCH
+RUN /usr/local/go/bin/go generate ./... \
+    && /usr/local/go/bin/go build -ldflags "-linkmode=external -extldflags='-static' -X=github.com/jmorganca/ollama/version.Version=$VERSION -X=github.com/jmorganca/ollama/server.mode=release" .

-FROM alpine
+FROM ubuntu:22.04
 ENV OLLAMA_HOST 0.0.0.0
-RUN apk add --no-cache libstdc++
+
+RUN apt-get update && apt-get install -y ca-certificates

 ARG USER=ollama
 ARG GROUP=ollama
-RUN addgroup $GROUP && adduser -D -G $GROUP $USER
+RUN groupadd $GROUP && useradd -m -g $GROUP $USER

 COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -0,0 +1,29 @@
+ARG VERSION=0.0.0
+
+# centos7 amd64 dependencies
+FROM --platform=linux/amd64 nvidia/cuda:11.8.0-devel-centos7 AS base-amd64
+RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
+    yum update -y && \
+    yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
+RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+
+# centos8 arm64 dependencies
+FROM --platform=linux/arm64 nvidia/cuda:11.4.3-devel-centos8 AS base-arm64
+RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
+RUN yum install -y git cmake
+
+FROM base-${TARGETARCH}
+ARG TARGETARCH
+# an ARG declared before the first FROM is out of scope inside a stage; re-declare VERSION to inherit its default
+ARG VERSION
+# install go
+ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
+
+# build the final binary
+WORKDIR /go/src/github.com/jmorganca/ollama
+COPY . .
+ENV GOARCH=$TARGETARCH
+RUN /usr/local/go/bin/go generate ./... && \
+    /usr/local/go/bin/go build -ldflags "-X=github.com/jmorganca/ollama/version.Version=$VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
--- a/README.md
+++ b/README.md
@@ -206,10 +206,16 @@ curl -X POST http://localhost:11434/api/generate -d '{

 ## Community Projects using Ollama

- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with a question-answering [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa).
- [Continue](https://github.com/continuedev/continue) - embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline.
- [LiteLLM](https://github.com/BerriAI/litellm) a lightweight python package to simplify LLM API calls
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot) - interact with Ollama as a chatbot on Discord.
- [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) - Raycast extension to use Ollama for local llama inference on Raycast.
- [Simple HTML UI for Ollama](https://github.com/rtcfirefly/ollama-ui)
- [Emacs client](https://github.com/zweifisch/ollama) for Ollama
+| Project                                                                    | Description                                                                                                                                                  |
+| -------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [LangChain][1] and [LangChain.js][2]                                       | Also, there is a question-answering [example][3].                                                                                                            |
+| [Continue](https://github.com/continuedev/continue)                        | Embeds Ollama inside Visual Studio Code. The extension lets you highlight code to add to the prompt, ask questions in the sidebar, and generate code inline. |
+| [LiteLLM](https://github.com/BerriAI/litellm)                              | Lightweight Python package to simplify LLM API calls.                                                                                                        |
+| [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)            | Interact with Ollama as a chatbot on Discord.                                                                                                                |
+| [Raycast Ollama](https://github.com/MassimilianoPasquini97/raycast_ollama) | Raycast extension to use Ollama for local llama inference on Raycast.                                                                                        |
+| [Simple HTML UI](https://github.com/rtcfirefly/ollama-ui)                  | Also, there is a Chrome extension.                                                                                                                           |
+| [Emacs client](https://github.com/zweifisch/ollama)                        |                                                                                                                                                              |
+
+[1]: https://python.langchain.com/docs/integrations/llms/ollama
+[2]: https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama
+[3]: https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa
--- a/api/client.py
+++ b/api/client.py
@@ -0,0 +1,225 @@
+import os
+import json
+import requests
+
+BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
+
+# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so the server sends a series of responses.
+# The final response object will include statistics and additional data from the request. Use the callback function to override
+# the default handler.
+def generate(model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
+    try:
+        url = f"{BASE_URL}/api/generate"
+        payload = {
+            "model": model_name, 
+            "prompt": prompt, 
+            "system": system, 
+            "template": template, 
+            "context": context, 
+            "options": options
+        }
+        
+        # Remove keys with None values
+        payload = {k: v for k, v in payload.items() if v is not None}
+        
+        with requests.post(url, json=payload, stream=True) as response:
+            response.raise_for_status()
+            
+            # Creating a variable to hold the context history of the final chunk
+            final_context = None
+            
+            # Variable to hold concatenated response strings if no callback is provided
+            full_response = ""
+
+            # Iterating over the response line by line and displaying the details
+            for line in response.iter_lines():
+                if line:
+                    # Parsing each line (JSON chunk) and extracting the details
+                    chunk = json.loads(line)
+                    
+                    # If a callback function is provided, call it with the chunk
+                    if callback:
+                        callback(chunk)
+                    else:
+                        # If this is not the last chunk, add the "response" field value to full_response and print it
+                        if not chunk.get("done"):
+                            response_piece = chunk.get("response", "")
+                            full_response += response_piece
+                            print(response_piece, end="", flush=True)
+                    
+                    # Check if it's the last chunk (done is true)
+                    if chunk.get("done"):
+                        final_context = chunk.get("context")
+            
+            # Return the full response and the final context
+            return full_response, final_context
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return None, None
+
+# Create a model from a Modelfile. Use the callback function to override the default handler.
+def create(model_name, model_path, callback=None):
+    try:
+        url = f"{BASE_URL}/api/create"
+        payload = {"name": model_name, "path": model_path}
+        
+        # Making a POST request with the stream parameter set to True to handle streaming responses
+        with requests.post(url, json=payload, stream=True) as response:
+            response.raise_for_status()
+
+            # Iterating over the response line by line and displaying the status
+            for line in response.iter_lines():
+                if line:
+                    # Parsing each line (JSON chunk) and extracting the status
+                    chunk = json.loads(line)
+
+                    if callback:
+                        callback(chunk)
+                    else:
+                        print(f"Status: {chunk.get('status')}")
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+
+# Pull a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple
+# calls to will share the same download progress. Use the callback function to override the default handler.
+def pull(model_name, insecure=False, callback=None):
+    try:
+        url = f"{BASE_URL}/api/pull"
+        payload = {
+            "name": model_name,
+            "insecure": insecure
+        }
+
+        # Making a POST request with the stream parameter set to True to handle streaming responses
+        with requests.post(url, json=payload, stream=True) as response:
+            response.raise_for_status()
+
+            # Iterating over the response line by line and displaying the details
+            for line in response.iter_lines():
+                if line:
+                    # Parsing each line (JSON chunk) and extracting the details
+                    chunk = json.loads(line)
+
+                    # If a callback function is provided, call it with the chunk
+                    if callback:
+                        callback(chunk)
+                    else:
+                        # Print the status message directly to the console
+                        print(chunk.get('status', ''), end='', flush=True)
+                    
+                    # If there's layer data, you might also want to print that (adjust as necessary)
+                    if 'digest' in chunk:
+                        print(f" - Digest: {chunk['digest']}", end='', flush=True)
+                        print(f" - Total: {chunk['total']}", end='', flush=True)
+                        print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
+                    else:
+                        print()
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+
+# Push a model to the model registry. Use the callback function to override the default handler.
+def push(model_name, insecure=False, callback=None):
+    try:
+        url = f"{BASE_URL}/api/push"
+        payload = {
+            "name": model_name,
+            "insecure": insecure
+        }
+
+        # Making a POST request with the stream parameter set to True to handle streaming responses
+        with requests.post(url, json=payload, stream=True) as response:
+            response.raise_for_status()
+
+            # Iterating over the response line by line and displaying the details
+            for line in response.iter_lines():
+                if line:
+                    # Parsing each line (JSON chunk) and extracting the details
+                    chunk = json.loads(line)
+
+                    # If a callback function is provided, call it with the chunk
+                    if callback:
+                        callback(chunk)
+                    else:
+                        # Print the status message directly to the console
+                        print(chunk.get('status', ''), end='', flush=True)
+                    
+                    # If there's layer data, you might also want to print that (adjust as necessary)
+                    if 'digest' in chunk:
+                        print(f" - Digest: {chunk['digest']}", end='', flush=True)
+                        print(f" - Total: {chunk['total']}", end='', flush=True)
+                        print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
+                    else:
+                        print()
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+
+# List models that are available locally.
+def list():
+    try:
+        response = requests.get(f"{BASE_URL}/api/tags")
+        response.raise_for_status()
+        data = response.json()
+        models = data.get('models', [])
+        return models
+
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return None
+
+# Copy a model. Creates a model with another name from an existing model.
+def copy(source, destination):
+    try:
+        # Create the JSON payload
+        payload = {
+            "source": source,
+            "destination": destination
+        }
+        
+        response = requests.post(f"{BASE_URL}/api/copy", json=payload)
+        response.raise_for_status()
+        
+        # If the request was successful, return a message indicating that the copy was successful
+        return "Copy successful"
+
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return None
+
+# Delete a model and its data.
+def delete(model_name):
+    try:
+        url = f"{BASE_URL}/api/delete"
+        payload = {"name": model_name}
+        response = requests.delete(url, json=payload)
+        response.raise_for_status()
+        return "Delete successful"
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return None
+
+# Show info about a model.
+def show(model_name):
+    try:
+        url = f"{BASE_URL}/api/show"
+        payload = {"name": model_name}
+        response = requests.post(url, json=payload)
+        response.raise_for_status()
+        
+        # Parse the JSON response and return it
+        data = response.json()
+        return data
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return None
+
+def heartbeat():
+    try:
+        url = f"{BASE_URL}/"
+        response = requests.head(url)
+        response.raise_for_status()
+        return "Ollama is running"
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return "Ollama is not running"
+
+
--- a/api/types.go
+++ b/api/types.go
@@ -291,7 +291,7 @@ func DefaultOptions() Options {
 		NumCtx:             2048,
 		NumKeep:            -1,
 		NumBatch:           512,
-		NumGPU:             1,
+		NumGPU:             -1, // -1 here indicates that NumGPU should be set dynamically
 		NumGQA:             1,
 		LowVRAM:            false,
 		F16KV:              true,
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -11,20 +11,19 @@ import (
 	"io"
 	"log"
 	"net"
-	"net/http"
 	"os"
 	"os/exec"
-	"path"
 	"path/filepath"
 	"runtime"
 	"strings"
 	"time"

-	"github.com/chzyer/readline"
 	"github.com/dustin/go-humanize"
 	"github.com/olekukonko/tablewriter"
+	"github.com/pdevine/readline"
 	"github.com/spf13/cobra"
 	"golang.org/x/crypto/ssh"
+	"golang.org/x/term"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/format"
@@ -33,6 +32,17 @@ import (
 	"github.com/jmorganca/ollama/version"
 )

+type Painter struct{}
+
+func (p Painter) Paint(line []rune, l int) []rune {
+	termType := os.Getenv("TERM")
+	if termType == "xterm-256color" && len(line) == 0 {
+		prompt := "Send a message (/? for help)"
+		return []rune(fmt.Sprintf("\033[38;5;245m%s\033[%dD\033[0m", prompt, len(prompt)))
+	}
+	return line
+}
+
 func CreateHandler(cmd *cobra.Command, args []string) error {
 	filename, _ := cmd.Flags().GetString("file")
 	filename, err := filepath.Abs(filename)
@@ -98,39 +108,28 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 }

 func RunHandler(cmd *cobra.Command, args []string) error {
-	insecure, err := cmd.Flags().GetBool("insecure")
+	client, err := api.FromEnv()
 	if err != nil {
 		return err
 	}

-	mp := server.ParseModelPath(args[0])
+	models, err := client.List(context.Background())
 	if err != nil {
 		return err
 	}

-	if mp.ProtocolScheme == "http" && !insecure {
-		return fmt.Errorf("insecure protocol http")
+	modelName, modelTag, ok := strings.Cut(args[0], ":")
+	if !ok {
+		modelTag = "latest"
 	}

-	fp, err := mp.GetManifestPath(false)
-	if err != nil {
-		return err
-	}
-
-	_, err = os.Stat(fp)
-	switch {
-	case errors.Is(err, os.ErrNotExist):
-		if err := pull(args[0], insecure); err != nil {
-			var apiStatusError api.StatusError
-			if !errors.As(err, &apiStatusError) {
-				return err
-			}
-
-			if apiStatusError.StatusCode != http.StatusBadGateway {
-				return err
-			}
+	for _, model := range models.Models {
+		if model.Name == strings.Join([]string{modelName, modelTag}, ":") {
+			return RunGenerate(cmd, args)
 		}
-	case err != nil:
+	}
+
+	if err := PullHandler(cmd, args); err != nil {
 		return err
 	}

@@ -387,71 +386,117 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 type generateContextKey string

 func generate(cmd *cobra.Command, model, prompt string) error {
-	if len(strings.TrimSpace(prompt)) > 0 {
-		client, err := api.FromEnv()
-		if err != nil {
-			return err
-		}
-
-		spinner := NewSpinner("")
-		go spinner.Spin(60 * time.Millisecond)
-
-		var latest api.GenerateResponse
-
-		generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
-		if !ok {
-			generateContext = []int{}
-		}
-
-		request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
-		fn := func(response api.GenerateResponse) error {
-			if !spinner.IsFinished() {
-				spinner.Finish()
-			}
-
-			latest = response
-
-			fmt.Print(response.Response)
-			return nil
-		}
-
-		if err := client.Generate(context.Background(), &request, fn); err != nil {
-			if strings.Contains(err.Error(), "failed to load model") {
-				// tell the user to check the server log, if it exists locally
-				home, nestedErr := os.UserHomeDir()
-				if nestedErr != nil {
-					// return the original error
-					return err
-				}
-				logPath := filepath.Join(home, ".ollama", "logs", "server.log")
-				if _, nestedErr := os.Stat(logPath); nestedErr == nil {
-					err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
-				}
-			}
-			return err
-		}
-
-		fmt.Println()
-		fmt.Println()
-
-		if !latest.Done {
-			return errors.New("unexpected end of response")
-		}
-
-		verbose, err := cmd.Flags().GetBool("verbose")
-		if err != nil {
-			return err
-		}
-
-		if verbose {
-			latest.Summary()
-		}
-
-		ctx := cmd.Context()
-		ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
-		cmd.SetContext(ctx)
+	client, err := api.FromEnv()
+	if err != nil {
+		return err
 	}

+	spinner := NewSpinner("")
+	go spinner.Spin(60 * time.Millisecond)
+
+	var latest api.GenerateResponse
+
+	generateContext, ok := cmd.Context().Value(generateContextKey("context")).([]int)
+	if !ok {
+		generateContext = []int{}
+	}
+
+	var wrapTerm bool
+	termType := os.Getenv("TERM")
+	if termType == "xterm-256color" {
+		wrapTerm = true
+	}
+
+	termWidth, _, err := term.GetSize(int(0))
+	if err != nil {
+		wrapTerm = false
+	}
+
+	// override wrapping if the user turned it off
+	nowrap, err := cmd.Flags().GetBool("nowordwrap")
+	if err != nil {
+		return err
+	}
+	if nowrap {
+		wrapTerm = false
+	}
+
+	var currentLineLength int
+	var wordBuffer string
+
+	request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
+	fn := func(response api.GenerateResponse) error {
+		if !spinner.IsFinished() {
+			spinner.Finish()
+		}
+
+		latest = response
+
+		if wrapTerm {
+			for _, ch := range response.Response {
+				if currentLineLength+1 > termWidth-5 {
+					// backtrack the length of the last word and clear to the end of the line
+					fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
+					fmt.Printf("%s%c", wordBuffer, ch)
+					currentLineLength = len(wordBuffer) + 1
+				} else {
+					fmt.Print(string(ch))
+					currentLineLength += 1
+
+					switch ch {
+					case ' ':
+						wordBuffer = ""
+					case '\n':
+						currentLineLength = 0
+					default:
+						wordBuffer += string(ch)
+					}
+				}
+			}
+		} else {
+			fmt.Print(response.Response)
+		}
+
+		return nil
+	}
+
+	if err := client.Generate(context.Background(), &request, fn); err != nil {
+		if strings.Contains(err.Error(), "failed to load model") {
+			// tell the user to check the server log, if it exists locally
+			home, nestedErr := os.UserHomeDir()
+			if nestedErr != nil {
+				// return the original error
+				return err
+			}
+			logPath := filepath.Join(home, ".ollama", "logs", "server.log")
+			if _, nestedErr := os.Stat(logPath); nestedErr == nil {
+				err = fmt.Errorf("%w\nFor more details, check the error logs at %s", err, logPath)
+			}
+		}
+		return err
+	}
+	if prompt != "" {
+		fmt.Println()
+		fmt.Println()
+	}
+
+	if !latest.Done {
+		return errors.New("unexpected end of response")
+	}
+
+	verbose, err := cmd.Flags().GetBool("verbose")
+	if err != nil {
+		return err
+	}
+
+	if verbose {
+		latest.Summary()
+	}
+
+	ctx := cmd.Context()
+	ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
+	cmd.SetContext(ctx)
+
 	return nil
 }

@@ -461,19 +506,21 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		return err
 	}

+	// load the model
+	if err := generate(cmd, model, ""); err != nil {
+		return err
+	}
+
 	completer := readline.NewPrefixCompleter(
 		readline.PcItem("/help"),
 		readline.PcItem("/list"),
 		readline.PcItem("/set",
 			readline.PcItem("history"),
 			readline.PcItem("nohistory"),
+			readline.PcItem("wordwrap"),
+			readline.PcItem("nowordwrap"),
 			readline.PcItem("verbose"),
 			readline.PcItem("quiet"),
-			readline.PcItem("mode",
-				readline.PcItem("vim"),
-				readline.PcItem("emacs"),
-				readline.PcItem("default"),
-			),
 		),
 		readline.PcItem("/show",
 			readline.PcItem("license"),
@@ -492,6 +539,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 	}

 	config := readline.Config{
+		Painter:      Painter{},
 		Prompt:       ">>> ",
 		HistoryFile:  filepath.Join(home, ".ollama", "history"),
 		AutoComplete: completer,
@@ -531,6 +579,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				line = multiLineBuffer
 				multiLineBuffer = ""
 				scanner.SetPrompt(">>> ")
+				continue
 			} else {
 				multiLineBuffer += line + " "
 				continue
@@ -545,45 +594,42 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 			if err := ListHandler(cmd, args[1:]); err != nil {
 				return err
 			}
-
-			continue
 		case strings.HasPrefix(line, "/set"):
 			args := strings.Fields(line)
 			if len(args) > 1 {
 				switch args[1] {
 				case "history":
 					scanner.HistoryEnable()
-					continue
 				case "nohistory":
 					scanner.HistoryDisable()
-					continue
+				case "wordwrap":
+					cmd.Flags().Set("nowordwrap", "false")
+					fmt.Println("Set 'wordwrap' mode.")
+				case "nowordwrap":
+					cmd.Flags().Set("nowordwrap", "true")
+					fmt.Println("Set 'nowordwrap' mode.")
 				case "verbose":
 					cmd.Flags().Set("verbose", "true")
-					continue
+					fmt.Println("Set 'verbose' mode.")
 				case "quiet":
 					cmd.Flags().Set("verbose", "false")
-					continue
+					fmt.Println("Set 'quiet' mode.")
 				case "mode":
 					if len(args) > 2 {
 						switch args[2] {
 						case "vim":
 							scanner.SetVimMode(true)
-							continue
 						case "emacs", "default":
 							scanner.SetVimMode(false)
-							continue
 						default:
 							usage()
-							continue
 						}
 					} else {
 						usage()
-						continue
 					}
 				}
 			} else {
 				usage()
-				continue
 			}
 		case strings.HasPrefix(line, "/show"):
 			args := strings.Fields(line)
@@ -591,7 +637,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				resp, err := server.GetModelInfo(model)
 				if err != nil {
 					fmt.Println("error: couldn't get model")
-					continue
 				}

 				switch args[1] {
@@ -608,21 +653,22 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				default:
 					fmt.Println("error: unknown command")
 				}
-
-				continue
 			} else {
 				usage()
-				continue
 			}
 		case line == "/help", line == "/?":
 			usage()
-			continue
 		case line == "/exit", line == "/bye":
 			return nil
+		case strings.HasPrefix(line, "/"):
+			args := strings.Fields(line)
+			fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
 		}

-		if err := generate(cmd, model, line); err != nil {
-			return err
+		if len(line) > 0 && line[0] != '/' {
+			if err := generate(cmd, model, line); err != nil {
+				return err
+			}
 		}
 	}
 }
@@ -641,28 +687,19 @@ func generateBatch(cmd *cobra.Command, model string) error {
 }

 func RunServer(cmd *cobra.Command, _ []string) error {
-	host, port := "127.0.0.1", "11434"
-
-	parts := strings.Split(os.Getenv("OLLAMA_HOST"), ":")
-	if ip := net.ParseIP(parts[0]); ip != nil {
-		host = ip.String()
-	}
-
-	if len(parts) > 1 {
-		port = parts[1]
-	}
-
-	// deprecated: include port in OLLAMA_HOST
-	if p := os.Getenv("OLLAMA_PORT"); p != "" {
-		port = p
-	}
-
-	err := initializeKeypair()
+	host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
 	if err != nil {
+		host, port = "127.0.0.1", "11434"
+		if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
+			host = ip.String()
+		}
+	}
+
+	if err := initializeKeypair(); err != nil {
 		return err
 	}

-	ln, err := net.Listen("tcp", fmt.Sprintf("%s:%s", host, port))
+	ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
 	if err != nil {
 		return err
 	}
@@ -703,7 +740,7 @@ func initializeKeypair() error {
 			return err
 		}

-		err = os.MkdirAll(path.Dir(privKeyPath), 0o700)
+		err = os.MkdirAll(filepath.Dir(privKeyPath), 0o755)
 		if err != nil {
 			return fmt.Errorf("could not create directory %w", err)
 		}
@@ -831,6 +868,7 @@ func NewCLI() *cobra.Command {

 	runCmd.Flags().Bool("verbose", false, "Show timings for response")
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
+	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")

 	serveCmd := &cobra.Command{
 		Use:     "serve",
--- a/docs/api.md
+++ b/docs/api.md
@@ -3,18 +3,21 @@
 ## Endpoints

 - [Generate a completion](#generate-a-completion)
- [Create a model](#create-a-model)
- [List local models](#list-local-models)
- [Copy a model](#copy-a-model)
- [Delete a model](#delete-a-model)
- [Pull a model](#pull-a-model)
- [Generate embeddings](#generate-embeddings)
+- [Create a Model](#create-a-model)
+- [List Local Models](#list-local-models)
+- [Show Model Information](#show-model-information)
+- [Copy a Model](#copy-a-model)
+- [Delete a Model](#delete-a-model)
+- [Pull a Model](#pull-a-model)
+- [Push a Model](#push-a-model)
+- [Generate Embeddings](#generate-embeddings)
+

 ## Conventions

 ### Model names

-Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and if not provided will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations

@@ -22,7 +25,7 @@ All durations are returned in nanoseconds.

 ## Generate a completion

-```
+```shell
 POST /api/generate
 ```

@@ -42,7 +45,7 @@ Advanced parameters:

 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/generate -d '{
  "model": "llama2:7b",
  "prompt": "Why is the sky blue?"
@@ -95,7 +98,7 @@ To calculate how fast the response is generated in tokens per second (token/s),

 ## Create a Model

-```
+```shell
 POST /api/create
 ```

@@ -108,7 +111,7 @@ Create a model from a [`Modelfile`](./modelfile.md)

 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/create -d '{
  "name": "mario",
  "path": "~/Modelfile"
@@ -117,7 +120,7 @@ curl -X POST http://localhost:11434/api/create -d '{

 ### Response

-A stream of JSON objects. When finished, `status` is `success`
+A stream of JSON objects. When finished, `status` is `success`.

 ```json
 {
@@ -127,7 +130,7 @@ A stream of JSON objects. When finished, `status` is `success`

 ## List Local Models

-```
+```shell
 GET /api/tags
 ```

@@ -135,7 +138,7 @@ List models that are available locally.

 ### Request

-```
+```shell
 curl http://localhost:11434/api/tags
 ```

@@ -158,9 +161,40 @@ curl http://localhost:11434/api/tags
 }
 ```

+## Show Model Information
+
+```shell
+POST /api/show
+```
+
+Show details about a model including modelfile, template, parameters, license, and system prompt.
+
+### Parameters
+
+- `name`: name of the model to show
+
+### Request
+
+```shell  
+curl http://localhost:11434/api/show -d '{
+  "name": "llama2:7b"
+}'
+```
+
+### Response
+
+```json
+{
+    "license": "<contents of license block>",
+    "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
+    "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
+    "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
+}
+```
+
 ## Copy a Model

-```
+```shell
 POST /api/copy
 ```

@@ -168,7 +202,7 @@ Copy a model. Creates a model with another name from an existing model.

 ### Request

-```
+```shell
 curl http://localhost:11434/api/copy -d '{
  "source": "llama2:7b",
  "destination": "llama2-backup"
@@ -177,7 +211,7 @@ curl http://localhost:11434/api/copy -d '{

 ## Delete a Model

-```
+```shell
 DELETE /api/delete
 ```

@@ -189,7 +223,7 @@ Delete a model and its data.

 ### Request

-```
+```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
  "name": "llama2:13b"
 }'
@@ -197,19 +231,20 @@ curl -X DELETE http://localhost:11434/api/delete -d '{

 ## Pull a Model

-```
+```shell
 POST /api/pull
 ```

-Download a model from a the model registry. Cancelled pulls are resumed from where they left off, and multiple calls to will share the same download progress.
+Download a model from the ollama library. Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress.

 ### Parameters

 - `name`: name of the model to pull
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.

 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/pull -d '{
  "name": "llama2:7b"
 }'
@@ -225,9 +260,63 @@ curl -X POST http://localhost:11434/api/pull -d '{
 }
 ```

+## Push a Model
+
+```shell
+POST /api/push
+```
+
+Upload a model to a model library. Requires registering for ollama.ai and adding a public key first.
+
+### Parameters
+
+- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
+
+### Request
+
+```shell
+curl -X POST http://localhost:11434/api/push -d '{
+  "name": "mattw/pygmalion:latest"
+}'
+```
+
+### Response
+
+Streaming response that starts with:
+
+```json
+{"status":"retrieving manifest"}
+```
+
+and then:
+
+```json
+{
+"status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+"total":1928429856
+}
+```
+
+Then there is a series of uploading responses:
+
+```json
+{
+"status":"starting upload",
+"digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+"total":1928429856}
+```
+
+Finally, when the upload is complete:
+
+```json
+{"status":"pushing manifest"}
+{"status":"success"}
+```
+
 ## Generate Embeddings

-```
+```shell
 POST /api/embeddings
 ```

@@ -244,7 +333,7 @@ Advanced parameters:

 ### Request

-```
+```shell
 curl -X POST http://localhost:11434/api/embeddings -d '{
  "model": "llama2:7b",
  "prompt": "Here is an article about llamas..."
@@ -259,5 +348,5 @@ curl -X POST http://localhost:11434/api/embeddings -d '{
    0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
    0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
  ]
-}
-```
+}
+```
--- a/docs/development.md
+++ b/docs/development.md
@@ -6,6 +6,10 @@

 Install required tools:

+- cmake version 3.24 or higher
+- go version 1.20 or higher
+- gcc version 11.4.0 or higher
+
 ```
 brew install go cmake gcc
 ```
@@ -27,3 +31,9 @@ Now you can run `ollama`:
 ```
 ./ollama
 ```
+
+## Building on Linux with GPU support
+
+- Install cmake and nvidia-cuda-toolkit
+- Run `go generate ./...`
+- Run `go build .`
--- a/go.mod
+++ b/go.mod
@@ -8,6 +8,7 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
 	github.com/olekukonko/tablewriter v0.0.5
+	github.com/pdevine/readline v1.5.2
 	github.com/spf13/cobra v1.7.0
 )

@@ -16,7 +17,6 @@ require github.com/rivo/uniseg v0.2.0 // indirect
 require (
 	github.com/bytedance/sonic v1.9.1 // indirect
 	github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
-	github.com/chzyer/readline v1.5.1
 	github.com/gabriel-vasile/mimetype v1.4.2 // indirect
 	github.com/gin-contrib/cors v1.4.0
 	github.com/gin-contrib/sse v0.1.0 // indirect
--- a/go.sum
+++ b/go.sum
@@ -6,8 +6,6 @@ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhD
 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
 github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM=
 github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ=
-github.com/chzyer/readline v1.5.1 h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=
-github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk=
 github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
 github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
@@ -80,6 +78,8 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
+github.com/pdevine/readline v1.5.2 h1:oz6Y5GdTmhPG+08hhxcAvtHitSANWuA2100Sppb38xI=
+github.com/pdevine/readline v1.5.2/go.mod h1:na/LbuE5PYwxI7GyopWdIs3U8HVe89lYlNTFTXH3wOw=
 github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
 github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
 github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
@@ -120,7 +120,6 @@ golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
 golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
-golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
 golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
--- a/llm/falcon.go
+++ b/llm/falcon.go
@@ -0,0 +1,27 @@
+package llm
+
+// ModelFamilyFalcon is the model family identifier for Falcon models.
+const ModelFamilyFalcon = "falcon"
+
+// Layer (block) counts that identify each known Falcon model size.
+const (
+	falconModelType7B   = 32
+	falconModelType40B  = 60
+	falconModelType180B = 80
+)
+
+// falconModelType maps a Falcon model's layer count to its parameter-size
+// label ("7B", "40B" or "180B"). Layer counts that match no known size
+// yield "Unknown".
+func falconModelType(numLayer uint32) string {
+	switch numLayer {
+	case falconModelType7B:
+		return "7B"
+	case falconModelType40B:
+		return "40B"
+	case falconModelType180B:
+		return "180B"
+	default:
+		return "Unknown"
+	}
+}
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -4,58 +4,79 @@ import (
 	"encoding/binary"
 	"errors"
 	"io"
-	"path"
-	"sync"
 )

-type ModelFamily string
-
-const ModelFamilyUnknown ModelFamily = "unknown"
-
-type ModelType uint32
-
-const (
-	ModelType3B  ModelType = 26
-	ModelType7B  ModelType = 32
-	ModelType13B ModelType = 40
-	ModelType34B ModelType = 48
-	ModelType30B ModelType = 60
-	ModelType65B ModelType = 80
-)
-
-func (mt ModelType) String() string {
-	switch mt {
-	case ModelType3B:
-		return "3B"
-	case ModelType7B:
-		return "7B"
-	case ModelType13B:
-		return "13B"
-	case ModelType34B:
-		return "34B"
-	case ModelType30B:
-		return "30B"
-	case ModelType65B:
-		return "65B"
-	default:
-		return "Unknown"
-	}
-}
-
-type FileType interface {
-	String() string
-}
-
 type GGML struct {
 	magic uint32
 	container
 	model
 }

+const (
+	fileTypeF32 uint32 = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ8_0 uint32 = iota + 2
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+)
+
+func fileType(fileType uint32) string {
+	switch fileType {
+	case fileTypeF32:
+		return "F32"
+	case fileTypeF16:
+		return "F16"
+	case fileTypeQ4_0:
+		return "Q4_0"
+	case fileTypeQ4_1:
+		return "Q4_1"
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
+		return "Q8_0"
+	case fileTypeQ5_0:
+		return "Q5_0"
+	case fileTypeQ5_1:
+		return "Q5_1"
+	case fileTypeQ2_K:
+		return "Q2_K"
+	case fileTypeQ3_K_S:
+		return "Q3_K_S"
+	case fileTypeQ3_K_M:
+		return "Q3_K_M"
+	case fileTypeQ3_K_L:
+		return "Q3_K_L"
+	case fileTypeQ4_K_S:
+		return "Q4_K_S"
+	case fileTypeQ4_K_M:
+		return "Q4_K_M"
+	case fileTypeQ5_K_S:
+		return "Q5_K_S"
+	case fileTypeQ5_K_M:
+		return "Q5_K_M"
+	case fileTypeQ6_K:
+		return "Q6_K"
+	default:
+		return "Unknown"
+	}
+}
+
 type model interface {
-	ModelFamily() ModelFamily
-	ModelType() ModelType
-	FileType() FileType
+	ModelFamily() string
+	ModelType() string
+	FileType() string
 }

 type container interface {
@@ -143,23 +164,6 @@ func (c *containerLORA) Decode(r io.Reader) (model, error) {
 	return nil, nil
 }

-var (
-	ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
-	ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
-)
-
-var (
-	ggmlInit       sync.Once
-	ggmlRunnerPath string
-)
-
-func ggmlRunner() ModelRunner {
-	ggmlInit.Do(func() {
-		ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU)
-	})
-	return ModelRunner{Path: ggmlRunnerPath}
-}
-
 const (
 	// Magic constant for `ggml` files (unversioned).
 	FILE_MAGIC_GGML = 0x67676d6c
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -6,9 +6,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"log"
-	"path"
-	"sync"
 )

 type containerGGUF struct {
@@ -87,38 +84,52 @@ func (llm *ggufModel) NumKV() uint64 {
 	return llm.V2.NumKV
 }

-func (llm *ggufModel) ModelFamily() ModelFamily {
+// ModelFamily returns the "general.architecture" key-value entry, or
+// "unknown" when the key is absent or its value is not a string.
+func (llm *ggufModel) ModelFamily() string {
 	t, ok := llm.kv["general.architecture"].(string)
 	if ok {
-		return ModelFamily(t)
+		return t
 	}

-	log.Printf("unknown model family: %T", t)
-	return ModelFamilyUnknown
+	return "unknown"
 }

-func (llm *ggufModel) ModelType() ModelType {
+// ModelType returns a parameter-size label derived from the model's key-value
+// entries, or "Unknown" when the family or its block count is not available.
+func (llm *ggufModel) ModelType() string {
 	switch llm.ModelFamily() {
-	case ModelFamilyLlama:
-		blocks, ok := llm.kv["llama.block_count"].(uint32)
-		if ok {
-			return ModelType(blocks)
+	case "llama":
+		if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
+			heads, headsOK := llm.kv["llama.head_count"].(uint32)
+			headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
+			// A head to KV-head ratio of 8 is reported as "70B". The zero
+			// check keeps a file with head_count_kv == 0 from causing an
+			// integer divide-by-zero panic.
+			if headsOK && headsKVsOK && headKVs != 0 && heads/headKVs == 8 {
+				return "70B"
+			}
+
+			return llamaModelType(blocks)
+		}
+	case "falcon":
+		if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
+			return falconModelType(blocks)
 		}
 	}

-	return ModelType7B
+	return "Unknown"
 }

-func (llm *ggufModel) FileType() FileType {
-	switch llm.ModelFamily() {
-	case ModelFamilyLlama:
-		t, ok := llm.kv["general.file_type"].(uint32)
-		if ok {
-			return llamaFileType(t)
-		}
+// FileType returns the label fileType assigns to the "general.file_type"
+// key-value entry, or "Unknown" when the key is absent or is not a uint32.
+func (llm *ggufModel) FileType() string {
+	t, ok := llm.kv["general.file_type"].(uint32)
+	if ok {
+		return fileType(t)
 	}

-	return llamaFileTypeF16
+	return "Unknown"
 }

 func (llm *ggufModel) Decode(r io.Reader) error {
@@ -365,21 +376,3 @@ func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {

 	return
 }
-
-var (
-	ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
-	ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
-)
-
-var (
-	ggufInit       sync.Once
-	ggufRunnerPath string
-)
-
-func ggufRunner() ModelRunner {
-	ggufInit.Do(func() {
-		ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU)
-	})
-
-	return ModelRunner{Path: ggufRunnerPath}
-}
--- a/llm/llama.cpp/generate.go
+++ b/llm/llama.cpp/generate.go
@@ -1,15 +0,0 @@
-//go:build !darwin
-// +build !darwin
-
-package llm
-
-//go:generate git submodule init
-//go:generate git submodule update --force ggml gguf
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build gguf/build/cpu --target server --config Release
--- a/llm/llama.cpp/generate_darwin_amd64.go
+++ b/llm/llama.cpp/generate_darwin_amd64.go
@@ -1,12 +1,16 @@
 package llm

 //go:generate git submodule init
-//go:generate git submodule update --force ggml gguf
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+
+//go:generate git submodule update --force ggml
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate cmake --fresh -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+
+//go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build gguf/build/cpu --target server --config Release
--- a/llm/llama.cpp/generate_darwin_arm64.go
+++ b/llm/llama.cpp/generate_darwin_arm64.go
@@ -1,12 +1,16 @@
 package llm

 //go:generate git submodule init
-//go:generate git submodule update --force ggml gguf
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/gpu --target server --config Release
-//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build gguf/build/gpu --target server --config Release
+
+//go:generate git submodule update --force ggml
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+//go:generate cmake --build ggml/build/metal --target server --config Release
+
+//go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
+//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+//go:generate cmake --build gguf/build/metal --target server --config Release
--- a/llm/llama.cpp/generate_linux.go
+++ b/llm/llama.cpp/generate_linux.go
@@ -0,0 +1,22 @@
+package llm
+
+//go:generate git submodule init
+
+//go:generate git submodule update --force ggml
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cpu --target server --config Release
+
+//go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
+//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cpu --target server --config Release
+
+//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda --target server --config Release
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cuda --target server --config Release
--- a/llm/llama.cpp/generate_windows.go
+++ b/llm/llama.cpp/generate_windows.go
@@ -0,0 +1,14 @@
+package llm
+
+//go:generate git submodule init
+
+//go:generate git submodule update --force ggml
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cpu --target server --config Release
+
+//go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cpu --target server --config Release
--- a/llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch
+++ b/llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch
@@ -1,32 +0,0 @@
-From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
-From: Bruce MacDonald <brucewmacdonald@gmail.com>
-Date: Tue, 5 Sep 2023 16:05:08 -0400
-Subject: [PATCH] metal: add missing barriers for mul-mat #2699
-
---
- ggml-metal.metal | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/ggml-metal.metal b/ggml-metal.metal
-index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
-+++ b/ggml-metal.metal
-@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const  uchar * src0,
-         //load data and store to threadgroup memory
-         half4x4 temp_a;
-         dequantize_func(x, il, temp_a);
-+        threadgroup_barrier(mem_flags::mem_threadgroup);
-         #pragma unroll(16)
-         for (int i = 0; i < 16; i++) {
-             *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const  uchar * src0,
-         }
-     } else {
-         // block is smaller than 64x32, we should avoid writing data outside of the matrix
-+        threadgroup_barrier(mem_flags::mem_threadgroup);
-         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
-                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-         for (int i = 0; i < 8; i++) {
-- 
-2.39.2 (Apple Git-143)
-
--- a/llm/llama.cpp/gguf
+++ b/llm/llama.cpp/gguf
--- a/llm/llama.cpp/ggml_patch/0001-add-detokenize-endpoint.patch
+++ b/llm/llama.cpp/ggml_patch/0001-add-detokenize-endpoint.patch
--- a/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
+++ b/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
@@ -0,0 +1,27 @@
+From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Wed, 20 Sep 2023 14:19:52 -0700
+Subject: [PATCH] copy cuda runtime libraries
+
+---
+ CMakeLists.txt | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 824d9f2..dd24137 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
+             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+         endif()
+ 
++        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
++        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
++        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
++
+     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+         # 52 == lowest CUDA 12 standard
+         # 60 == f16 CUDA intrinsics
+-- 
+2.42.0
+
--- a/llm/llama.cpp/patches/0001-remove-warm-up-logging.patch
+++ b/llm/llama.cpp/patches/0001-remove-warm-up-logging.patch
@@ -0,0 +1,25 @@
+From 07993bdc35345b67b27aa649a7c099ad42d80c4c Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Thu, 21 Sep 2023 14:43:21 -0700
+Subject: [PATCH] remove warm up logging
+
+---
+ common/common.cpp | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/common/common.cpp b/common/common.cpp
+index 2597ba0..b56549b 100644
+--- a/common/common.cpp
+++ b/common/common.cpp
+@@ -780,8 +780,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+     }
+ 
+     {
+-        LOG("warming up the model with an empty run\n");
+-
+         const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+         llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
+         llama_reset_timings(lctx);
+-- 
+2.42.0
+
--- a/llm/llama.cpp/ggml_patch/0002-34B-model-support.patch
+++ b/llm/llama.cpp/ggml_patch/0002-34B-model-support.patch
--- a/llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+++ b/llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
--- a/llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+++ b/llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
--- a/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+++ b/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
@@ -0,0 +1,32 @@
+From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
+From: Kylin <56434533+KyL0N@users.noreply.github.com>
+Date: Tue, 22 Aug 2023 15:14:23 +0800
+Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
+
+* ggml: support CUDA's half type for aarch64(#1455)
+support CUDA's half type for aarch64 in ggml_fp16_t definition
+
+* ggml: use __CUDACC__ to recognise nvcc compiler
+---
+ ggml.h | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/ggml.h b/ggml.h
+index 544ad2d..0ec7ec5 100644
+--- a/ggml.h
+++ b/ggml.h
+@@ -259,8 +259,9 @@
+ extern "C" {
+ #endif
+ 
+-#ifdef __ARM_NEON
+-    // we use the built-in 16-bit float type
++#if defined(__ARM_NEON) && defined(__CUDACC__)
++    typedef half ggml_fp16_t;
++#elif defined(__ARM_NEON)
+     typedef __fp16 ggml_fp16_t;
+ #else
+     typedef uint16_t ggml_fp16_t;
+-- 
+2.39.2 (Apple Git-143)
+
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -28,99 +28,128 @@ import (
 //go:embed llama.cpp/*/build/*/bin/*
 var llamaCppEmbed embed.FS

-func osPath(llamaPath string) string {
-	if runtime.GOOS == "windows" {
-		return path.Join(llamaPath, "Release")
-	}
-
-	return llamaPath
+type ModelRunner struct {
+	Path string // path to the model runner executable
 }

-func chooseRunner(gpuPath, cpuPath string) string {
-	tmpDir, err := os.MkdirTemp("", "llama-*")
-	if err != nil {
-		log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
-	}
+func chooseRunners(workDir, runnerType string) []ModelRunner {
+	buildPath := path.Join("llama.cpp", runnerType, "build")
+	var runners []string

-	llamaPath := osPath(gpuPath)
-	if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-		llamaPath = osPath(cpuPath)
-		if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-			log.Fatalf("llama.cpp executable not found")
-		}
-	}
-
-	files := []string{"server"}
+	// set the runners based on the OS
+	// IMPORTANT: the order of the runners in the array is the priority order
 	switch runtime.GOOS {
-	case "windows":
-		files = []string{"server.exe"}
 	case "darwin":
-		if llamaPath == osPath(gpuPath) {
-			files = append(files, "ggml-metal.metal")
+		runners = []string{
+			path.Join(buildPath, "metal", "bin", "server"),
+			path.Join(buildPath, "cpu", "bin", "server"),
+		}
+	case "linux":
+		runners = []string{
+			path.Join(buildPath, "cuda", "bin", "server"),
+			path.Join(buildPath, "cpu", "bin", "server"),
+		}
+	case "windows":
+		// TODO: select windows GPU runner here when available
+		runners = []string{
+			path.Join(buildPath, "cpu", "bin", "Release", "server.exe"),
+		}
+	default:
+		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
+		runners = []string{
+			path.Join(buildPath, "cpu", "bin", "server"),
 		}
 	}

-	for _, f := range files {
-		srcPath := path.Join(llamaPath, f)
-		destPath := filepath.Join(tmpDir, f)
-
-		srcFile, err := llamaCppEmbed.Open(srcPath)
+	runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
+	for _, r := range runners {
+		// find all the files in the runner's bin directory; embed paths always use "/", so build the pattern with path, not filepath
+		files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r), "*"))
 		if err != nil {
-			log.Fatalf("read llama.cpp %s: %v", f, err)
+			// this is expected, ollama may be compiled without all runners packed in
+			log.Printf("%s runner not found: %v", r, err)
+			continue
 		}
-		defer srcFile.Close()
+		runnerAvailable = runnerAvailable || len(files) > 0

-		destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-		if err != nil {
-			log.Fatalf("write llama.cpp %s: %v", f, err)
-		}
-		defer destFile.Close()
+		for _, f := range files {
+			srcFile, err := llamaCppEmbed.Open(f)
+			if err != nil {
+				log.Fatalf("read llama runner %s: %v", f, err)
+			}
+			defer srcFile.Close()

-		if _, err := io.Copy(destFile, srcFile); err != nil {
-			log.Fatalf("copy llama.cpp %s: %v", f, err)
+			// create the directory in case it does not exist
+			destPath := filepath.Join(workDir, filepath.Dir(f))
+			if err := os.MkdirAll(destPath, 0o755); err != nil {
+				log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
+			}
+
+			destFile := filepath.Join(destPath, filepath.Base(f))
+
+			_, err = os.Stat(destFile)
+			switch {
+			case errors.Is(err, os.ErrNotExist):
+				destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+				if err != nil {
+					log.Fatalf("write llama runner %s: %v", f, err)
+				}
+				defer destFile.Close()
+
+				if _, err := io.Copy(destFile, srcFile); err != nil {
+					log.Fatalf("copy llama runner %s: %v", f, err)
+				}
+			case err != nil:
+				log.Fatalf("stat llama runner %s: %v", f, err)
+			}
 		}
 	}
-
-	runPath := filepath.Join(tmpDir, "server")
-	if runtime.GOOS == "windows" {
-		runPath = filepath.Join(tmpDir, "server.exe")
+	if !runnerAvailable {
+		log.Fatalf("%s runner not found", runnerType)
 	}

-	return runPath
+	// return the runners to try in priority order
+	localRunnersByPriority := []ModelRunner{}
+	for _, r := range runners {
+		localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: path.Join(workDir, r)})
+	}
+
+	return localRunnersByPriority
 }

-const ModelFamilyLlama ModelFamily = "llama"
-
 type llamaModel struct {
 	hyperparameters llamaHyperparameters
 }

-func (llm *llamaModel) ModelFamily() ModelFamily {
-	return ModelFamilyLlama
+func (llm *llamaModel) ModelFamily() string {
+	return "llama"
 }

-func (llm *llamaModel) ModelType() ModelType {
-	switch llm.hyperparameters.NumLayer {
+func llamaModelType(numLayer uint32) string {
+	switch numLayer {
 	case 26:
-		return ModelType3B
+		return "3B"
 	case 32:
-		return ModelType7B
+		return "7B"
 	case 40:
-		return ModelType13B
+		return "13B"
 	case 48:
-		return ModelType34B
+		return "34B"
 	case 60:
-		return ModelType30B
+		return "30B"
 	case 80:
-		return ModelType65B
+		return "65B"
+	default:
+		return "Unknown"
 	}
-
-	// TODO: find a better default
-	return ModelType7B
 }

-func (llm *llamaModel) FileType() FileType {
-	return llm.hyperparameters.FileType
+func (llm *llamaModel) ModelType() string {
+	return llamaModelType(llm.hyperparameters.NumLayer)
+}
+
+func (llm *llamaModel) FileType() string {
+	return fileType(llm.hyperparameters.FileType)
 }

 type llamaHyperparameters struct {
@@ -137,70 +166,7 @@ type llamaHyperparameters struct {
 	NumRot   uint32

 	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
-	FileType llamaFileType
-}
-
-type llamaFileType uint32
-
-const (
-	llamaFileTypeF32 llamaFileType = iota
-	llamaFileTypeF16
-	llamaFileTypeQ4_0
-	llamaFileTypeQ4_1
-	llamaFileTypeQ4_1_F16
-	llamaFileTypeQ8_0 llamaFileType = iota + 2
-	llamaFileTypeQ5_0
-	llamaFileTypeQ5_1
-	llamaFileTypeQ2_K
-	llamaFileTypeQ3_K_S
-	llamaFileTypeQ3_K_M
-	llamaFileTypeQ3_K_L
-	llamaFileTypeQ4_K_S
-	llamaFileTypeQ4_K_M
-	llamaFileTypeQ5_K_S
-	llamaFileTypeQ5_K_M
-	llamaFileTypeQ6_K
-)
-
-func (ft llamaFileType) String() string {
-	switch ft {
-	case llamaFileTypeF32:
-		return "F32"
-	case llamaFileTypeF16:
-		return "F16"
-	case llamaFileTypeQ4_0:
-		return "Q4_0"
-	case llamaFileTypeQ4_1:
-		return "Q4_1"
-	case llamaFileTypeQ4_1_F16:
-		return "Q4_1_F16"
-	case llamaFileTypeQ8_0:
-		return "Q8_0"
-	case llamaFileTypeQ5_0:
-		return "Q5_0"
-	case llamaFileTypeQ5_1:
-		return "Q5_1"
-	case llamaFileTypeQ2_K:
-		return "Q2_K"
-	case llamaFileTypeQ3_K_S:
-		return "Q3_K_S"
-	case llamaFileTypeQ3_K_M:
-		return "Q3_K_M"
-	case llamaFileTypeQ3_K_L:
-		return "Q3_K_L"
-	case llamaFileTypeQ4_K_S:
-		return "Q4_K_S"
-	case llamaFileTypeQ4_K_M:
-		return "Q4_K_M"
-	case llamaFileTypeQ5_K_S:
-		return "Q5_K_S"
-	case llamaFileTypeQ5_K_M:
-		return "Q5_K_M"
-	case llamaFileTypeQ6_K:
-		return "Q6_K"
-	default:
-		return "Unknown"
-	}
+	FileType uint32
 }

 type Running struct {
@@ -209,21 +175,79 @@ type Running struct {
 	Cancel context.CancelFunc
 }

-type ModelRunner struct {
-	Path string // path to the model runner executable
-}
-
+// llama is the handle for one started llama runner: the api.Options it was
+// launched with plus the Running bookkeeping for the runner process.
 type llama struct {
 	api.Options
 	Running
 }

-func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
+var errNoGPU = errors.New("nvidia-smi command failed")
+
+// CheckVRAM returns the total VRAM in MiB, summed across every GPU that
+// nvidia-smi reports. It queries memory.total, so the figure is the installed
+// memory rather than the memory that is currently free.
+//
+// It returns errNoGPU when the nvidia-smi command cannot be run successfully,
+// and a descriptive error when the output cannot be read or is not one integer
+// per line.
+func CheckVRAM() (int, error) {
+	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
+	var stdout bytes.Buffer
+	cmd.Stdout = &stdout
+	if err := cmd.Run(); err != nil {
+		return 0, errNoGPU
+	}
+
+	var total int
+	scanner := bufio.NewScanner(&stdout)
+	for scanner.Scan() {
+		// each output line is added to the running total
+		vram, err := strconv.Atoi(scanner.Text())
+		if err != nil {
+			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
+		}
+
+		total += vram
+	}
+
+	// a scan failure ends the loop early; report it instead of returning a partial sum
+	if err := scanner.Err(); err != nil {
+		return 0, fmt.Errorf("failed to read nvidia-smi output: %v", err)
+	}
+
+	return total, nil
+}
+
+// NumGPU returns the number of model layers to offload to the GPU.
+//
+// An explicit opts.NumGPU (any value other than -1) is returned unchanged.
+// Otherwise the default is 1, and on Linux the count is chosen from the total
+// VRAM reported by CheckVRAM, falling back to 0 when CheckVRAM fails.
+func NumGPU(opts api.Options) int {
+	if opts.NumGPU != -1 {
+		return opts.NumGPU
+	}
+	n := 1 // default to enable metal on macOS
+	if runtime.GOOS == "linux" {
+		vram, err := CheckVRAM()
+		if err != nil {
+			// errNoGPU means the nvidia driver is not installed or no nvidia GPU
+			// was found; that is not worth logging, any other failure is
+			if !errors.Is(err, errNoGPU) {
+				log.Print(err.Error())
+			}
+			return 0
+		}
+		// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
+		switch {
+		case vram < 500:
+			log.Printf("WARNING: Low VRAM detected, disabling GPU")
+			n = 0
+		case vram < 1000:
+			n = 4
+		case vram < 2000:
+			n = 8
+		case vram < 4000:
+			n = 12
+		case vram < 8000:
+			n = 16
+		case vram < 12000:
+			n = 24
+		case vram < 16000:
+			n = 32
+		default:
+			n = 48
+		}
+		log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
+	}
+	return n
+}
+
+func newLlama(model string, adapters []string, runners []ModelRunner, opts api.Options) (*llama, error) {
+	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}

@@ -237,7 +261,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
 		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
+		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
 		"--embedding",
 	}

@@ -268,7 +292,12 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 	}

 	// start the llama.cpp server with a retry in case the port is already in use
-	for try := 0; try < 3; try++ {
+	for _, runner := range runners {
+		if _, err := os.Stat(runner.Path); err != nil {
+			log.Printf("llama runner not found: %v", err)
+			continue
+		}
+
 		port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
 		ctx, cancel := context.WithCancel(context.Background())
 		cmd := exec.CommandContext(
@@ -276,20 +305,30 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 			runner.Path,
 			append(params, "--port", strconv.Itoa(port))...,
 		)
-
+		cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
 		cmd.Stdout = os.Stderr
 		cmd.Stderr = os.Stderr

 		llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}

-		log.Print("starting llama.cpp server")
+		log.Print("starting llama runner")
 		if err := llm.Cmd.Start(); err != nil {
-			log.Printf("error starting the external llama.cpp server: %v", err)
+			log.Printf("error starting the external llama runner: %v", err)
 			continue
 		}

+		// monitor the command, it is blocking, so if it exits we need to capture that
+		go func() {
+			err := llm.Cmd.Wait() // this will block until the command exits
+			if err != nil {
+				log.Printf("llama runner exited with error: %v", err)
+			} else {
+				log.Printf("llama runner exited")
+			}
+		}()
+
 		if err := waitForServer(llm); err != nil {
-			log.Printf("error starting llama.cpp server: %v", err)
+			log.Printf("error starting llama runner: %v", err)
 			llm.Close()
 			// try again
 			continue
@@ -299,19 +338,24 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 		return llm, nil
 	}

-	return nil, fmt.Errorf("max retry exceeded starting llama.cpp")
+	return nil, fmt.Errorf("failed to start a llama runner")
 }

 func waitForServer(llm *llama) error {
 	// wait for the server to start responding
 	start := time.Now()
-	expiresAt := time.Now().Add(30 * time.Second)
+	expiresAt := time.Now().Add(2 * time.Minute) // be generous with timeout, large models can take a while to load
 	ticker := time.NewTicker(200 * time.Millisecond)

-	log.Print("waiting for llama.cpp server to start responding")
+	log.Print("waiting for llama runner to start responding")
 	for range ticker.C {
 		if time.Now().After(expiresAt) {
-			return fmt.Errorf("llama.cpp server did not start within alloted time, retrying")
+			return fmt.Errorf("llama runner did not start within alloted time, retrying")
+		}
+
+		// check if the server process has terminated
+		if llm.Cmd.ProcessState != nil && llm.Cmd.ProcessState.Exited() {
+			return fmt.Errorf("llama runner process has terminated")
 		}

 		if err := llm.Ping(context.Background()); err == nil {
@@ -319,15 +363,12 @@ func waitForServer(llm *llama) error {
 		}
 	}

-	log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
+	log.Printf("llama runner started in %f seconds", time.Since(start).Seconds())
 	return nil
 }

+// Close calls the cancel func stored in Running (presumably cancelling the
+// context the runner command was started with - confirm in newLlama). It does
+// not wait for the process itself: the monitoring goroutine started in newLlama
+// calls Cmd.Wait and logs how the runner exited.
 func (llm *llama) Close() {
 	llm.Cancel()
-	if err := llm.Cmd.Wait(); err != nil {
-		log.Printf("llama.cpp server exited with error: %v", err)
-	}
 }

 func (llm *llama) SetOptions(opts api.Options) {
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -21,7 +21,7 @@ type LLM interface {
 	Ping(context.Context) error
 }

-func New(model string, adapters []string, opts api.Options) (LLM, error) {
+func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -37,7 +37,7 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
 		return nil, err
 	}

-	switch ggml.FileType().String() {
+	switch ggml.FileType() {
 	case "Q8_0":
 		if ggml.Name() != "gguf" && opts.NumGPU != 0 {
 			// GGML Q8_0 do not support Metal API and will
@@ -56,38 +56,44 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {

 	totalResidentMemory := memory.TotalMemory()
 	switch ggml.ModelType() {
-	case ModelType3B, ModelType7B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 {
+	case "3B", "7B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
 		} else if totalResidentMemory < 8*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 8GB of memory")
 		}
-	case ModelType13B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 {
+	case "13B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
 		} else if totalResidentMemory < 16*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 16GB of memory")
 		}
-	case ModelType30B, ModelType34B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 {
+	case "30B", "34B", "40B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
 		} else if totalResidentMemory < 32*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 32GB of memory")
 		}
-	case ModelType65B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 {
+	case "65B", "70B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
 		} else if totalResidentMemory < 64*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 64GB of memory")
 		}
+	case "180B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
+			return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
+		} else if totalResidentMemory < 128*1024*1024 {
+			return nil, fmt.Errorf("model requires at least 128GB of memory")
+		}
 	}

 	switch ggml.Name() {
 	case "gguf":
 		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
-		return newLlama(model, adapters, ggufRunner(), opts)
+		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), opts)
 	case "ggml", "ggmf", "ggjt", "ggla":
-		return newLlama(model, adapters, ggmlRunner(), opts)
+		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), opts)
 	default:
 		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
 	}
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Builds the Linux ollama binary for arm64 and amd64 with Dockerfile.build and
+# copies each result to ./dist/ollama-linux-<arch>.
+# Run it from the repository root: the build context and dist/ are relative paths.
+
+set -e
+
+mkdir -p dist
+
+# `set -e` aborts the script as soon as a step fails, which would otherwise skip
+# the `docker rm` below and leave a "builder" container behind; that leftover
+# makes every later `docker create --name builder` fail with a name conflict.
+trap 'docker rm -f builder >/dev/null 2>&1 || true' EXIT
+
+for ARCH in arm64 amd64; do
+    docker buildx build --platform="linux/$ARCH" -f Dockerfile.build . -t "builder:$ARCH" --load
+    # a created (never started) container is enough to copy the binary out of the image
+    docker create --platform "linux/$ARCH" --name builder "builder:$ARCH"
+    docker cp builder:/go/src/github.com/jmorganca/ollama/ollama "./dist/ollama-linux-$ARCH"
+    docker rm builder
+done
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -0,0 +1,160 @@
+#!/bin/sh
+# This script installs Ollama on Linux.
+# It detects the current operating system architecture and installs the appropriate version of Ollama.
+
+set -eu
+
+# check_os lets the install continue only when the kernel name reported by
+# uname is Linux; otherwise it prints a message and exits with status 1.
+check_os() {
+    case "$(uname -s)" in
+        Linux) return 0 ;;
+    esac
+
+    echo "This script is intended to run on Linux only."
+    exit 1
+}
+
+# determine_architecture stores the machine name from uname in ARCH and maps it
+# to the suffix used in the download file name (ARCH_SUFFIX): x86_64 becomes
+# amd64, aarch64 and arm64 become arm64. Any other machine exits with status 1.
+determine_architecture() {
+    ARCH=$(uname -m)
+    if [ "$ARCH" = "x86_64" ]; then
+        ARCH_SUFFIX="amd64"
+    elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
+        ARCH_SUFFIX="arm64"
+    else
+        echo "Unsupported architecture: $ARCH"
+        exit 1
+    fi
+}
+
+# check_sudo sets SUDO_CMD, the prefix used for every privileged command:
+# empty when already running as root, "sudo" otherwise. It exits with status 1
+# when the script is not root and sudo is not installed.
+check_sudo() {
+    # "not (uid != 0)": the running user is root, so no prefix is needed
+    if ! [ "$(id -u)" -ne 0 ]; then
+        SUDO_CMD=""
+        return 0
+    fi
+
+    if ! command -v sudo >/dev/null 2>&1; then
+        echo "Error: sudo is not available. Please run as root or install sudo."
+        exit 1
+    fi
+
+    SUDO_CMD="sudo"
+    echo "Downloading the ollama executable to the PATH, this will require sudo permissions."
+}
+
+# install_cuda_drivers installs the NVIDIA CUDA driver packages through the
+# distribution's package manager, prefixing privileged steps with $SUDO_CMD.
+# Only CentOS and Ubuntu 20.04/22.04 are automated; every other distribution
+# just gets a message. Skipping the driver install is not an error, so each
+# skip path returns success and the rest of the install can finish.
+install_cuda_drivers() {
+    local os_name os_version
+    if [ -f "/etc/os-release" ]; then
+        . /etc/os-release
+        # ID and VERSION_ID are optional in os-release; default them so that
+        # `set -u` does not abort the script on distributions that omit one
+        os_name=${ID:-}
+        os_version=${VERSION_ID:-}
+    else
+        echo "Unable to detect operating system. Skipping CUDA installation."
+        # a non-zero status here would terminate the whole script under `set -e`
+        return 0
+    fi
+
+    # based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#package-manager-installation
+    # os-release IDs are lowercase; the capitalised spellings are kept as fallbacks
+    case $os_name in
+        centos|CentOS)
+            # -y everywhere: the installer must not stop at a confirmation prompt
+            $SUDO_CMD yum install -y yum-utils
+            $SUDO_CMD yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
+            $SUDO_CMD yum clean all
+            $SUDO_CMD yum -y install nvidia-driver-latest-dkms
+            $SUDO_CMD yum -y install cuda-driver
+            $SUDO_CMD yum install -y "kernel-devel-$(uname -r)" "kernel-headers-$(uname -r)"
+            $SUDO_CMD dkms status | awk -F: '/added/ { print $1 }' | xargs -n1 $SUDO_CMD dkms install
+            $SUDO_CMD modprobe nvidia
+            ;;
+        ubuntu)
+            case $os_version in
+                20.04)
+                    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
+                ;;
+                22.04)
+                    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+                ;;
+                *)
+                    echo "Skipping automatic CUDA installation, not supported for Ubuntu ($os_version)."
+                    return
+                ;;
+            esac
+            $SUDO_CMD dpkg -i cuda-keyring_1.1-1_all.deb
+            $SUDO_CMD apt-get update
+            $SUDO_CMD apt-get -y install cuda-drivers
+            ;;
+        rhel|kylin|fedora|sles|opensuse*|debian|RedHatEnterprise*|Kylin|Fedora|SLES|openSUSE*|Microsoft|Debian)
+            echo "NVIDIA CUDA drivers may not be installed, you can install them from: https://developer.nvidia.com/cuda-downloads"
+            ;;
+        *)
+            echo "Unsupported or unknown distribution, skipping GPU CUDA driver install: $os_name"
+            ;;
+    esac
+}
+
+# check_install_cuda_drivers installs the CUDA drivers only when they appear to
+# be needed: an NVIDIA PCI device is present (vendor id 10de) and nvidia-smi is
+# either missing or does not report a CUDA version. In every other case it only
+# prints what it found.
+check_install_cuda_drivers() {
+    if lspci -d '10de:' | grep 'NVIDIA' >/dev/null; then
+        # NVIDIA Corporation [10de] device is available
+        if command -v nvidia-smi >/dev/null 2>&1; then
+            # grep exits non-zero when nothing matches; without `|| true` the
+            # assignment would abort the script under `set -e` and the empty
+            # check below could never run
+            CUDA_VERSION=$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*" || true)
+            if [ -z "$CUDA_VERSION" ]; then
+                echo "Warning: NVIDIA-SMI is available, but the CUDA version cannot be detected. Installing CUDA drivers..."
+                install_cuda_drivers
+            else
+                echo "Detected CUDA version $CUDA_VERSION"
+            fi
+        else
+            echo "Warning: NVIDIA GPU detected but NVIDIA-SMI is not available. Installing CUDA drivers..."
+            install_cuda_drivers
+        fi
+    else
+        echo "No NVIDIA GPU detected. Skipping driver installation."
+    fi
+}
+
+# download_ollama fetches the ollama binary for the detected architecture
+# ($ARCH_SUFFIX) to /usr/bin/ollama and marks it executable, using $SUDO_CMD
+# for the privileged steps.
+download_ollama() {
+    $SUDO_CMD mkdir -p /usr/bin
+    $SUDO_CMD curl -fsSL -o /usr/bin/ollama "https://ollama.ai/download/latest/ollama-linux-$ARCH_SUFFIX"
+    # curl creates the file without the execute bit, so set it explicitly;
+    # otherwise neither the shell nor the systemd unit can run the binary
+    $SUDO_CMD chmod +x /usr/bin/ollama
+}
+
+# configure_systemd registers ollama as a systemd service running as a
+# dedicated "ollama" user. The unit file is always written when systemctl
+# exists; the service is only enabled and restarted when systemd reports the
+# system state as "running". Without systemctl it prints how to start the
+# server by hand.
+configure_systemd() {
+    if command -v systemctl >/dev/null 2>&1; then
+        # create the service account only when it is missing: useradd exits
+        # non-zero for an existing user, which would abort a re-run of this
+        # script under `set -e`
+        if ! id ollama >/dev/null 2>&1; then
+            $SUDO_CMD useradd -r -s /bin/false -m -d /home/ollama ollama
+        fi
+
+        echo "Creating systemd service file for ollama..."
+        cat <<EOF | $SUDO_CMD tee /etc/systemd/system/ollama.service >/dev/null
+[Unit]
+Description=Ollama Service
+After=network-online.target
+
+[Service]
+ExecStart=/usr/bin/ollama serve
+User=ollama
+Group=ollama
+Restart=always
+RestartSec=3
+Environment="HOME=/home/ollama"
+
+[Install]
+WantedBy=default.target
+EOF
+        echo "Reloading systemd and enabling ollama service..."
+        # is-system-running exits non-zero for any state other than "running";
+        # the `|| echo` keeps that from aborting the script under `set -e`
+        if [ "$(systemctl is-system-running || echo 'not running')" = 'running' ]; then
+            $SUDO_CMD systemctl daemon-reload
+            $SUDO_CMD systemctl enable ollama
+            $SUDO_CMD systemctl restart ollama
+        fi
+    else
+        echo "Run 'ollama serve' from the command line to start the service."
+    fi
+}
+
+# main runs the install steps in order: verify the OS, pick the architecture
+# suffix, decide whether sudo is needed, download the binary, set up the
+# systemd service, and finally check for (and possibly install) CUDA drivers.
+main() {
+    check_os
+    determine_architecture
+    check_sudo
+    download_ollama
+    configure_systemd
+    check_install_cuda_drivers
+    echo "Installation complete. You can now run 'ollama' from the command line."
+}
+
+main
--- a/server/auth.go
+++ b/server/auth.go
@@ -14,7 +14,7 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"path"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
@@ -71,7 +71,7 @@ func (r AuthRedirect) URL() (*url.URL, error) {
 	return redirectURL, nil
 }

-func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *RegistryOptions) (string, error) {
+func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
 	redirectURL, err := redirData.URL()
 	if err != nil {
 		return "", err
@@ -82,7 +82,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
 		return "", err
 	}

-	keyPath := path.Join(home, ".ollama", "id_ed25519")
+	keyPath := filepath.Join(home, ".ollama", "id_ed25519")

 	rawKey, err := os.ReadFile(keyPath)
 	if err != nil {
--- a/server/download.go
+++ b/server/download.go
@@ -8,7 +8,7 @@ import (
 	"log"
 	"net/http"
 	"os"
-	"path"
+	"path/filepath"
 	"strconv"
 	"sync"
 	"time"
@@ -173,7 +173,7 @@ func doDownload(ctx context.Context, opts downloadOpts, f *FileDownload) error {
 		return fmt.Errorf("%w: on download registry responded with code %d: %v", errDownload, resp.StatusCode, string(body))
 	}

-	err = os.MkdirAll(path.Dir(f.FilePath), 0o700)
+	err = os.MkdirAll(filepath.Dir(f.FilePath), 0o700)
 	if err != nil {
 		return fmt.Errorf("make blobs directory: %w", err)
 	}
--- a/server/images.go
+++ b/server/images.go
@@ -14,7 +14,6 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"path"
 	"path/filepath"
 	"reflect"
 	"runtime"
@@ -114,11 +113,11 @@ type LayerReader struct {
 }

 type ConfigV2 struct {
-	ModelFamily llm.ModelFamily `json:"model_family"`
-	ModelType   string          `json:"model_type"`
-	ModelFormat string          `json:"model_format"`
-	FileType    string          `json:"file_type"`
-	RootFS      RootFS          `json:"rootfs"`
+	ModelFormat string `json:"model_format"`
+	ModelFamily string `json:"model_family"`
+	ModelType   string `json:"model_type"`
+	FileType    string `json:"file_type"`
+	RootFS      RootFS `json:"rootfs"`

 	// required by spec
 	Architecture string `json:"architecture"`
@@ -268,7 +267,7 @@ func filenameWithPath(path, f string) (string, error) {
 	return f, nil
 }

-func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error {
+func CreateModel(ctx context.Context, workDir, name string, path string, fn func(resp api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

 	var manifest *ManifestV2
@@ -357,10 +356,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 						return err
 					}

-					config.ModelFamily = ggml.ModelFamily()
-					config.ModelType = ggml.ModelType().String()
 					config.ModelFormat = ggml.Name()
-					config.FileType = ggml.FileType().String()
+					config.ModelFamily = ggml.ModelFamily()
+					config.ModelType = ggml.ModelType()
+					config.FileType = ggml.FileType()

 					// reset the file
 					file.Seek(0, io.SeekStart)
@@ -391,7 +390,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 					return err
 				}

-				// copie the model metadata
+				// copy the model metadata
 				config.ModelFamily = source.ModelFamily
 				config.ModelType = source.ModelType
 				config.ModelFormat = source.ModelFormat
@@ -461,8 +460,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 				return err
 			}

-			layer.MediaType = mediaType
-			layers = append(layers, layer)
+			if layer.Size > 0 {
+				layer.MediaType = mediaType
+				layers = append(layers, layer)
+			}
 		case "template", "system", "prompt":
 			fn(api.ProgressResponse{Status: fmt.Sprintf("creating model %s layer", c.Name)})
 			// remove the layer if one exists
@@ -474,8 +475,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 				return err
 			}

-			layer.MediaType = mediaType
-			layers = append(layers, layer)
+			if layer.Size > 0 {
+				layer.MediaType = mediaType
+				layers = append(layers, layer)
+			}
 		default:
 			// runtime parameters, build a list of args for each parameter to allow multiple values to be specified (ex: multiple stop sequences)
 			params[c.Name] = append(params[c.Name], c.Args)
@@ -498,6 +501,12 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 			}
 		}

+		if config.ModelType == "65B" {
+			if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
+				config.ModelType = "70B"
+			}
+		}
+
 		bts, err := json.Marshal(formattedParams)
 		if err != nil {
 			return err
@@ -515,7 +524,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 	}

 	// generate the embedding layers
-	embeddingLayers, err := embeddingLayers(embed)
+	embeddingLayers, err := embeddingLayers(workDir, embed)
 	if err != nil {
 		return err
 	}
@@ -572,7 +581,7 @@ type EmbeddingParams struct {
 }

 // embeddingLayers loads the associated LLM and generates the embeddings to be stored from an input file
-func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
+func embeddingLayers(workDir string, e EmbeddingParams) ([]*LayerReader, error) {
 	layers := []*LayerReader{}
 	if len(e.files) > 0 {
 		// check if the model is a file path or a model name
@@ -585,7 +594,7 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
 			model = &Model{ModelPath: e.model}
 		}

-		if err := load(context.Background(), model, e.opts, defaultSessionDuration); err != nil {
+		if err := load(context.Background(), workDir, model, e.opts, defaultSessionDuration); err != nil {
 			return nil, fmt.Errorf("load model to generate embeddings: %v", err)
 		}

@@ -815,14 +824,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
 						return nil, fmt.Errorf("invalid float value %s", vals)
 					}

-					out[key] = floatVal
+					out[key] = float32(floatVal)
 				case reflect.Int:
-					intVal, err := strconv.ParseInt(vals[0], 10, 0)
+					intVal, err := strconv.ParseInt(vals[0], 10, 64)
 					if err != nil {
 						return nil, fmt.Errorf("invalid int value %s", vals)
 					}

-					out[key] = intVal
+					out[key] = int(intVal)
 				case reflect.Bool:
 					boolVal, err := strconv.ParseBool(vals[0])
 					if err != nil {
@@ -1145,14 +1154,14 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
 			Total:  layer.Size,
 		})

-		location, err := startUpload(ctx, mp, layer, regOpts)
+		location, chunkSize, err := startUpload(ctx, mp, layer, regOpts)
 		if err != nil {
 			log.Printf("couldn't start upload: %v", err)
 			return err
 		}

-		if strings.HasPrefix(path.Base(location.Path), "sha256:") {
-			layer.Digest = path.Base(location.Path)
+		if strings.HasPrefix(filepath.Base(location.Path), "sha256:") {
+			layer.Digest = filepath.Base(location.Path)
 			fn(api.ProgressResponse{
 				Status:    "using existing layer",
 				Digest:    layer.Digest,
@@ -1162,7 +1171,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
 			continue
 		}

-		if err := uploadBlobChunked(ctx, location, layer, regOpts, fn); err != nil {
+		if err := uploadBlob(ctx, location, layer, chunkSize, regOpts, fn); err != nil {
 			log.Printf("error uploading blob: %v", err)
 			return err
 		}
@@ -1388,7 +1397,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 		case resp.StatusCode == http.StatusUnauthorized:
 			auth := resp.Header.Get("www-authenticate")
 			authRedir := ParseAuthRedirectString(auth)
-			token, err := getAuthToken(ctx, authRedir, regOpts)
+			token, err := getAuthToken(ctx, authRedir)
 			if err != nil {
 				return nil, err
 			}
@@ -1436,6 +1445,15 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header

 	req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

+	if s := req.Header.Get("Content-Length"); s != "" {
+		contentLength, err := strconv.ParseInt(s, 10, 64)
+		if err != nil {
+			return nil, err
+		}
+
+		req.ContentLength = contentLength
+	}
+
 	client := &http.Client{
 		CheckRedirect: func(req *http.Request, via []*http.Request) error {
 			if len(via) >= 10 {
--- a/server/routes.go
+++ b/server/routes.go
@@ -12,6 +12,7 @@ import (
 	"os/signal"
 	"path/filepath"
 	"reflect"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -57,7 +58,7 @@ var loaded struct {
 var defaultSessionDuration = 5 * time.Minute

 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
-func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
+func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
 	opts := api.DefaultOptions()
 	if err := opts.FromMap(model.Options); err != nil {
 		log.Printf("could not load model options: %v", err)
@@ -93,7 +94,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
 			loaded.Embeddings = model.Embeddings
 		}

-		llmModel, err := llm.New(model.ModelPath, model.AdapterPaths, opts)
+		llmModel, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, opts)
 		if err != nil {
 			return err
 		}
@@ -129,6 +130,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
 			llmModel.SetOptions(opts)
 		}
 	}
+
 	loaded.expireAt = time.Now().Add(sessionDuration)

 	if loaded.expireTimer == nil {
@@ -149,6 +151,7 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
 			loaded.digest = ""
 		})
 	}
+
 	loaded.expireTimer.Reset(sessionDuration)
 	return nil
 }
@@ -171,8 +174,11 @@ func GenerateHandler(c *gin.Context) {
 		return
 	}

-	sessionDuration := defaultSessionDuration // TODO: set this duration from the request if specified
-	if err := load(c.Request.Context(), model, req.Options, sessionDuration); err != nil {
+	workDir := c.GetString("workDir")
+
+	// TODO: set this duration from the request if specified
+	sessionDuration := defaultSessionDuration
+	if err := load(c.Request.Context(), workDir, model, req.Options, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -217,8 +223,12 @@ func GenerateHandler(c *gin.Context) {
 			ch <- r
 		}

-		if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
-			ch <- gin.H{"error": err.Error()}
+		if req.Prompt == "" {
+			ch <- api.GenerateResponse{Model: req.Model, Done: true}
+		} else {
+			if err := loaded.llm.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
+				ch <- gin.H{"error": err.Error()}
+			}
 		}
 	}()

@@ -240,7 +250,9 @@ func EmbeddingHandler(c *gin.Context) {
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
-	if err := load(c.Request.Context(), model, req.Options, 5*time.Minute); err != nil {
+
+	workDir := c.GetString("workDir")
+	if err := load(c.Request.Context(), workDir, model, req.Options, 5*time.Minute); err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
@@ -330,6 +342,8 @@ func CreateModelHandler(c *gin.Context) {
 		return
 	}

+	workDir := c.GetString("workDir")
+
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
@@ -340,7 +354,7 @@ func CreateModelHandler(c *gin.Context) {
 		ctx, cancel := context.WithCancel(c.Request.Context())
 		defer cancel()

-		if err := CreateModel(ctx, req.Name, req.Path, fn); err != nil {
+		if err := CreateModel(ctx, workDir, req.Name, req.Path, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -494,33 +508,40 @@ func CopyModelHandler(c *gin.Context) {
 	}
 }

-func Serve(ln net.Listener, origins []string) error {
+var defaultAllowOrigins = []string{
+	"localhost",
+	"127.0.0.1",
+	"0.0.0.0",
+}
+
+func Serve(ln net.Listener, allowOrigins []string) error {
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
-	config.AllowOrigins = append(origins, []string{
-		"http://localhost",
-		"http://localhost:*",
-		"https://localhost",
-		"https://localhost:*",
-		"http://127.0.0.1",
-		"http://127.0.0.1:*",
-		"https://127.0.0.1",
-		"https://127.0.0.1:*",
-		"http://0.0.0.0",
-		"http://0.0.0.0:*",
-		"https://0.0.0.0",
-		"https://0.0.0.0:*",
-	}...)
+
+	config.AllowOrigins = allowOrigins
+	for _, allowOrigin := range defaultAllowOrigins {
+		config.AllowOrigins = append(config.AllowOrigins,
+			fmt.Sprintf("http://%s", allowOrigin),
+			fmt.Sprintf("https://%s", allowOrigin),
+			fmt.Sprintf("http://%s:*", allowOrigin),
+			fmt.Sprintf("https://%s:*", allowOrigin),
+		)
+	}
+
+	workDir, err := os.MkdirTemp("", "ollama")
+	if err != nil {
+		return err
+	}
+	defer os.RemoveAll(workDir)

 	r := gin.Default()
-	r.Use(cors.New(config))
-
-	r.GET("/", func(c *gin.Context) {
-		c.String(http.StatusOK, "Ollama is running")
-	})
-	r.HEAD("/", func(c *gin.Context) {
-		c.Status(http.StatusOK)
-	})
+	r.Use(
+		cors.New(config),
+		func(c *gin.Context) {
+			c.Set("workDir", workDir)
+			c.Next()
+		},
+	)

 	r.POST("/api/pull", PullModelHandler)
 	r.POST("/api/generate", GenerateHandler)
@@ -528,10 +549,17 @@ func Serve(ln net.Listener, origins []string) error {
 	r.POST("/api/create", CreateModelHandler)
 	r.POST("/api/push", PushModelHandler)
 	r.POST("/api/copy", CopyModelHandler)
-	r.GET("/api/tags", ListModelsHandler)
 	r.DELETE("/api/delete", DeleteModelHandler)
 	r.POST("/api/show", ShowModelHandler)

+	for _, method := range []string{http.MethodGet, http.MethodHead} {
+		r.Handle(method, "/", func(c *gin.Context) {
+			c.String(http.StatusOK, "Ollama is running")
+		})
+
+		r.Handle(method, "/api/tags", ListModelsHandler)
+	}
+
 	log.Printf("Listening on %s", ln.Addr())
 	s := &http.Server{
 		Handler: r,
@@ -539,15 +567,23 @@ func Serve(ln net.Listener, origins []string) error {

 	// listen for a ctrl+c and stop any loaded llm
 	signals := make(chan os.Signal, 1)
-	signal.Notify(signals, syscall.SIGINT)
+	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signals
 		if loaded.llm != nil {
 			loaded.llm.Close()
 		}
+		os.RemoveAll(workDir)
 		os.Exit(0)
 	}()

+	if runtime.GOOS == "linux" {
+		// check compatibility to log warnings
+		if _, err := llm.CheckVRAM(); err != nil {
+			log.Printf("Warning: GPU support may not enabled, check you have installed install GPU drivers: %v", err)
+		}
+	}
+
 	return s.Serve(ln)
 }

--- a/server/upload.go
+++ b/server/upload.go
@@ -14,7 +14,12 @@ import (
 	"github.com/jmorganca/ollama/api"
 )

-func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, error) {
+const (
+	redirectChunkSize = 1024 * 1024 * 1024
+	regularChunkSize  = 95 * 1024 * 1024
+)
+
+func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *RegistryOptions) (*url.URL, int64, error) {
 	requestURL := mp.BaseURL()
 	requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs/uploads/")
 	if layer.From != "" {
@@ -27,20 +32,26 @@ func startUpload(ctx context.Context, mp ModelPath, layer *Layer, regOpts *Regis
 	resp, err := makeRequestWithRetry(ctx, "POST", requestURL, nil, nil, regOpts)
 	if err != nil {
 		log.Printf("couldn't start upload: %v", err)
-		return nil, err
+		return nil, 0, err
 	}
 	defer resp.Body.Close()

-	// Extract UUID location from header
-	location := resp.Header.Get("Location")
+	location := resp.Header.Get("Docker-Upload-Location")
+	chunkSize := redirectChunkSize
 	if location == "" {
-		return nil, fmt.Errorf("location header is missing in response")
+		location = resp.Header.Get("Location")
+		chunkSize = regularChunkSize
 	}

-	return url.Parse(location)
+	locationURL, err := url.Parse(location)
+	if err != nil {
+		return nil, 0, err
+	}
+
+	return locationURL, int64(chunkSize), nil
 }

-func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
+func uploadBlob(ctx context.Context, requestURL *url.URL, layer *Layer, chunkSize int64, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
 	// TODO allow resumability
 	// TODO allow canceling uploads via DELETE

@@ -55,8 +66,12 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
 	}
 	defer f.Close()

-	// 95MB chunk size
-	chunkSize := 95 * 1024 * 1024
+	pw := ProgressWriter{
+		status: fmt.Sprintf("uploading %s", layer.Digest),
+		digest: layer.Digest,
+		total:  layer.Size,
+		fn:     fn,
+	}

 	for offset := int64(0); offset < int64(layer.Size); {
 		chunk := int64(layer.Size) - offset
@@ -64,87 +79,27 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
 			chunk = int64(chunkSize)
 		}

-		sectionReader := io.NewSectionReader(f, int64(offset), chunk)
-		for try := 0; try < MaxRetries; try++ {
-			ch := make(chan error, 1)
+		resp, err := uploadBlobChunk(ctx, http.MethodPatch, requestURL, f, offset, chunk, regOpts, &pw)
+		if err != nil {
+			fn(api.ProgressResponse{
+				Status:    fmt.Sprintf("error uploading chunk: %v", err),
+				Digest:    layer.Digest,
+				Total:     layer.Size,
+				Completed: int(offset),
+			})

-			r, w := io.Pipe()
-			defer r.Close()
-			go func() {
-				defer w.Close()
+			return err
+		}

-				for chunked := int64(0); chunked < chunk; {
-					select {
-					case err := <-ch:
-						log.Printf("chunk interrupted: %v", err)
-						return
-					default:
-						n, err := io.CopyN(w, sectionReader, 1024*1024)
-						if err != nil && !errors.Is(err, io.EOF) {
-							fn(api.ProgressResponse{
-								Status:    fmt.Sprintf("error reading chunk: %v", err),
-								Digest:    layer.Digest,
-								Total:     layer.Size,
-								Completed: int(offset),
-							})
+		offset += chunk
+		location := resp.Header.Get("Docker-Upload-Location")
+		if location == "" {
+			location = resp.Header.Get("Location")
+		}

-							return
-						}
-
-						chunked += n
-						fn(api.ProgressResponse{
-							Status:    fmt.Sprintf("uploading %s", layer.Digest),
-							Digest:    layer.Digest,
-							Total:     layer.Size,
-							Completed: int(offset) + int(chunked),
-						})
-					}
-				}
-			}()
-
-			headers := make(http.Header)
-			headers.Set("Content-Type", "application/octet-stream")
-			headers.Set("Content-Length", strconv.Itoa(int(chunk)))
-			headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
-			resp, err := makeRequest(ctx, "PATCH", requestURL, headers, r, regOpts)
-			if err != nil && !errors.Is(err, io.EOF) {
-				fn(api.ProgressResponse{
-					Status:    fmt.Sprintf("error uploading chunk: %v", err),
-					Digest:    layer.Digest,
-					Total:     layer.Size,
-					Completed: int(offset),
-				})
-
-				return err
-			}
-			defer resp.Body.Close()
-
-			switch {
-			case resp.StatusCode == http.StatusUnauthorized:
-				ch <- errors.New("unauthorized")
-
-				auth := resp.Header.Get("www-authenticate")
-				authRedir := ParseAuthRedirectString(auth)
-				token, err := getAuthToken(ctx, authRedir, regOpts)
-				if err != nil {
-					return err
-				}
-
-				regOpts.Token = token
-				sectionReader = io.NewSectionReader(f, int64(offset), chunk)
-				continue
-			case resp.StatusCode >= http.StatusBadRequest:
-				body, _ := io.ReadAll(resp.Body)
-				return fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
-			}
-
-			offset += sectionReader.Size()
-			requestURL, err = url.Parse(resp.Header.Get("Location"))
-			if err != nil {
-				return err
-			}
-
-			break
+		requestURL, err = url.Parse(location)
+		if err != nil {
+			return err
 		}
 	}

@@ -170,3 +125,90 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
 	}
 	return nil
 }
+
+func uploadBlobChunk(ctx context.Context, method string, requestURL *url.URL, r io.ReaderAt, offset, limit int64, opts *RegistryOptions, pw *ProgressWriter) (*http.Response, error) {
+	sectionReader := io.NewSectionReader(r, int64(offset), limit)
+
+	headers := make(http.Header)
+	headers.Set("Content-Type", "application/octet-stream")
+	headers.Set("Content-Length", strconv.Itoa(int(limit)))
+	headers.Set("X-Redirect-Uploads", "1")
+
+	if method == http.MethodPatch {
+		headers.Set("Content-Range", fmt.Sprintf("%d-%d", offset, offset+sectionReader.Size()-1))
+	}
+
+	for try := 0; try < MaxRetries; try++ {
+		resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sectionReader, pw), opts)
+		if err != nil && !errors.Is(err, io.EOF) {
+			return nil, err
+		}
+		defer resp.Body.Close()
+
+		switch {
+		case resp.StatusCode == http.StatusTemporaryRedirect:
+			location, err := resp.Location()
+			if err != nil {
+				return nil, err
+			}
+
+			pw.completed = int(offset)
+			if _, err := uploadBlobChunk(ctx, http.MethodPut, location, r, offset, limit, nil, pw); err != nil {
+				// retry
+				log.Printf("retrying redirected upload: %v", err)
+				continue
+			}
+
+			return resp, nil
+		case resp.StatusCode == http.StatusUnauthorized:
+			auth := resp.Header.Get("www-authenticate")
+			authRedir := ParseAuthRedirectString(auth)
+			token, err := getAuthToken(ctx, authRedir)
+			if err != nil {
+				return nil, err
+			}
+
+			opts.Token = token
+
+			pw.completed = int(offset)
+			sectionReader = io.NewSectionReader(r, offset, limit)
+			continue
+		case resp.StatusCode >= http.StatusBadRequest:
+			body, _ := io.ReadAll(resp.Body)
+			return nil, fmt.Errorf("on upload registry responded with code %d: %s", resp.StatusCode, body)
+		}
+
+		return resp, nil
+	}
+
+	return nil, fmt.Errorf("max retries exceeded")
+}
+
+type ProgressWriter struct {
+	status    string
+	digest    string
+	bucket    int
+	completed int
+	total     int
+	fn        func(api.ProgressResponse)
+}
+
+func (pw *ProgressWriter) Write(b []byte) (int, error) {
+	n := len(b)
+	pw.bucket += n
+	pw.completed += n
+
+	// throttle status updates to not spam the client
+	if pw.bucket >= 1024*1024 || pw.completed >= pw.total {
+		pw.fn(api.ProgressResponse{
+			Status:    pw.status,
+			Digest:    pw.digest,
+			Total:     pw.total,
+			Completed: pw.completed,
+		})
+
+		pw.bucket = 0
+	}
+
+	return n, nil
+}
Author	SHA1	Message	Date
Patrick Devine	c928ceb927	add word wrapping for lines which are longer than the terminal width (#553 )	2023-09-22 13:36:08 -07:00
Michael Yang	e1a0846483	Merge pull request #571 from jmorganca/mxyng/update-dockerfile update dockerfile.cuda	2023-09-22 12:34:41 -07:00
Jeffrey Morgan	f997e29e45	Add `Dockerfile.build` for building linux binaries (#558 ) Add `Dockerfile.build` for building linux binaries --------- Co-authored-by: Michael Yang <mxyng@pm.me>	2023-09-22 15:20:12 -04:00
Patrick Devine	87d9efb364	switch to forked readline lib which doesn't wreck the repl prompt (#578 )	2023-09-22 12:17:45 -07:00
Michael Yang	93d3a2568d	replace dockerfile	2023-09-22 11:57:38 -07:00
Michael Yang	5a81390b24	update dockerfile.cuda	2023-09-22 11:57:38 -07:00
Michael Yang	a89ef99aed	Merge pull request #575 from jmorganca/mxyng/fix-ipv6-only fix ipv6 parse ip	2023-09-22 11:47:11 -07:00
Bruce MacDonald	dc0c725ceb	ubuntu cuda drivers (#576 )	2023-09-22 19:43:14 +01:00
Bruce MacDonald	5d71bda478	close llm on interrupt (#577 )	2023-09-22 19:41:52 +01:00
Michael Yang	88897a90e4	fix ipv6 parse ip	2023-09-22 10:41:32 -07:00
Bruce MacDonald	9df31c3518	linux installer script (#534 ) Co-authored-by: Michael Yang <mxyng@pm.me>	2023-09-22 17:01:03 +01:00
Michael Yang	2044f9d4da	Merge pull request #570 from jmorganca/mxyng/head-request fix HEAD request	2023-09-21 16:56:17 -07:00
Michael Yang	0d186f3b33	Merge pull request #569 from jmorganca/mxyng/update-submodules silence warm up log	2023-09-21 16:52:42 -07:00
Michael Yang	82f5b66c01	register HEAD /api/tags	2023-09-21 16:38:03 -07:00
Michael Yang	c986694367	fix HEAD / request HEAD request should respond like their GET counterparts except without a response body.	2023-09-21 16:35:58 -07:00
Michael Yang	058d0cd04b	silence warm up log	2023-09-21 14:53:33 -07:00
Michael Yang	ee1c994d15	update submodule (#567 )	2023-09-21 16:22:23 -04:00
Bruce MacDonald	4cba75efc5	remove tmp directories created by previous servers (#559 ) * remove tmp directories created by previous servers * clean up on server stop * Update routes.go * Update server/routes.go Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * create top-level temp ollama dir * check file exists before creating --------- Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> Co-authored-by: Michael Yang <mxyng@pm.me>	2023-09-21 20:38:49 +01:00
Michael Yang	8c83701e9f	Merge pull request #566 from jmorganca/mxyng/api-check-model-exists Use API to check if model exists and pull if necessary	2023-09-21 10:35:14 -07:00
Michael Yang	6137b12799	validate existence and pull model using api	2023-09-21 09:55:34 -07:00
Michael Yang	1fabba474b	refactor default allow origins this should be less error prone	2023-09-21 09:42:25 -07:00
Michael Yang	765770efdb	Merge pull request #562 from jmorganca/mxyng/fix-ollama-host fix OLLAMA_HOST parsing for ip6	2023-09-20 19:54:47 -07:00
Michael Yang	9297ff8330	fix OLLAMA_HOST parsing for ip6	2023-09-20 18:52:57 -07:00
Michael Yang	ee4fd16f2c	Merge pull request #556 from jmorganca/pack-cuda pack in cuda libs	2023-09-20 15:02:36 -07:00
Michael Yang	a9ed7cc6aa	rename generate.go	2023-09-20 14:42:17 -07:00
Michael Yang	6c6a31a1e8	embed libraries using cmake	2023-09-20 14:41:57 -07:00
Bruce MacDonald	fc6ec356fc	remove libcuda.so	2023-09-20 20:36:14 +01:00
Bruce MacDonald	1255bc9b45	only package 11.8 runner	2023-09-20 20:00:41 +01:00
Michael Yang	084e4c782a	Merge pull request #557 from jmorganca/mxyng/cleanup fix impossible condition	2023-09-20 11:51:01 -07:00
Michael Yang	58ffa03d8b	fix impossible condition	2023-09-20 11:27:44 -07:00
Michael Yang	637f8bc6a5	Merge pull request #536 from jmorganca/mxyng/redirect-uploads explicitly follow upload redirects	2023-09-20 11:27:03 -07:00
Michael Yang	499e9007a5	pick chunksize based on location	2023-09-20 11:10:24 -07:00
Bruce MacDonald	b9bb5ca288	use cuda_version	2023-09-20 17:58:16 +01:00
Bruce MacDonald	4e8be787c7	pack in cuda libs	2023-09-20 17:40:42 +01:00
Michael Yang	aa45d7c1df	draft: explicitly follow upload redirects	2023-09-19 13:36:58 -07:00
Michael Yang	e35565c567	Merge pull request #555 from jmorganca/mxyng/fix-windows-startup fix build	2023-09-19 10:51:58 -07:00
Michael Yang	a5520bfb42	fix build	2023-09-19 10:42:24 -07:00
Michael Yang	2627c464ba	Merge pull request #554 from jmorganca/mxyng/fix-windows-startup fix mkdir on windows	2023-09-19 09:42:12 -07:00
Michael Yang	b58d5d16b0	fix mkdir on windows	2023-09-19 09:41:13 -07:00
Patrick Devine	24580df958	only add a layer if there is actual data (#535 )	2023-09-18 13:47:45 -07:00
Patrick Devine	80dd44e80a	Cmd changes (#541 )	2023-09-18 12:26:56 -07:00
James Braza	94e1d96b29	Updated README section on community projects for table (#550 )	2023-09-18 15:22:50 -04:00
Bruce MacDonald	66003e1d05	subprocess improvements (#524 ) * subprocess improvements - increase start-up timeout - when runner fails to start fail rather than timing out - try runners in order rather than choosing 1 runner - embed metal runner in metal dir rather than gpu - refactor logging and error messages * Update llama.go * Update llama.go * simplify by using glob	2023-09-18 15:16:32 -04:00
Michael Yang	c345053a8b	Merge pull request #537 from jmorganca/mxyng/upload fix error on upload chunk	2023-09-15 17:48:39 -07:00
Michael Yang	08d7c2a944	fix error on upload chunk	2023-09-15 15:59:30 -07:00
Michael Yang	bc9573dcb1	Merge pull request #530 from jmorganca/mxyng/progresswriter implement ProgressWriter	2023-09-15 12:43:46 -07:00
Michael Yang	e53bc57d4d	split uploadBlobChunked	2023-09-14 17:22:05 -07:00
Michael Yang	f0b398d17f	implement ProgressWriter	2023-09-14 17:22:04 -07:00
Patrick Devine	8efbc5df55	DRAFT: add a simple python client to access ollama (#522 )	2023-09-14 16:37:38 -07:00
Michael Yang	ccc3e9ac6d	Merge pull request #531 from jmorganca/mxyng/content-length set request.ContentLength	2023-09-14 13:33:11 -07:00
Michael Yang	daa4f096f9	set request.ContentLength This informs the HTTP client the content length is known and disables chunked Transfer-Encoding	2023-09-14 13:32:44 -07:00
Michael Yang	3ee85f1c6c	Merge pull request #526 from jmorganca/mxyng/cleanup remove unused	2023-09-14 13:10:59 -07:00
Bruce MacDonald	2540c9181c	support for packaging in multiple cuda runners (#509 ) * enable packaging multiple cuda versions * use nvcc cuda version if available --------- Co-authored-by: Michael Yang <mxyng@pm.me>	2023-09-14 15:08:13 -04:00
Michael Yang	83ffb154bc	Merge pull request #507 from jmorganca/mxyng/build update docker image	2023-09-14 11:25:59 -07:00
Michael Yang	9aa192c812	update cuda docker image	2023-09-14 11:25:20 -07:00
Matt Williams	fc8707686f	Update API docs (#527 ) * Update API docs Signed-off-by: Matt Williams <m@technovangelist.com> * strange TOC was getting auto generated Signed-off-by: Matt Williams <m@technovangelist.com> * Update docs/api.md Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * Update docs/api.md Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * Update docs/api.md Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * Update api.md --------- Signed-off-by: Matt Williams <m@technovangelist.com> Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> Co-authored-by: Michael Chiang <mchiang0610@users.noreply.github.com>	2023-09-14 08:51:26 -07:00
Michael Yang	f89c23764b	Merge pull request #525 from jmorganca/mxyng/falcon-decode fix: add falcon.go	2023-09-13 15:08:47 -07:00
Michael Yang	e6881cabd0	remove unused	2023-09-13 14:48:33 -07:00
Michael Yang	d028853879	fix: add falcon.go	2023-09-13 14:47:37 -07:00
Michael Yang	949553db23	Merge pull request #519 from jmorganca/mxyng/decode Mxyng/decode	2023-09-13 12:43:57 -07:00
Michael Yang	0c5a454361	fix model type for 70b	2023-09-12 15:12:59 -07:00
Bruce MacDonald	f59c4d03f7	fix ggml arm64 cuda build (#520 )	2023-09-12 17:06:48 -04:00
Michael Yang	7dee25a07f	fix falcon decode get model and file type from bin file	2023-09-12 12:34:53 -07:00
Bruce MacDonald	f221637053	first pass at linux gpu support (#454 ) * linux gpu support * handle multiple gpus * add cuda docker image (#488) --------- Co-authored-by: Michael Yang <mxyng@pm.me>	2023-09-12 11:04:35 -04:00