Compare commits


4 Commits

Author          SHA1        Message                               Date
Matt Williams   4522109b11  addressing new comments after merge   2023-10-15 14:17:23 -07:00
                            Signed-off-by: Matt Williams <m@technovangelist.com>
Matt Williams   b2974a7095  applied mikes comments                2023-10-14 08:29:24 -07:00
                            Signed-off-by: Matt Williams <m@technovangelist.com>
Matt Williams   3c975f898f  update doc to refer to docker image   2023-10-12 15:57:50 -07:00
                            Signed-off-by: Matt Williams <m@technovangelist.com>
Matt Williams   9245c8a1df  add how to quantize doc               2023-10-12 15:34:57 -07:00
                            Signed-off-by: Matt Williams <m@technovangelist.com>
7 changed files with 160 additions and 97 deletions

View File

@@ -5,8 +5,8 @@ ARG GOFLAGS="'-ldflags=-w -s'"
WORKDIR /go/src/github.com/jmorganca/ollama
RUN apt-get update && apt-get install -y git build-essential cmake
ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
COPY . .
ENV GOARCH=$TARGETARCH

View File

@@ -1,5 +1,6 @@
# centos7 amd64 dependencies
FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-devel-centos7 AS base-amd64
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
yum update -y && \
yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
@@ -7,7 +8,7 @@ RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
# centos8 arm64 dependencies
FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
FROM --platform=linux/arm64 nvidia/cuda:11.4.3-devel-centos8 AS base-arm64
RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y git cmake
@@ -16,8 +17,8 @@ ARG TARGETARCH
ARG GOFLAGS="'-ldflags -w -s'"
# install go
ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
ADD https://dl.google.com/go/go1.21.1.linux-$TARGETARCH.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama

View File

@@ -3,6 +3,7 @@ package api
import (
"encoding/json"
"fmt"
"log"
"math"
"os"
"reflect"
@@ -237,39 +238,44 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
// when JSON unmarshals numbers, it uses float64, not int
field.SetInt(int64(t))
default:
return fmt.Errorf("option %q must be of type integer", key)
log.Printf("could not convert model parameter %v of type %T to int, skipped", key, val)
}
case reflect.Bool:
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
log.Printf("could not convert model parameter %v of type %T to bool, skipped", key, val)
continue
}
field.SetBool(val)
case reflect.Float32:
// JSON unmarshals to float64
val, ok := val.(float64)
if !ok {
return fmt.Errorf("option %q must be of type float32", key)
log.Printf("could not convert model parameter %v of type %T to float32, skipped", key, val)
continue
}
field.SetFloat(val)
case reflect.String:
val, ok := val.(string)
if !ok {
return fmt.Errorf("option %q must be of type string", key)
log.Printf("could not convert model parameter %v of type %T to string, skipped", key, val)
continue
}
field.SetString(val)
case reflect.Slice:
// JSON unmarshals to []interface{}, not []string
val, ok := val.([]interface{})
if !ok {
return fmt.Errorf("option %q must be of type array", key)
log.Printf("could not convert model parameter %v of type %T to slice, skipped", key, val)
continue
}
// convert []interface{} to []string
slice := make([]string, len(val))
for i, item := range val {
str, ok := item.(string)
if !ok {
return fmt.Errorf("option %q must be of an array of strings", key)
log.Printf("could not convert model parameter %v of type %T to slice of strings, skipped", key, item)
continue
}
slice[i] = str
}

View File

@@ -162,56 +162,13 @@ app.on('before-quit', () => {
}
})
const updateURL = `https://ollama.ai/api/update?os=${process.platform}&arch=${
process.arch
}&version=${app.getVersion()}&id=${id()}`
let latest = ''
async function isNewReleaseAvailable() {
try {
const response = await fetch(updateURL)
if (!response.ok) {
return false
}
if (response.status === 204) {
return false
}
const data = await response.json()
const url = data?.url
if (!url) {
return false
}
if (latest === url) {
return false
}
latest = url
return true
} catch (error) {
logger.error(`update check failed - ${error}`)
return false
}
}
async function checkUpdate() {
const available = await isNewReleaseAvailable()
if (available) {
logger.info('checking for update')
autoUpdater.checkForUpdates()
}
}
function init() {
if (app.isPackaged) {
checkUpdate()
autoUpdater.checkForUpdates()
setInterval(() => {
checkUpdate()
if (!updateAvailable) {
autoUpdater.checkForUpdates()
}
}, 60 * 60 * 1000)
}
@@ -289,7 +246,11 @@ function id(): string {
return uuid
}
autoUpdater.setFeedURL({ url: updateURL })
autoUpdater.setFeedURL({
url: `https://ollama.ai/api/update?os=${process.platform}&arch=${
process.arch
}&version=${app.getVersion()}&id=${id()}`,
})
autoUpdater.on('error', e => {
logger.error(`update check failed - ${e.message}`)

View File

@@ -124,6 +124,7 @@ PARAMETER <parameter> <parametervalue>
| repeat_last_n | Sets how far back the model looks to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. | int | seed 42 |
| stop | Sets the stop sequences to use. | string | stop "AI assistant:" |
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 |
@@ -132,7 +133,7 @@ PARAMETER <parameter> <parametervalue>
### TEMPLATE
`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system prompt and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific.
`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system prompt and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
#### Template Variables

docs/quantize.md (new file, 111 lines)
View File

@@ -0,0 +1,111 @@
# How to Quantize a Model
Sometimes the model you want to work with is not available at [https://ollama.ai/library](https://ollama.ai/library).
## Figure out whether we can run the model
Not all models will work with Ollama. Several factors determine whether we can work with the next cool model. First, it has to work with llama.cpp. Then Ollama has to have implemented the llama.cpp features the model requires. And even with both of those in place, sometimes the model still might not work...
1. What is the model you want to convert and upload?
2. Visit the model's page on HuggingFace.
3. Switch to the **Files and versions** tab.
4. Click on the **config.json** file. If there is no config.json file, it may not work.
5. Take note of the **architectures** list in the JSON file (see the command-line sketch after this list).
6. Does any entry in the list match one of the following architectures?
1. LlamaForCausalLM
2. MistralForCausalLM
3. RWForCausalLM
4. FalconForCausalLM
5. GPTNeoXForCausalLM
6. GPTBigCodeForCausalLM
7. If the answer is yes, then there is a good chance the model will run after being converted and quantized.
8. An alternative to this process is to visit [https://caniquant.tvl.st](https://caniquant.tvl.st) and enter the org/modelname in the box and submit.
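If you prefer to check from the command line, here is a minimal sketch that pulls config.json straight from Hugging Face and prints the architectures field; it assumes the repository is public and uses mistralai/Mistral-7B-v0.1 purely as an example:
```shell
# Print the "architectures" field of a model's config.json (assumes curl and python3 are installed).
MODEL="mistralai/Mistral-7B-v0.1"   # example repo; replace with the org/model you are interested in
curl -sL "https://huggingface.co/${MODEL}/raw/main/config.json" \
  | python3 -c 'import json, sys; print(json.load(sys.stdin).get("architectures"))'
```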
At this point there are two processes you can use: you can either use a Docker container to convert and quantize, or you can run the scripts manually. The Docker container is the easiest way, but it requires Docker to be installed on your machine. If you don't have Docker installed, you can follow the manual process.
## Convert and Quantize with Docker
Run `docker run --rm -v /path/to/model/repo:/repo ollama/quantize -q quantlevel /repo`. For instance, if you want to use the latest Mistral 7B model, clone it to your machine, change into that directory, and run:
```shell
docker run --rm -v .:/repo ollama/quantize -q q4_0 /repo
```
You can find the different quantization levels below under **Quantize the model**.
This outputs two files into the directory. The first is an f16.bin file, which is the model converted to GGUF. The second is a q4_0.bin file, which is the model quantized to 4 bits. You should rename it to something more descriptive.
You can find the repository for the Docker container here: [https://github.com/mxyng/quantize](https://github.com/mxyng/quantize)
For instance, if you wanted to convert the Mistral 7B model to a Q4 quantized model, you could go through the following steps (a combined sketch follows the list):
1. First verify the model will potentially work.
2. Now clone Mistral 7B to your machine. You can find the command to run when you click the three vertical dots button on the model page, then click **Clone Repository**.
1. For this repo, the command is:
```shell
git lfs install
git clone https://huggingface.co/mistralai/Mistral-7B-v0.1
```
2. Navigate into the new directory and run `docker run --rm -v .:/repo ollama/quantize -q q4_0 /repo`
3. Now you can create a modelfile using the q4_0.bin file that was created.
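Putting those steps together, a minimal sketch of the whole Docker flow looks like this; it assumes git-lfs and Docker are installed, uses `$(pwd)` so the bind mount gets an absolute host path, and the final file name is just an example:
```shell
# Clone the model repo (the weights come down via git-lfs, so this is large).
git lfs install
git clone https://huggingface.co/mistralai/Mistral-7B-v0.1
cd Mistral-7B-v0.1

# Convert to GGUF and quantize to q4_0 inside the container.
docker run --rm -v "$(pwd)":/repo ollama/quantize -q q4_0 /repo

# Rename the quantized output to something more descriptive.
mv q4_0.bin mistral-7b-v0.1.q4_0.bin
```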
## Convert and Quantize Manually
### Clone llama.cpp to your machine
If we know the model has a chance of working, then we need to convert and quantize. This is a matter of running two separate scripts in the llama.cpp project.
1. Decide where you want the llama.cpp repository on your machine.
2. Navigate to that location and then run:
[`git clone https://github.com/ggerganov/llama.cpp.git`](https://github.com/ggerganov/llama.cpp.git)
1. If you don't have git installed, download this zip file and unzip it to that location: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.zip
3. Install the Python dependencies: `pip install torch transformers sentencepiece`
4. Run `make` to build the project and the quantize executable.
### Convert the model to GGUF
1. Decide on the right convert script to run; it depends on the model architecture you found in the first section. A combined sketch follows this list.
1. LlamaForCausalLM or MistralForCausalLM:
run `python3 convert.py <modelfilename>`
No need to specify fp16 or fp32.
2. FalconForCausalLM or RWForCausalLM:
run `python3 convert-falcon-hf-to-gguf.py <modelfilename> <fpsize>`
fpsize depends on the weight size. 1 for fp16, 0 for fp32
3. GPTNeoXForCausalLM:
run `python3 convert-gptneox-hf-to-gguf.py <modelfilename> <fpsize>`
fpsize depends on the weight size. 1 for fp16, 0 for fp32
4. GPTBigCodeForCausalLM:
run `python3 convert-starcoder-hf-to-gguf.py <modelfilename> <fpsize>`
fpsize depends on the weight size. 1 for fp16, 0 for fp32
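As an illustration, here is a minimal sketch of the manual flow for a Llama- or Mistral-architecture model; the paths and the cloned model directory are assumptions, so adjust them to your setup:
```shell
# Clone and build llama.cpp (make produces the quantize executable).
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
pip install torch transformers sentencepiece
make

# Convert a LlamaForCausalLM / MistralForCausalLM repo to GGUF.
# ../Mistral-7B-v0.1 is an example path to the model you cloned earlier.
python3 convert.py ../Mistral-7B-v0.1
```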
### Quantize the model
If the model converted successfully, there is a good chance it will also quantize successfully. Now you need to decide on the quantization to use. We will always try to create all the quantizations and upload them to the library. You should decide which level is most important to you and quantize accordingly.
The quantization options are as follows. Note that some architectures such as Falcon do not support K quants.
- Q4_0
- Q4_1
- Q5_0
- Q5_1
- Q2_K
- Q3_K
- Q3_K_S
- Q3_K_M
- Q3_K_L
- Q4_K
- Q4_K_S
- Q4_K_M
- Q5_K
- Q5_K_S
- Q5_K_M
- Q6_K
- Q8_0
Run the following command: `quantize <converted model from above> <output file> <quantization type>`
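For example, assuming the conversion step left a hypothetical ggml-model-f16.gguf in the model directory, a q4_0 quantization run from the llama.cpp directory might look like:
```shell
# File names are assumptions; point these at your actual converted output.
./quantize ../Mistral-7B-v0.1/ggml-model-f16.gguf ../Mistral-7B-v0.1/ggml-model-q4_0.gguf q4_0
```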
## Now Create the Model
Now you can create the Ollama model. Refer to the [modelfile](./modelfile.md) doc for more information on doing that.
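As a rough sketch (the file and model names here are placeholders; see the modelfile doc for the full syntax), creating and running the model could look like:
```shell
# Point a minimal Modelfile at the quantized weights, then build and run the Ollama model.
cat > Modelfile <<'EOF'
FROM ./q4_0.bin
EOF
ollama create my-mistral -f Modelfile
ollama run my-mistral "Why is the sky blue?"
```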

View File

@@ -30,43 +30,42 @@ import (
var llamaCppEmbed embed.FS
type ModelRunner struct {
Path string // path to the model runner executable
Accelerated bool
Path string // path to the model runner executable
}
func chooseRunners(workDir, runnerType string) []ModelRunner {
buildPath := path.Join("llama.cpp", runnerType, "build")
var runners []ModelRunner
var runners []string
// set the runners based on the OS
// IMPORTANT: the order of the runners in the array is the priority order
switch runtime.GOOS {
case "darwin":
runners = []ModelRunner{
{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
runners = []string{
path.Join(buildPath, "metal", "bin", "ollama-runner"),
path.Join(buildPath, "cpu", "bin", "ollama-runner"),
}
case "linux":
runners = []ModelRunner{
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
runners = []string{
path.Join(buildPath, "cuda", "bin", "ollama-runner"),
path.Join(buildPath, "cpu", "bin", "ollama-runner"),
}
case "windows":
// TODO: select windows GPU runner here when available
runners = []ModelRunner{
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
runners = []string{
path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe"),
}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []ModelRunner{
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
runners = []string{
path.Join(buildPath, "cpu", "bin", "ollama-runner"),
}
}
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
for _, r := range runners {
// find all the files in the runner's bin directory
files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r.Path), "*"))
files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r), "*"))
if err != nil {
// this is expected, ollama may be compiled without all runners packed in
log.Printf("%s runner not found: %v", r, err)
@@ -116,10 +115,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
localRunnersByPriority := []ModelRunner{}
for _, r := range runners {
// clean the ModelRunner paths so that they match the OS we are running on
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
Path: filepath.Clean(path.Join(workDir, r.Path)),
Accelerated: r.Accelerated,
})
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: filepath.Clean(path.Join(workDir, r))})
}
return localRunnersByPriority
@@ -219,11 +215,6 @@ func CheckVRAM() (int64, error) {
free += vram
}
if free*1024*1024 < 2*1000*1000*1000 {
log.Printf("less than 2 GB VRAM available, falling back to CPU only")
free = 0
}
return free, nil
}
@@ -247,8 +238,8 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
bytesPerLayer := fileSizeBytes / numLayer
// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
layers := int(freeVramBytes/bytesPerLayer) * 92 / 100
// max number of layers we can fit in VRAM, subtract 5% to prevent consuming all available VRAM and running out of memory
layers := int(freeVramBytes/bytesPerLayer) * 95 / 100
log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, layers)
return layers
@@ -270,7 +261,8 @@ func NewStatusWriter() *StatusWriter {
func (w *StatusWriter) Write(b []byte) (int, error) {
if _, after, ok := bytes.Cut(b, []byte("error:")); ok {
w.ErrCh <- fmt.Errorf("llama runner: %s", bytes.TrimSpace(after))
err := fmt.Errorf("llama runner: %s", after)
w.ErrCh <- err
}
return os.Stderr.Write(b)
}
@@ -285,20 +277,16 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
}
numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
params := []string{
"--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
"--embedding",
}
if numGPU > 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", numGPU))
}
if opts.NumGQA > 0 {
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
}
@@ -329,11 +317,6 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
// start the llama.cpp server with a retry in case the port is already in use
for _, runner := range runners {
if runner.Accelerated && numGPU == 0 {
log.Printf("skipping accelerated runner because num_gpu=0")
continue
}
if _, err := os.Stat(runner.Path); err != nil {
log.Printf("llama runner not found: %v", err)
continue