change error handler behavior and fix error when a model isn't found

Merge pull request #172 from jmorganca/set-vars-first
fix vars.First
2023-07-21 22:44:04 -07:00 · 2023-07-21 20:55:06 -07:00 · 2023-07-21 20:45:32 -07:00 · 2023-07-21 20:27:25 -07:00 · 2023-07-21 17:30:40 -07:00 · 2023-07-21 15:42:19 -07:00
36 changed files with 4454 additions and 899 deletions
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 <div align="center">
  <picture>
-    <source media="(prefers-color-scheme: dark)" height="200px" srcset="https://github.com/jmorganca/ollama/assets/3325447/318048d2-b2dd-459c-925a-ac8449d5f02c">
-    <img alt="logo" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/c7d6e15f-7f4d-4776-b568-c084afa297c2">
+    <source media="(prefers-color-scheme: dark)" height="200px" srcset="https://github.com/jmorganca/ollama/assets/3325447/56ea1849-1284-4645-8970-956de6e51c3c">
+    <img alt="logo" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
  </picture>
 </div>

@@ -9,10 +9,10 @@

 [![Discord](https://dcbadge.vercel.app/api/server/ollama?style=flat&compact=true)](https://discord.gg/ollama)

-Create, run, and share large language models (LLMs). Ollama bundles a model’s weights, configuration, prompts, and more into self-contained packages that can run on any machine.
-
 > Note: Ollama is in early preview. Please report any issues you find.

+Run, create, and share large language models (LLMs).
+
 ## Download

 - [Download](https://ollama.ai/download) for macOS on Apple Silicon (Intel coming soon)
@@ -29,7 +29,7 @@ ollama run llama2

 ## Model library

-Ollama includes a library of open-source, pre-trained models. More models are coming soon. You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.
+`ollama` includes a library of open-source models:

 | Model                    | Parameters | Size  | Download                    |
 | ------------------------ | ---------- | ----- | --------------------------- |
@@ -40,6 +40,8 @@ Ollama includes a library of open-source, pre-trained models. More models are co
 | Nous-Hermes              | 13B        | 7.3GB | `ollama pull nous-hermes`   |
 | Wizard Vicuna Uncensored | 13B        | 7.3GB | `ollama pull wizard-vicuna` |

+> Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.
+
 ## Examples

 ### Run a model
@@ -50,26 +52,25 @@ ollama run llama2
 Hello! How can I help you today?
 ```

-### Create a custom character model
+### Create a custom model

 Pull a base model:

 ```
-ollama pull orca
+ollama pull llama2
 ```

 Create a `Modelfile`:

 ```
-FROM orca
-PROMPT """
-### System:
+FROM llama2
+
+# set the temperature to 1 [higher is more creative, lower is more coherent]
+PARAMETER temperature 1
+
+# set the system prompt
+SYSTEM """
 You are Mario from Super Mario Bros. Answer as Mario, the assistant, only.
-
-### User:
-{{ .Prompt }}
-
-### Response:
 """
 ```

@@ -87,9 +88,26 @@ For more examples, see the [examples](./examples) directory.
 ### Pull a model from the registry

 ```
-ollama pull nous-hermes
+ollama pull orca
 ```

+### Listing local models
+
+```
+ollama list
+```
+
+## Model packages
+
+### Overview
+
+Ollama bundles model weights, configuration, and data into a single package, defined by a [Modelfile](./docs/modelfile.md).
+
+<picture>
+  <source media="(prefers-color-scheme: dark)" height="480" srcset="https://github.com/jmorganca/ollama/assets/251292/2fd96b5f-191b-45c1-9668-941cfad4eb70">
+  <img alt="logo" height="480" src="https://github.com/jmorganca/ollama/assets/251292/2fd96b5f-191b-45c1-9668-941cfad4eb70">
+</picture>
+
 ## Building

 ```
@@ -107,3 +125,13 @@ Finally, run a model!
 ```
 ./ollama run llama2
 ```
+
+## REST API
+
+### `POST /api/generate`
+
+Generate text from a model.
+
+```
+curl -X POST http://localhost:11434/api/generate -d '{"model": "llama2", "prompt":"Why is the sky blue?"}'
+```
--- a/api/client.go
+++ b/api/client.go
@@ -27,7 +27,7 @@ func checkError(resp *http.Response, body []byte) error {
 	err := json.Unmarshal(body, &apiError)
 	if err != nil {
 		// Use the full body as the message if we fail to decode a response.
-		apiError.Message = string(body)
+		apiError.ErrorMessage = string(body)
 	}

 	return apiError
@@ -92,7 +92,6 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 		}
 	}
 	return nil
-
 }

 func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
@@ -131,11 +130,15 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			return fmt.Errorf("unmarshal: %w", err)
 		}

+		if errorResponse.Error != "" {
+			return fmt.Errorf("stream: %s", errorResponse.Error)
+		}
+
 		if response.StatusCode >= 400 {
 			return StatusError{
-				StatusCode: response.StatusCode,
-				Status:     response.Status,
-				Message:    errorResponse.Error,
+				StatusCode:   response.StatusCode,
+				Status:       response.Status,
+				ErrorMessage: errorResponse.Error,
 			}
 		}

@@ -206,3 +209,10 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	}
 	return &lr, nil
 }
+
+func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
+	if err := c.do(ctx, http.MethodDelete, "/api/delete", req, nil); err != nil {
+		return err
+	}
+	return nil
+}
--- a/api/types.go
+++ b/api/types.go
@@ -8,16 +8,23 @@ import (
 )

 type StatusError struct {
-	StatusCode int
-	Status     string
-	Message    string
+	StatusCode   int
+	Status       string
+	ErrorMessage string `json:"error"`
 }

 func (e StatusError) Error() string {
-	if e.Message != "" {
-		return fmt.Sprintf("%s: %s", e.Status, e.Message)
+	switch {
+	case e.Status != "" && e.ErrorMessage != "":
+		return fmt.Sprintf("%s: %s", e.Status, e.ErrorMessage)
+	case e.Status != "":
+		return e.Status
+	case e.ErrorMessage != "":
+		return e.ErrorMessage
+	default:
+		// this should not happen
+		return "something went wrong, please see the ollama server logs for details"
 	}
-	return e.Status
 }

 type GenerateRequest struct {
@@ -37,21 +44,27 @@ type CreateProgress struct {
 	Status string `json:"status"`
 }

+type DeleteRequest struct {
+	Name string `json:"name"`
+}
+
 type PullRequest struct {
 	Name     string `json:"name"`
+	Insecure bool   `json:"insecure,omitempty"`
 	Username string `json:"username"`
 	Password string `json:"password"`
 }

 type ProgressResponse struct {
-	Status    string  `json:"status"`
-	Digest    string  `json:"digest,omitempty"`
-	Total     int     `json:"total,omitempty"`
-	Completed int     `json:"completed,omitempty"`
+	Status    string `json:"status"`
+	Digest    string `json:"digest,omitempty"`
+	Total     int    `json:"total,omitempty"`
+	Completed int    `json:"completed,omitempty"`
 }

 type PushRequest struct {
 	Name     string `json:"name"`
+	Insecure bool   `json:"insecure,omitempty"`
 	Username string `json:"username"`
 	Password string `json:"password"`
 }
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -25,7 +25,7 @@ import (
 	"github.com/jmorganca/ollama/server"
 )

-func create(cmd *cobra.Command, args []string) error {
+func CreateHandler(cmd *cobra.Command, args []string) error {
 	filename, _ := cmd.Flags().GetString("file")
 	filename, err := filepath.Abs(filename)
 	if err != nil {
@@ -59,7 +59,7 @@ func create(cmd *cobra.Command, args []string) error {
 	return nil
 }

-func RunRun(cmd *cobra.Command, args []string) error {
+func RunHandler(cmd *cobra.Command, args []string) error {
 	mp := server.ParseModelPath(args[0])
 	fp, err := mp.GetManifestPath(false)
 	if err != nil {
@@ -69,7 +69,7 @@ func RunRun(cmd *cobra.Command, args []string) error {
 	_, err = os.Stat(fp)
 	switch {
 	case errors.Is(err, os.ErrNotExist):
-		if err := pull(args[0]); err != nil {
+		if err := pull(args[0], false); err != nil {
 			var apiStatusError api.StatusError
 			if !errors.As(err, &apiStatusError) {
 				return err
@@ -86,10 +86,15 @@ func RunRun(cmd *cobra.Command, args []string) error {
 	return RunGenerate(cmd, args)
 }

-func push(cmd *cobra.Command, args []string) error {
+func PushHandler(cmd *cobra.Command, args []string) error {
 	client := api.NewClient()

-	request := api.PushRequest{Name: args[0]}
+	insecure, err := cmd.Flags().GetBool("insecure")
+	if err != nil {
+		return err
+	}
+
+	request := api.PushRequest{Name: args[0], Insecure: insecure}
 	fn := func(resp api.ProgressResponse) error {
 		fmt.Println(resp.Status)
 		return nil
@@ -101,7 +106,7 @@ func push(cmd *cobra.Command, args []string) error {
 	return nil
 }

-func list(cmd *cobra.Command, args []string) error {
+func ListHandler(cmd *cobra.Command, args []string) error {
 	client := api.NewClient()

 	models, err := client.List(context.Background())
@@ -131,17 +136,33 @@ func list(cmd *cobra.Command, args []string) error {
 	return nil
 }

-func RunPull(cmd *cobra.Command, args []string) error {
-	return pull(args[0])
+func DeleteHandler(cmd *cobra.Command, args []string) error {
+	client := api.NewClient()
+
+	request := api.DeleteRequest{Name: args[0]}
+	if err := client.Delete(context.Background(), &request); err != nil {
+		return err
+	}
+	fmt.Printf("deleted '%s'\n", args[0])
+	return nil
 }

-func pull(model string) error {
+func PullHandler(cmd *cobra.Command, args []string) error {
+	insecure, err := cmd.Flags().GetBool("insecure")
+	if err != nil {
+		return err
+	}
+
+	return pull(args[0], insecure)
+}
+
+func pull(model string, insecure bool) error {
 	client := api.NewClient()

 	var currentDigest string
 	var bar *progressbar.ProgressBar

-	request := api.PullRequest{Name: model}
+	request := api.PullRequest{Name: model, Insecure: insecure}
 	fn := func(resp api.ProgressResponse) error {
 		if resp.Digest != currentDigest && resp.Digest != "" {
 			currentDigest = resp.Digest
@@ -290,7 +311,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		switch {
 		case strings.HasPrefix(line, "/list"):
 			args := strings.Fields(line)
-			if err := list(cmd, args[1:]); err != nil {
+			if err := ListHandler(cmd, args[1:]); err != nil {
 				return err
 			}

@@ -387,7 +408,7 @@ func NewCLI() *cobra.Command {
 		Use:   "create MODEL",
 		Short: "Create a model from a Modelfile",
 		Args:  cobra.MinimumNArgs(1),
-		RunE:  create,
+		RunE:  CreateHandler,
 	}

 	createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
@@ -396,7 +417,7 @@ func NewCLI() *cobra.Command {
 		Use:   "run MODEL [PROMPT]",
 		Short: "Run a model",
 		Args:  cobra.MinimumNArgs(1),
-		RunE:  RunRun,
+		RunE:  RunHandler,
 	}

 	runCmd.Flags().Bool("verbose", false, "Show timings for response")
@@ -412,20 +433,32 @@ func NewCLI() *cobra.Command {
 		Use:   "pull MODEL",
 		Short: "Pull a model from a registry",
 		Args:  cobra.MinimumNArgs(1),
-		RunE:  RunPull,
+		RunE:  PullHandler,
 	}

+	pullCmd.Flags().Bool("insecure", false, "Use an insecure registry")
+
 	pushCmd := &cobra.Command{
 		Use:   "push MODEL",
 		Short: "Push a model to a registry",
 		Args:  cobra.MinimumNArgs(1),
-		RunE:  push,
+		RunE:  PushHandler,
 	}

+	pushCmd.Flags().Bool("insecure", false, "Use an insecure registry")
+
 	listCmd := &cobra.Command{
-		Use:   "list",
-		Short: "List models",
-		RunE:  list,
+		Use:     "list",
+		Aliases: []string{"ls"},
+		Short:   "List models",
+		RunE:    ListHandler,
+	}
+
+	deleteCmd := &cobra.Command{
+		Use:   "rm",
+		Short: "Remove a model",
+		Args:  cobra.MinimumNArgs(1),
+		RunE:  DeleteHandler,
 	}

 	rootCmd.AddCommand(
@@ -435,6 +468,7 @@ func NewCLI() *cobra.Command {
 		pullCmd,
 		pushCmd,
 		listCmd,
+		deleteCmd,
 	)

 	return rootCmd
--- a/docs/development.md
+++ b/docs/development.md
@@ -6,6 +6,12 @@ Install required tools:
 brew install go
 ```

+Enable CGO:
+
+```
+export CGO_ENABLED=1
+```
+
 Then build ollama:

 ```
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -1,5 +1,7 @@
 # Ollama Model File

+> Note: this model file syntax is in development
+
 A model file is the blueprint to create and share models with Ollama.

 ## Format
@@ -11,13 +13,13 @@ The format of the Modelfile:
 INSTRUCTION arguments
 ```

-| Instruction               | Description                                           |
-| ------------------------- | ----------------------------------------------------- |
-| `FROM`<br>(required)      | Defines the base model to use                         |
-| `PARAMETER`<br>(optional) | Sets the parameters for how Ollama will run the model |
-| `SYSTEM`<br>(optional)    | Specifies the system prompt that will set the context |
-| `TEMPLATE`<br>(optional)  | The full prompt template to be sent to the model      |
-| `LICENSE`<br>(optional)   | Specifies the legal license                           |
+| Instruction       | Description                                           |
+| ----------------- | ----------------------------------------------------- |
+| `FROM` (required) | Defines the base model to use                         |
+| `PARAMETER`       | Sets the parameters for how Ollama will run the model |
+| `SYSTEM`          | Specifies the system prompt that will set the context |
+| `TEMPLATE`        | The full prompt template to be sent to the model      |
+| `LICENSE`         | Specifies the legal license                           |

 ## Examples

@@ -36,8 +38,8 @@ SYSTEM You are Mario from super mario bros, acting as an assistant.

 To use this:

-1. Save it as a file (eg. modelfile)
-2. `ollama create NAME -f <location of the file eg. ./modelfile>'`
+1. Save it as a file (eg. `Modelfile``)
+2. `ollama create NAME -f <location of the file eg. ./Modelfile>'`
 3. `ollama run NAME`
 4. Start using the model!

@@ -52,7 +54,7 @@ FROM <model name>:<tag>
 ### Build from llama2

 ```
-FROM llama2:latest
+FROM llama2
 ```

 A list of available base models:
--- a/examples/mario/Modelfile
+++ b/examples/mario/Modelfile
@@ -1,11 +1,5 @@
 FROM llama2
 PARAMETER temperature 1
-PROMPT """
-{{- if not .Context }}
-<<SYS>>
+SYSTEM """
 You are Mario from super mario bros, acting as an assistant.
-<</SYS>>
-
-{{- end }}
-[INST] {{ .Prompt }} [/INST]
-"""
+"""
--- a/examples/mario/readme.md
+++ b/examples/mario/readme.md
@@ -20,14 +20,8 @@ What the model file looks like:
 ```
 FROM llama2
 PARAMETER temperature 1
-PROMPT """
-{{- if not .Context }}
-<<SYS>>
-You are Mario from super mario bros, acting as an assistant.
-<</SYS>>
-
-{{- end }}
-[INST] {{ .Prompt }} [/INST]
+SYSTEM """
+You are Mario from Super Mario Bros, acting as an assistant.
 """
 ```

--- a/examples/midjourney-prompter/Modelfile
+++ b/examples/midjourney-prompter/Modelfile
@@ -1,15 +1,8 @@
 # Modelfile for creating a Midjourney prompts from a topic
 # This prompt was adapted from the original at https://www.greataiprompts.com/guide/midjourney/best-chatgpt-prompt-for-midjourney/
-# Run `ollama create mj -f pathtofile` and then `ollama run mj` and enter a topic
+# Run `ollama create mj -f ./Modelfile` and then `ollama run mj` and enter a topic

 FROM nous-hermes
-PROMPT """
-{{- if not .Context }}
-### System:
+SYSTEM """
 Embrace your role as an AI-powered creative assistant, employing Midjourney to manifest compelling AI-generated art. I will outline a specific image concept, and in response, you must produce an exhaustive, multifaceted prompt for Midjourney, ensuring every detail of the original concept is represented in your instructions. Midjourney doesn't do well with text, so after the prompt, give me instructions that I can use to create the titles in a image editor.
-{{- end }}
-### Instruction:
-{{ .Prompt }}
-
-### Response:
-"""
+"""
--- a/examples/recipemaker/Modelfile
+++ b/examples/recipemaker/Modelfile
@@ -1,13 +1,6 @@
 # Modelfile for creating a recipe from a list of ingredients
-# Run `ollama create recipemaker -f pathtofile` and then `ollama run recipemaker` and feed it lists of ingredients to create recipes around.
+# Run `ollama create recipemaker -f ./Modelfile` and then `ollama run recipemaker` and feed it lists of ingredients to create recipes around.
 FROM nous-hermes
-PROMPT """
-{{- if not .Context }}
-### System:
+SYSTEM """
 The instruction will be a list of ingredients. You should generate a recipe that can be made in less than an hour. You can also include ingredients that most people will find in their pantry every day. The recipe should be 4 people and you should include a description of what the meal will taste like
-{{- end }}
-### Instruction:
-{{ .Prompt }}
-
-### Response:
 """
--- a/examples/tweetwriter/Modelfile
+++ b/examples/tweetwriter/Modelfile
@@ -1,14 +1,7 @@
 # Modelfile for creating a tweet from a topic
-# Run `ollama create tweetwriter -f pathtofile` and then `ollama run tweetwriter` and enter a topic 
+# Run `ollama create tweetwriter -f ./Modelfile` and then `ollama run tweetwriter` and enter a topic

 FROM nous-hermes
-PROMPT """
-{{- if not .Context }}
-### System:
+SYSTEM """
 You are a content marketer who needs to come up with a short but succinct tweet. Make sure to include the appropriate hashtags and links. Sometimes when appropriate, describe a meme that can be includes as well. All answers should be in the form of a tweet which has a max size of 280 characters. Every instruction will be the topic to create a tweet about.
-{{- end }}
-### Instruction:
-{{ .Prompt }}
-
-### Response:
-"""
+"""
--- a/library/modelfiles/llama2_13b
+++ b/library/modelfiles/llama2_13b
--- a/library/modelfiles/llama2_7b
+++ b/library/modelfiles/llama2_7b
--- a/llama/ggml-cuda.cu
+++ b/llama/ggml-cuda.cu
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
--- a/llama/ggml-metal.h
+++ b/llama/ggml-metal.h
@@ -1,5 +1,7 @@
+//go:build darwin
+
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
--- a/llama/ggml-metal.m
+++ b/llama/ggml-metal.m
@@ -1,7 +1,7 @@
-// +build darwin
+//go:build darwin

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
@@ -722,8 +722,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                                        } break;
                                    case GGML_TYPE_Q5_K:
@@ -731,8 +731,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                                        } break;
                                    case GGML_TYPE_Q6_K:
@@ -740,8 +740,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                                        } break;
                                    default:
@@ -767,15 +767,18 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];

-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                                    src0t == GGML_TYPE_Q4_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src0t == GGML_TYPE_Q5_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src0t == GGML_TYPE_Q6_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src0t == GGML_TYPE_Q2_K ||
-                                         src0t == GGML_TYPE_Q3_K ||
-                                         src0t == GGML_TYPE_Q4_K ||
-                                         src0t == GGML_TYPE_Q5_K ||
-                                         src0t == GGML_TYPE_Q6_K) {
+                                         src0t == GGML_TYPE_Q3_K) {
                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
@@ -821,7 +824,7 @@ void ggml_metal_graph_compute(

                            const float eps = 1e-6f;

-                            const int nth = 256;
+                            const int nth = 512;

                            [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -829,7 +832,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                            [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                            [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];

                            const int64_t nrows = ggml_nrows(src0);

@@ -910,28 +913,35 @@ void ggml_metal_graph_compute(

                            const int n_past = ((int32_t *)(src1->data))[0];

+                            float freq_base;
+                            float freq_scale;
+                            memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+                            memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
                            [encoder setComputePipelineState:ctx->pipeline_rope];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00   length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01   length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02   length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03   length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00   length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01   length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02   length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03   length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0    length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1    length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2    length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3    length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0    length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1    length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2    length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3    length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
-                            [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:18];
+                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
--- a/llama/ggml-metal.metal
+++ b/llama/ggml-metal.metal
@@ -1,5 +1,7 @@
+//go:build darwin
+
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
@@ -357,26 +359,33 @@ kernel void kernel_rms_norm(
        threadgroup float  * sum [[threadgroup(0)]],
        uint tgpig[[threadgroup_position_in_grid]],
        uint tpitg[[thread_position_in_threadgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]],
+        uint tiisg[[thread_index_in_simdgroup]],
        uint   ntg[[threads_per_threadgroup]]) {
-    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float * x_scalar = (device const float *) x;
+    float4 sumf=0;
+    float all_sum=0;

    // parallel sum
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        sum[tpitg] += x[i00] * x[i00];
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        sumf += x[i00] * x[i00];
+    }
+    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
+    all_sum = simd_sum(all_sum);
+    if (tiisg == 0) {
+        sum[sgitg] = all_sum;
    }

-    // reduce
    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
+    // broadcast, simd group number is ntg / 32
+    for (int i = ntg / 32 / 2; i > 0; i /= 2) {
+       if (tpitg < i) {
+           sum[tpitg] += sum[tpitg + i];
+       }
    }
-
-    // broadcast
    if (tpitg == 0) {
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
        sum[0] /= ne00;
    }

@@ -385,10 +394,99 @@ kernel void kernel_rms_norm(
    const float mean  = sum[0];
    const float scale = 1.0f/sqrt(mean + eps);

-    device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+    device float4 * y = (device float4 *) (dst + tgpig*ne00);
+    device float * y_scalar = (device float *) y;
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
        y[i00] = x[i00] * scale;
    }
+    if (tpitg == 0) {
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+    }
+}
+
+// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
+float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
+    float d = qb_curr->d;
+    float4 acc = 0.f;
+    device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
+    for (int i = 0; i < 16; i+=2) {
+        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
+        acc[2] += yl[i +  1] * (qs[i / 2] & 0x0F00);
+        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+    }
+    return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
+}
+
+// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
+float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+    float4 acc = 0.f;
+    device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
+    for (int i = 0; i < 16; i+=2) {
+        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
+        acc[2] += yl[i +  1] * (qs[i / 2] & 0x0F00);
+        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+    }
+    return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
+}
+
+// putting them in the kernel cause a significant performance penalty
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+template<typename block_q_type>
+void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
+                    int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
+                    uint2 tgpig, uint tiisg, uint sgitg) {
+    const int nb = ne00/QK4_0;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
+    device const float      * y = (device const float      *) src1 + r1*ne10;
+    float4 y_curr[8];       // src1 vector cache
+    float sumf[N_DST]={0.f}, all_sum;
+    thread float * yl=(thread float *)y_curr;
+
+    // each thread in a SIMD group deals with 1 block.
+    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4  *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        }
+
+        for (int row = 0; row < N_DST; row++) {
+            sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
+        }
+    }
+
+    // from now loads two rows every time and 16 blocks per row
+    int ir = tiisg / (N_SIMDWIDTH / 2);
+    int ib = tiisg % (N_SIMDWIDTH / 2);
+    for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
+        int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        }
+
+        for (int row = 0; row < N_DST; row+=2) {
+            if (nb_start + ib < nb) {
+                sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
+            }
+        }
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+        }
+    }
 }

 kernel void kernel_mul_mat_q4_0_f32(
@@ -398,65 +496,11 @@ kernel void kernel_mul_mat_q4_0_f32(
        constant   int64_t & ne00,
        constant   int64_t & ne10,
        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
+        constant   int64_t & ne01[[buffer(4)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-    const int nb = ne00/QK4_0;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
-    device const float      * y = (device const float      *) src1 + r1*ne10;
-
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4;           // 0 or 1
-    const int iy = tpitg.y - 4*ix;      // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
-
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
-
-        const float d = (float)x[i].d;
-
-        device const uint8_t * xl = x[i].qs + first;
-        device const float   * yl = y + i * QK4_0 + first;
-
-        float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-
-            acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
-            acc[1] += yl[j] + yl[j+16];
-
-        }
-
-        sumf += d * (acc[0] - 8.f*acc[1]);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
 }

 kernel void kernel_mul_mat_q4_1_f32(
@@ -466,66 +510,11 @@ kernel void kernel_mul_mat_q4_1_f32(
        constant   int64_t & ne00,
        constant   int64_t & ne10,
        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
+        constant   int64_t & ne01[[buffer(4)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-    const int nb = ne00/QK4_1;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
-    device const float      * y = (device const float      *) src1 + r1*ne10;
-
-    const uint nth = tptg.x*tptg.y;
-    const uint ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4;           // 0 or 1
-    const int iy = tpitg.y - 4*ix;      // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
-
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
-
-        const float d = (float)x[i].d;
-        const float m = (float)x[i].m;
-
-        device const uint8_t * xl = x[i].qs + first;
-        device const float   * yl = y + i * QK4_1 + first;
-
-        float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-
-            acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
-            acc[1] += yl[j+16] * (d * (xl[j] >>  4) + m);
-
-        }
-
-        sumf += acc[0] + acc[1];
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+     mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
 }

 kernel void kernel_mul_mat_f16_f32(
@@ -641,17 +630,19 @@ kernel void kernel_rope(
        constant       int & n_past,
        constant       int & n_dims,
        constant       int & mode,
+        constant     float & freq_base,
+        constant     float & freq_scale,
        uint3 tpig[[thread_position_in_grid]]) {
    const int64_t i3 = tpig[2];
    const int64_t i2 = tpig[1];
    const int64_t i1 = tpig[0];

    const bool is_neox = mode & 2;
-    const float theta_scale = pow(10000.0, -2.0f/n_dims);
+    const float theta_scale = pow(freq_base, -2.0f/n_dims);

    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = (float)p;
+    float theta = freq_scale * (float)p;

    if (!is_neox) {
        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -1489,6 +1480,7 @@ kernel void kernel_mul_mat_q3_K_f32(

 }

+#if QK_K == 256
 kernel void kernel_mul_mat_q4_K_f32(
        device const  void * src0,
        device const float * src1,
@@ -1496,131 +1488,180 @@ kernel void kernel_mul_mat_q4_K_f32(
        constant   int64_t & ne00,
        constant   int64_t & ne10,
        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
+        constant   int64_t & ne01[[buffer(4)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
-    device const float     * yy = (device const float      *) src1 + r1*ne10;
-
-    float sumf = 0;
-
-#if QK_K == 256
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

-    const int tid = tpitg.y;   // 0...16
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 4;
+    const int ix = tiisg/8;  // 0...3
+    const int it = tiisg%8;  // 0...7
+    const int im = it/4;     // 0 or 1
+    const int ir = it%4;     // 0...3

-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
+    device const float      * y = (device const float      *) src1 + r1*ne10;
+    float yl[16];
+    float yh[16];
+    float sumf[N_DST]={0.f}, all_sum;

-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
+    const int step = sizeof(block_q4_K) * nb / 2;

-    uchar2 sc1, sc2, sc3, sc4;
+    device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir;

-    for (int i = tpitg.x; i < nb; i += tptg.x) {
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;

-        device const uint8_t * q1 = (x + i)->qs + q_offset;
-        device const uint8_t * q2 = q1 + 64;
-        device const float   * y1 = yy + i*QK_K + y_offset;
-        device const float   * y2 = y1 + 128;
-
-        const float dall = (float)((x + i)->d);
-        const float dmin = (float)((x + i)->dmin);
-
-        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
-        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
-        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
-        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
-        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-
-            s[0] += y1[l] * (q1[l] & 0xF); s[1] += y1[l+32] * (q1[l] >> 4);
-            s[2] += y2[l] * (q2[l] & 0xF); s[3] += y2[l+32] * (q2[l] >> 4);
-            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
+    for (int ib = ix; ib < nb; ib += 4) {

+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
+            yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
+            yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
+            yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
        }
-        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;

+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im;
+        device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir;
+        device const half     * dh = &x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            sc16[0] = sc[0] & kmask1;
+            sc16[1] = sc[2] & kmask1;
+            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
+            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
+
+            device const uint16_t * q2 = q1 + 32;
+
+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
+                acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
+                acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
+                acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
+                acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
+                acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
+                acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
+                                 (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
+                                 (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += step;
+            sc += step;
+            dh += step;
+        }
+
+        y4 += 4 * QK_K;
    }
-#else
-    uint16_t aux16[2];
-    thread const uint8_t * scales = (thread const uint8_t *)aux16;

-    const int il  = 4*tpitg.x;
-
-    for (int i = tpitg.y; i < nb; i += tptg.y) {
-
-        device const uint8_t * q = x[i].qs + il;
-        device const float   * y = yy + i * QK_K + il;
-
-        const float d = (float)x[i].d[0];
-        const float m = (float)x[i].d[1];
-
-        device const uint16_t * a = (device const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        for (int l = 0; l < 4; ++l) {
-            sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
-                  + d * scales[1] * (y[l+32] * (q[l] >>  4) + y[l+48] * (q[l+16] >>  4)) - m * scales[3] * (y[l+32] + y[l+48]);
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = all_sum;
        }
    }
-#endif
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    // This version is slightly faster than the commented out one below,
-    // which I copy-pasted from ggerganov's q4_0 dot product for metal.
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
-
-    //// accumulate the sum from all threads in the threadgroup
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (uint i = nth/2; i > 0; i /= 2) {
-    //    if (ith < i) {
-    //        sum[ith] += sum[ith + i];
-    //    }
-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
-    //}
-
-    //if (ith == 0) {
-    //    dst[r1*ne0 + r0] = sum[0];
-    //}
 }
+#else
+kernel void kernel_mul_mat_q4_K_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        constant   int64_t & ne01[[buffer(4)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    const int ix = tiisg/4;  // 0...7
+    const int it = tiisg%4;  // 0...3
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
+    device const float      * y = (device const float      *) src1 + r1*ne10;
+    float yl[8];
+    float yh[8];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int step = sizeof(block_q4_K) * nb / 2;
+
+    device const float * y4 = y + ix * QK_K + 8 * it;
+
+    uint16_t sc16[4];
+
+    for (int ib = ix; ib < nb; ib += 8) {
+
+        float2 sumy = {0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
+            yh[i] = y4[i+32]; sumy[1] += yh[i];
+        }
+
+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
+        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
+        device const half     * dh = x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            sc16[0] = sc[0] & 0x000f;
+            sc16[1] = sc[0] & 0x0f00;
+            sc16[2] = sc[0] & 0x00f0;
+            sc16[3] = sc[0] & 0xf000;
+
+            float2 acc1 = {0.f, 0.f};
+            float2 acc2 = {0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
+                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
+                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
+                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
+
+            qs += step;
+            sc += step;
+            dh += step;
+        }
+
+        y4 += 8 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = all_sum;
+        }
+    }
+}
+#endif

 kernel void kernel_mul_mat_q5_K_f32(
        device const  void * src0,
@@ -1629,39 +1670,39 @@ kernel void kernel_mul_mat_q5_K_f32(
        constant   int64_t & ne00,
        constant   int64_t & ne10,
        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

    const int nb = ne00/QK_K;

    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;

-    device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
+
+    device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb;
    device const float     * yy = (device const float      *) src1 + r1*ne10;

-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
+    float sumf[2]={0.f};

-    float sumf = 0;
+    const int step = sizeof(block_q5_K) * nb;

 #if QK_K == 256
+#
+    float yl[16], yh[16];

    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

-    const int tid = tpitg.y;   // 0...16
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 4;
+    const int tid = tiisg/4;
+    const int ix  = tiisg%4;
+    const int im  = tid/4;
+    const int ir  = tid%4;
+    const int n   = 8;

-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
+    const int l0 = n*ir;
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

@@ -1670,78 +1711,114 @@ kernel void kernel_mul_mat_q5_K_f32(
    const uint8_t hm3 = hm1 << 4;
    const uint8_t hm4 = hm2 << 4;

-    uchar2 sc1, sc2, sc3, sc4;
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;

-    for (int i = tpitg.x; i < nb; i += tptg.x) {
+    device const float * y1 = yy + ix*QK_K + y_offset;

-        device const uint8_t * q1 = (x + i)->qs + q_offset;
-        device const uint8_t * q2 = q1 + 64;
-        device const uint8_t * qh = (x + i)->qh + l0;
-        device const float   * y1 = yy + i*QK_K + y_offset;
-        device const float   * y2 = y1 + 128;
+    for (int i = ix; i < nb; i += 4) {

-        const float dall = (float)((x + i)->d);
-        const float dmin = (float)((x + i)->dmin);
+        device const uint8_t * q1 = x[i].qs + q_offset;
+        device const uint8_t * qh = x[i].qh + l0;
+        device const half * dh = &x[i].d;
+        device const uint16_t * a = (device const uint16_t *)x[i].scales + im;

-        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
-        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
-        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
-        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
-        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
+        device const float * y2 = y1 + 128;
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 8; ++l) {
+            yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
+            yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
+            yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
+            yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
+        }

-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
+        for (int row = 0; row < 2; ++row) {

-            s[0] += y1[l+ 0] * ((q1[l] & 0xF) + (qh[l] & hm1 ? 16 : 0));
-            s[1] += y1[l+32] * ((q1[l] >>  4) + (qh[l] & hm2 ? 16 : 0));
-            s[2] += y2[l+ 0] * ((q2[l] & 0xF) + (qh[l] & hm3 ? 16 : 0));
-            s[3] += y2[l+32] * ((q2[l] >>  4) + (qh[l] & hm4 ? 16 : 0));
-            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
+            device const uint8_t * q2 = q1 + 64;
+
+            sc16[0] = a[0] & kmask1;
+            sc16[1] = a[2] & kmask1;
+            sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
+            sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
+
+            float4 acc = {0.f, 0.f, 0.f, 0.f};
+            for (int l = 0; l < n; ++l) {
+                uint8_t h = qh[l];
+                acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0));
+                acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0));
+                acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0));
+                acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0));
+            }
+            const float dall = dh[0];
+            const float dmin = dh[1];
+            sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += step;
+            qh += step;
+            dh += step/2;
+            a  += step/2;

        }
-        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
+
+        y1 += 4 * QK_K;

    }
 #else
-    const int il  = 4 * tpitg.x;  // 0, 4, 8, 12
-    const int im  = il/8;         // 0, 0, 1, 1
-    const int in  = il%8;         // 0, 4, 0, 4
+    float yl[8], yh[8];

-    for (int i = tpitg.y; i < nb; i += tptg.y) {
+    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
+    const int ix = tiisg%8;
+    const int im = il/8;         // 0, 0, 1, 1
+    const int in = il%8;         // 0, 4, 0, 4

-        const float d = (float)x[i].d;
+    device const float * y = yy + ix*QK_K + il;
+
+    for (int i = ix; i < nb; i += 8) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 4; ++l) {
+            yl[l+0] = y[l+ 0];
+            yl[l+4] = y[l+16];
+            yh[l+0] = y[l+32];
+            yh[l+4] = y[l+48];
+        }
+
+        device const half * dh = &x[i].d;
        device const uint8_t * q = x[i].qs + il;
        device const uint8_t * h = x[i].qh + in;
        device const int8_t  * s = x[i].scales;
-        device const float   * y = yy + i*QK_K + il;

-        for (int l = 0; l < 4; ++l) {
-            const uint8_t hl = h[l] >> im;
-            sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
-                  + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
-                  + y[l+32] * d * s[2] * ((q[l+ 0] >>  4) - (hl & 0x10 ? 0 : 16))
-                  + y[l+48] * d * s[3] * ((q[l+16] >>  4) - (hl & 0x40 ? 0 : 16));
+        for (int row = 0; row < 2; ++row) {
+
+            const float d = dh[0];
+
+            float2 acc = {0.f, 0.f};
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t hl = h[l] >> im;
+                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
+                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
+                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
+                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
+            }
+            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
+
+            q += step;
+            h += step;
+            s += step;
+            dh += step/2;
+
        }
+
+        y += 8 * QK_K;
    }
 #endif
-    sum[ith] = sumf;

-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
+    for (int row = 0; row < 2; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = tot;
+        }
    }

 }
@@ -1753,10 +1830,9 @@ kernel void kernel_mul_mat_q6_K_f32(
        constant   int64_t & ne00,
        constant   int64_t & ne10,
        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

    const uint8_t kmask1 = 0x03;
    const uint8_t kmask2 = 0x0C;
@@ -1768,19 +1844,18 @@ kernel void kernel_mul_mat_q6_K_f32(
    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;

-    device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
-    device const float     * yy = (device const float      *) src1 + r1*ne10;
+    const int row = 2 * r0 + sgitg;

-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
+    device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb;
+    device const float     * yy = (device const float      *) src1 + r1*ne10;

    float sumf = 0;

 #if QK_K == 256
-    // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
-    const int iqs  = 16 * tpitg.y;
-    const int ip   = iqs / 128;         // 0 or 1
-    const int il   = (iqs - 128*ip)/16; // 0...7
+    const int tid  = tiisg/2;
+    const int ix   = tiisg%2;
+    const int ip   = tid/8;         // 0 or 1
+    const int il   = tid%8;
    const int n    = 4;
    const int l0   = n*il;
    const int is   = 8*ip + l0/16;
@@ -1789,9 +1864,10 @@ kernel void kernel_mul_mat_q6_K_f32(
    const int q_offset_l = 64*ip + l0;
    const int q_offset_h = 32*ip + l0;

-    for (int i = tpitg.x; i < nb; i += tptg.x) {
+    for (int i = ix; i < nb; i += 2) {

-        device const uint8_t * ql = x[i].ql + q_offset_l;
+        device const uint8_t * q1 = x[i].ql + q_offset_l;
+        device const uint8_t * q2 = q1 + 32;
        device const uint8_t * qh = x[i].qh + q_offset_h;
        device const int8_t  * sc = x[i].scales + is;

@@ -1801,19 +1877,21 @@ kernel void kernel_mul_mat_q6_K_f32(

        float4 sums = {0.f, 0.f, 0.f, 0.f};
        for (int l = 0; l < n; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+32] * ((int8_t)((ql[l+32] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+64] * ((int8_t)((ql[l+ 0]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
-            sums[3] += y[l+96] * ((int8_t)((ql[l+32]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
+            sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
+            sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
+            sums[2] += y[l+64] * ((int8_t)((q1[l]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
+            sums[3] += y[l+96] * ((int8_t)((q2[l]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
        }

        sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);

    }
-#else
-    const int il  = 4*tpitg.x;    // 0, 4, 8, 12

-    for (int i = tpitg.y; i < nb; i += tptg.y) {
+#else
+    const int ix  = tiisg/4;
+    const int il  = 4*(tiisg%4);
+
+    for (int i = ix; i < nb; i += 8) {
        device const float * y = yy + i * QK_K + il;
        device const uint8_t * ql = x[i].ql + il;
        device const uint8_t * qh = x[i].qh + il;
@@ -1833,23 +1911,8 @@ kernel void kernel_mul_mat_q6_K_f32(

 #endif

-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
+    const float tot = simd_sum(sumf);
+    if (tiisg == 0) {
+        dst[r1*ne0 + row] = tot;
    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
-
 }
--- a/llama/ggml-mpi.c
+++ b/llama/ggml-mpi.c
@@ -0,0 +1,244 @@
+//go:build mpi
+
+/**
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ggml-mpi.h"
+
+#include "ggml.h"
+
+#include <mpi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_mpi_context {
+    int rank;
+    int size;
+};
+
+void ggml_mpi_backend_init(void) {
+    MPI_Init(NULL, NULL);
+}
+
+void ggml_mpi_backend_free(void) {
+    MPI_Finalize();
+}
+
+struct ggml_mpi_context * ggml_mpi_init(void) {
+    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
+
+    return ctx;
+}
+
+void ggml_mpi_free(struct ggml_mpi_context * ctx) {
+    free(ctx);
+}
+
+int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
+    return ctx->rank;
+}
+
+void ggml_mpi_eval_init(
+        struct ggml_mpi_context * ctx_mpi,
+                            int * n_tokens,
+                            int * n_past,
+                            int * n_threads) {
+    UNUSED(ctx_mpi);
+
+    // synchronize the worker node parameters with the root node
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Bcast(n_tokens,  1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(n_past,    1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
+}
+
+static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
+    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
+    if (t == NULL) {
+        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
+        return -1;
+    }
+
+    for (int i = 0; i < gf->n_nodes; i++) {
+        if (gf->nodes[i] == t) {
+            return i;
+        }
+    }
+
+    fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
+    return -1;
+}
+
+static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
+    MPI_Datatype mpi_type;
+
+    switch (t->type) {
+        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
+        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
+        default: GGML_ASSERT(false && "not implemented");
+    }
+
+    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
+    GGML_ASSERT(retval == MPI_SUCCESS);
+}
+
+static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
+    MPI_Datatype mpi_type;
+
+    switch (t->type) {
+        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
+        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
+        default: GGML_ASSERT(false && "not implemented");
+    }
+
+    MPI_Status status; UNUSED(status);
+
+    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+    GGML_ASSERT(retval == MPI_SUCCESS);
+}
+
+// TODO: there are many improvements that can be done to this implementation
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+             struct ggml_cgraph * gf,
+                            int   n_layers) {
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    if (inp_tokens == NULL) {
+        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
+        return;
+    }
+
+    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
+    if (inp0 == NULL) {
+        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
+        return;
+    }
+
+    GGML_ASSERT(inp0 == gf->nodes[0]);
+
+    // distribute the compute graph into slices across the MPI nodes
+    //
+    // the main node (0) processes the last layers + the remainder of the compute graph
+    // and is responsible to pass the input tokens to the first node (1)
+    //
+    // node 1:   [(  0) * n_per_node, (  1) * n_per_node)
+    // node 2:   [(  1) * n_per_node, (  2) * n_per_node)
+    // ...
+    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
+    // node 0:   [(n-1) * n_per_node,            n_nodes)
+    //
+    if (mpi_rank > 0) {
+        if (mpi_rank == 1) {
+            // the first node (1) receives the input tokens from the main node (0)
+            ggml_mpi_tensor_recv(inp_tokens, 0);
+        } else {
+            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
+            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
+        }
+    } else if (mpi_size > 1) {
+        // node 0 sends the input tokens to node 1
+        ggml_mpi_tensor_send(inp_tokens, 1);
+
+        // recv the output data from the last node
+        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
+    }
+
+    {
+        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
+
+        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
+
+        const int il0 =               (mpi_idx + 0) * n_per_node;
+        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
+
+        char name_l0[GGML_MAX_NAME];
+        char name_l1[GGML_MAX_NAME];
+
+        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
+        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
+
+        const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
+        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
+
+        if (idx_l0 < 0 || idx_l1 < 0) {
+            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
+            return;
+        }
+
+        // attach the input data to all nodes that need it
+        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
+        for (int i = idx_l0; i < idx_l1; i++) {
+            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src[0] =  inp0;
+            }
+            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src[1] =  inp0;
+            }
+        }
+
+        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
+        for (int i = 1; i < idx_l1 - idx_l0; i++) {
+            gf->nodes[i] = gf->nodes[idx_l0 + i];
+            gf->grads[i] = gf->grads[idx_l0 + i];
+        }
+
+        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
+        if (mpi_idx != 0) {
+            gf->nodes[0]->op = GGML_OP_NONE;
+        }
+
+        gf->n_nodes = idx_l1 - idx_l0;
+
+        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
+    }
+}
+
+void ggml_mpi_graph_compute_post(
+        struct ggml_mpi_context * ctx_mpi,
+             struct ggml_cgraph * gf,
+                            int   n_layers) {
+    UNUSED(n_layers);
+
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    // send the output data to the next node
+    if (mpi_rank > 0) {
+        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
+    }
+}
--- a/llama/ggml-mpi.h
+++ b/llama/ggml-mpi.h
@@ -0,0 +1,67 @@
+//go:build mpi
+
+/**
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+struct ggml_context;
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_mpi_context;
+
+void ggml_mpi_backend_init(void);
+void ggml_mpi_backend_free(void);
+
+struct ggml_mpi_context * ggml_mpi_init(void);
+void ggml_mpi_free(struct ggml_mpi_context * ctx);
+
+int ggml_mpi_rank(struct ggml_mpi_context * ctx);
+
+void ggml_mpi_eval_init(
+        struct ggml_mpi_context * ctx_mpi,
+                            int * n_tokens,
+                            int * n_past,
+                            int * n_threads);
+
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+             struct ggml_cgraph * gf,
+                            int   n_layers);
+
+void ggml_mpi_graph_compute_post(
+        struct ggml_mpi_context * ctx_mpi,
+             struct ggml_cgraph * gf,
+                            int   n_layers);
+
+#ifdef __cplusplus
+}
+#endif
--- a/llama/ggml-opencl.cpp
+++ b/llama/ggml-opencl.cpp
--- a/llama/ggml-opencl.h
+++ b/llama/ggml-opencl.h
@@ -0,0 +1,53 @@
+//go:build opencl
+
+/**
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+void ggml_cl_init(void);
+
+void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+
+void * ggml_cl_host_malloc(size_t size);
+void   ggml_cl_host_free(void * ptr);
+
+void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
+void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/llama/ggml.c
+++ b/llama/ggml.c
--- a/llama/ggml.h
+++ b/llama/ggml.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
@@ -227,8 +227,13 @@
 #define GGML_MAX_NAME          48
 #define GGML_DEFAULT_N_THREADS 4

+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
 #define GGML_UNUSED(x) (void)(x)

+
 #define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
@@ -389,6 +394,8 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
@@ -468,6 +475,10 @@ extern "C" {

        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
        int n_tasks[GGML_MAX_NODES];
+
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
    };

    // computation graph
@@ -1136,6 +1147,17 @@ extern "C" {
            int                   mode,
            int                   n_ctx);

+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 freq_base,
+            float                 freq_scale,
+            int                   n_ctx);
+
    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1190,6 +1212,31 @@ extern "C" {
            int                   s,
            int                   d);

+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor* ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0, // kernel size
+            int                   s0, // stride
+            int                   p0); // padding
+
+    GGML_API struct ggml_tensor* ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1);
+
    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1329,7 +1376,7 @@ extern "C" {
    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API              void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API               int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
    GGML_API              void ggml_graph_reset  (struct ggml_cgraph * cgraph);

    // same as ggml_graph_compute() but the work data is allocated as a part of the context
--- a/llama/k_quants.c
+++ b/llama/k_quants.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
--- a/llama/k_quants.h
+++ b/llama/k_quants.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
@@ -41,6 +41,14 @@
 #define K_SCALE_SIZE 12
 #endif

+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //
--- a/llama/llama-util.h
+++ b/llama/llama-util.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
@@ -201,13 +201,13 @@ struct llama_mmap {
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
        size = file->size;
        int fd = fileno(file->fp);
-        int flags = MAP_PRIVATE;
+        int flags = MAP_SHARED;
        // prefetch/readahead impairs performance on NUMA systems
        if (numa) { prefetch = 0; }
 #ifdef __linux__
        if (prefetch) { flags |= MAP_POPULATE; }
 #endif
-        addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }
@@ -249,7 +249,7 @@ struct llama_mmap {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

-        addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        error = GetLastError();
        CloseHandle(hMapping);

--- a/llama/llama.cpp
+++ b/llama/llama.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
@@ -127,14 +127,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //

-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,    256ull * MB },
-        { MODEL_7B,    512ull * MB },
-        { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   512ull * MB },
-        { MODEL_65B,  1024ull * MB },
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
    };
    return k_sizes;
 }
@@ -166,14 +167,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
-        { MODEL_7B,   768ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
    };
    return k_sizes;
 }
@@ -215,6 +216,10 @@ struct llama_hparams {
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

    bool operator!=(const llama_hparams & other) const {
@@ -329,7 +334,7 @@ struct llama_model {
 };

 struct llama_context {
-    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
    ~llama_context() {
        if (ctx_metal) {
@@ -350,7 +355,6 @@ struct llama_context {
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

    const llama_model & model;
-    const llama_vocab & vocab;

    bool model_owner = false;

@@ -577,7 +581,9 @@ struct llama_file_loader {
            }

            // skip to the next multiple of 32 bytes
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            }

            tensor.file_off = file.tell();
            tensor.name = name;
@@ -674,7 +680,7 @@ struct llama_model_loader {
        *ctx_size_p = *mmapped_size_p = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
        }
    }

@@ -870,6 +876,8 @@ struct llama_context_params llama_context_default_params() {
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ {0},
+        /*.rope_freq_base              =*/ 10000.0f,
+        /*.rope_freq_scale             =*/ 1.0f,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram                    =*/ false,
@@ -895,6 +903,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
    return result;
 }

+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
    return llama_mmap::SUPPORTED;
 }
@@ -993,6 +1005,8 @@ static void llama_model_load_internal(
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
@@ -1027,22 +1041,27 @@ static void llama_model_load_internal(
        }

        hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base  = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
    }

    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

    {
-        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: format     = %s\n",   __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab    = %u\n",   __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx      = %u\n",   __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd     = %u\n",   __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult     = %u\n",   __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head     = %u\n",   __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
-        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
+        fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
+        fprintf(stderr, "%s: model size = %s\n",   __func__, llama_model_type_name(model.type));
    }

    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1191,9 +1210,9 @@ static void llama_model_load_internal(
        const size_t mem_required =
            ctx_size +
            mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at    (model.type);
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

        // this is the memory required by one llama_state
        const size_t mem_required_state =
@@ -1297,6 +1316,8 @@ static bool llama_model_load(
        int n_gpu_layers,
        int main_gpu,
        float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
@@ -1305,7 +1326,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::exception & err) {
@@ -1357,6 +1378,9 @@ static bool llama_eval_internal(
    const int n_rot        = hparams.n_embd/hparams.n_head;
    const int n_gpu_layers = model.n_gpu_layers;

+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
    auto & mem_per_token = lctx.mem_per_token;
    auto & buf_compute   = lctx.buf_compute;

@@ -1454,11 +1478,11 @@ static bool llama_eval_internal(
            offload_func_kq(tmpq);
            ggml_set_name(tmpq, "tmpq");

-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
            offload_func_kq(Kcur);
            ggml_set_name(Kcur, "Kcur");

-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
            offload_func_kq(Qcur);
            ggml_set_name(Qcur, "Qcur");

@@ -2032,9 +2056,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
    }

    // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
    }

    float cum_sum = 0.0f;
@@ -2213,7 +2246,7 @@ void llama_sample_classifier_free_guidance(
          struct llama_context * guidance_ctx,
                         float   scale,
                         float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();

    assert(ctx);
    auto n_vocab = llama_n_vocab(ctx);
@@ -2701,8 +2734,9 @@ struct llama_model * llama_load_model_from_file(
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
        delete model;
        fprintf(stderr, "%s: failed to load model\n", __func__);
        return nullptr;
@@ -2723,7 +2757,7 @@ struct llama_context * llama_new_context_with_model(
        return nullptr;
    }

-    llama_context * ctx = new llama_context(*model, model->vocab);
+    llama_context * ctx = new llama_context(*model);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
@@ -2777,9 +2811,9 @@ struct llama_context * llama_new_context_with_model(
            ctx->embedding.resize(hparams.n_embd);
        }

-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
    }

@@ -3561,13 +3595,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
    return 0;
 }

-int llama_tokenize(
-        struct llama_context * ctx,
+int llama_tokenize_with_model(
+    const struct llama_model * model,
                  const char * text,
                 llama_token * tokens,
                         int   n_max_tokens,
                        bool   add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
        fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3581,8 +3615,29 @@ int llama_tokenize(
    return res.size();
 }

+int llama_tokenize(
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }

 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3593,17 +3648,25 @@ int llama_n_embd(const struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
 }

+int llama_get_vocab_from_model(
+        const struct llama_model * model,
+        const char * * strings,
+        float  * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = model->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 int llama_get_vocab(
        const struct llama_context * ctx,
        const char * * strings,
        float  * scores,
        int capacity) {
-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
-    for (int i = 0; i<n; ++i) {
-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = ctx->vocab.id_to_token[i].score;
-    }
-    return n;
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
 }

 float * llama_get_logits(struct llama_context * ctx) {
@@ -3614,12 +3677,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
 }

-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    if (token >= llama_n_vocab(ctx)) {
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
        return nullptr;
    }

-    return ctx->vocab.id_to_token[token].tok.c_str();
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }

 llama_token llama_token_bos() {
--- a/llama/llama.h
+++ b/llama/llama.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
@@ -115,6 +115,11 @@ extern "C" {
        int32_t  n_gpu_layers;                 // number of layers to store in VRAM
        int32_t  main_gpu;                     // the GPU that is used for scratch and small tensors
        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float    rope_freq_base;  // RoPE base frequency
+        float    rope_freq_scale; // RoPE frequency scaling factor
+
        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
        // context pointer passed to the progress callback
@@ -174,6 +179,8 @@ extern "C" {
        int32_t n_eval;
    };

+    LLAMA_API int llama_max_devices();
+
    LLAMA_API struct llama_context_params llama_context_default_params();
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -296,10 +303,21 @@ extern "C" {
                             int   n_max_tokens,
                            bool   add_bos);

+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
    LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
    // Get the vocabulary as output parameters.
    // Returns number of results.
    LLAMA_API int llama_get_vocab(
@@ -308,6 +326,12 @@ extern "C" {
                                 float * scores,
                                   int   capacity);

+    LLAMA_API int llama_get_vocab_from_model(
+              const struct llama_model * model,
+                          const char * * strings,
+                                 float * scores,
+                                   int   capacity);
+
    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
    // Can be mutated in order to change the probabilities of the next token
@@ -320,7 +344,13 @@ extern "C" {
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+                           llama_token   token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+              const struct llama_model * model,
+                           llama_token   token);

    // Special tokens
    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
--- a/llama/update-llama-cpp.sh
+++ b/llama/update-llama-cpp.sh
@@ -0,0 +1,70 @@
+#!/bin/sh
+
+set -eu
+
+
+status() { echo >&2 ">>> $*"; }
+error() { status "ERROR $*"; }
+usage() {
+    echo "usage: $(basename $0) /path/to/repo"
+    exit 1
+}
+
+OUT=$(dirname $0)
+while getopts "hC:" OPTION; do
+    case $OPTION in
+        C) OUT=$OPTARG ;;
+        *) usage ;;
+    esac
+done
+
+shift $(( $OPTIND - 1 ))
+[ $# -eq 1 ] || usage
+
+status "updating source..."
+cp -a "$1"/*.{c,h,cpp,m,metal,cu} "$OUT"
+
+status "removing incompatible files..."
+rm -f "$OUT"/build-info.h
+
+SHA1=$(git -C $1 rev-parse @)
+
+LICENSE=$(mktemp)
+cleanup() {
+    rm -f $LICENSE
+}
+trap cleanup 0
+
+cat <<EOF | sed 's/ *$//' >$LICENSE
+/**
+ * llama.cpp - git $SHA1
+ *
+$(sed 's/^/ * /' <$1/LICENSE)
+ */
+
+EOF
+
+for IN in $OUT/*.{c,h,cpp,m,metal,cu}; do
+    TMP=$(mktemp)
+    status "updating license $IN"
+    cat $LICENSE $IN >$TMP
+    mv $TMP $IN
+done
+
+touchup() {
+    local CONSTRAINT=$1 && shift
+
+    for IN in $*; do
+        status "touching up $IN..."
+        TMP=$(mktemp)
+        {
+            echo "//go:build $CONSTRAINT"
+            echo
+        } | cat - $IN >$TMP
+        mv $TMP $IN
+    done
+}
+
+touchup darwin $OUT/ggml-metal.*
+touchup mpi $OUT/ggml-mpi.*
+touchup opencl $OUT/ggml-opencl.*
--- a/models.json
+++ b/models.json
@@ -1,38 +0,0 @@
-[
-  {
-    "name": "orca",
-    "display_name": "Orca Mini",
-    "parameters": "3B",
-    "url": "https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_1.bin",
-    "short_description": "Follow instructions. Great small model that runs fast even without GPU support.",
-    "description": "An OpenLLaMa-3B model trained on explain tuned datasets, created using Instructions and Input from WizardLM, Alpaca & Dolly-V2 datasets and applying Orca Research Paper dataset construction approaches.",
-    "published_by": "TheBloke",
-    "original_author": "psmathur",
-    "original_url": "https://huggingface.co/psmathur/orca_mini_3b",
-    "license": "CC-BY-SA-4.0"
-  },
-  {
-    "name": "nous-hermes",
-    "display_name": "Nous Hermes",
-    "parameters": "13B",
-    "url": "https://huggingface.co/TheBloke/Nous-Hermes-13B-GGML/resolve/main/nous-hermes-13b.ggmlv3.q2_K.bin",
-    "short_description": "Currently one of the best 13B general model.",
-    "description": "It is suitable for a wide range of language tasks, from generating creative text to understanding and following complex instructions. This model was fine-tuned by Nous Research, with Teknium and Karan4D leading the fine tuning process and dataset curation, Redmond AI sponsoring the compute, and several other contributors. The result is an enhanced Llama 13b model that rivals GPT-3.5-turbo in performance across a variety of tasks. \n \n This model stands out for its long responses, low hallucination rate, and absence of OpenAI censorship mechanisms. The fine-tuning process was performed with a 2000 sequence length on an 8x a100 80GB DGX machine for over 50 hours.",
-    "published_by": "TheBloke",
-    "original_author": "NousResearch",
-    "original_url": "https://huggingface.co/NousResearch/Nous-Hermes-13b",
-    "license": "GPL"
-  },
-  {
-    "name": "vicuna",
-    "display_name": "Vicuna",
-    "parameters": "7B",
-    "url": "https://huggingface.co/TheBloke/vicuna-7B-v1.3-GGML/resolve/main/vicuna-7b-v1.3.ggmlv3.q4_0.bin",
-    "short_description": "Vicuna is a chat assistant trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.",
-    "description": "The primary use of Vicuna is research on large language models and chatbots. The primary intended users of the model are researchers and hobbyists in natural language processing, machine learning, and artificial intelligence.",
-    "published_by": "TheBloke",
-    "original_author": "LMSYS",
-    "original_url": "https://huggingface.co/lmsys/vicuna-7b-v1.3",
-    "license:": "Non-commercial"
-  }
-]
--- a/server/images.go
+++ b/server/images.go
@@ -22,6 +22,12 @@ import (
 	"github.com/jmorganca/ollama/parser"
 )

+type RegistryOptions struct {
+	Insecure bool
+	Username string
+	Password string
+}
+
 type Model struct {
 	Name      string `json:"name"`
 	ModelPath string
@@ -45,7 +51,7 @@ func (m *Model) Prompt(request api.GenerateRequest) (string, error) {
 		Context []int
 	}

-	vars.First = len(vars.Context) == 0
+	vars.First = len(request.Context) == 0
 	vars.System = m.System
 	vars.Prompt = request.Prompt
 	vars.Context = request.Context
@@ -102,8 +108,8 @@ func GetManifest(mp ModelPath) (*ManifestV2, error) {
 		return nil, err
 	}

-	if _, err = os.Stat(fp); err != nil && !errors.Is(err, os.ErrNotExist) {
-		return nil, fmt.Errorf("couldn't find model '%s'", mp.GetShortTagname())
+	if _, err = os.Stat(fp); err != nil {
+		return nil, err
 	}

 	var manifest *ManifestV2
@@ -192,7 +198,6 @@ func CreateModel(name string, path string, fn func(status string)) error {
 	fn("parsing modelfile")
 	commands, err := parser.Parse(mf)
 	if err != nil {
-		fn(fmt.Sprintf("error: %v", err))
 		return err
 	}

@@ -227,14 +232,12 @@ func CreateModel(name string, path string, fn func(status string)) error {
 				fn("creating model layer")
 				file, err := os.Open(fp)
 				if err != nil {
-					fn(fmt.Sprintf("couldn't find model '%s'", c.Args))
 					return fmt.Errorf("failed to open file: %v", err)
 				}
 				defer file.Close()

 				l, err := CreateLayer(file)
 				if err != nil {
-					fn(fmt.Sprintf("couldn't create model layer: %v", err))
 					return fmt.Errorf("failed to create layer: %v", err)
 				}
 				l.MediaType = "application/vnd.ollama.image.model"
@@ -244,7 +247,6 @@ func CreateModel(name string, path string, fn func(status string)) error {
 				for _, l := range mf.Layers {
 					newLayer, err := GetLayerWithBufferFromLayer(l)
 					if err != nil {
-						fn(fmt.Sprintf("couldn't read layer: %v", err))
 						return err
 					}
 					layers = append(layers, newLayer)
@@ -304,7 +306,6 @@ func CreateModel(name string, path string, fn func(status string)) error {

 	err = SaveLayers(layers, fn, false)
 	if err != nil {
-		fn(fmt.Sprintf("error saving layers: %v", err))
 		return err
 	}

@@ -312,7 +313,6 @@ func CreateModel(name string, path string, fn func(status string)) error {
 	fn("writing manifest")
 	err = CreateManifest(name, cfg, manifestLayers)
 	if err != nil {
-		fn(fmt.Sprintf("error creating manifest: %v", err))
 		return err
 	}

@@ -493,7 +493,84 @@ func CreateLayer(f io.ReadSeeker) (*LayerReader, error) {
 	return layer, nil
 }

-func PushModel(name, username, password string, fn func(api.ProgressResponse)) error {
+func DeleteModel(name string) error {
+	mp := ParseModelPath(name)
+
+	manifest, err := GetManifest(mp)
+	if err != nil {
+		return err
+	}
+	deleteMap := make(map[string]bool)
+	for _, layer := range manifest.Layers {
+		deleteMap[layer.Digest] = true
+	}
+	deleteMap[manifest.Config.Digest] = true
+
+	fp, err := GetManifestPath()
+	if err != nil {
+		return err
+	}
+	err = filepath.Walk(fp, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if !info.IsDir() {
+			path := path[len(fp)+1:]
+			slashIndex := strings.LastIndex(path, "/")
+			if slashIndex == -1 {
+				return nil
+			}
+			tag := path[:slashIndex] + ":" + path[slashIndex+1:]
+			fmp := ParseModelPath(tag)
+
+			// skip the manifest we're trying to delete
+			if mp.GetFullTagname() == fmp.GetFullTagname() {
+				return nil
+			}
+
+			// save (i.e. delete from the deleteMap) any files used in other manifests
+			manifest, err := GetManifest(fmp)
+			if err != nil {
+				log.Printf("skipping file: %s", fp)
+				return nil
+			}
+			for _, layer := range manifest.Layers {
+				delete(deleteMap, layer.Digest)
+			}
+			delete(deleteMap, manifest.Config.Digest)
+		}
+		return nil
+	})
+
+	// only delete the files which are still in the deleteMap
+	for k, v := range deleteMap {
+		if v {
+			fp, err := GetBlobsPath(k)
+			if err != nil {
+				log.Printf("couldn't get file path for '%s': %v", k, err)
+				continue
+			}
+			if err := os.Remove(fp); err != nil {
+				log.Printf("couldn't remove file '%s': %v", fp, err)
+				continue
+			}
+		}
+	}
+
+	fp, err = mp.GetManifestPath(false)
+	if err != nil {
+		return err
+	}
+	err = os.Remove(fp)
+	if err != nil {
+		log.Printf("couldn't remove manifest file '%s': %v", fp, err)
+		return err
+	}
+
+	return nil
+}
+
+func PushModel(name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

 	fn(api.ProgressResponse{Status: "retrieving manifest"})
@@ -515,7 +592,7 @@ func PushModel(name, username, password string, fn func(api.ProgressResponse)) e
 	total += manifest.Config.Size

 	for _, layer := range layers {
-		exists, err := checkBlobExistence(mp, layer.Digest, username, password)
+		exists, err := checkBlobExistence(mp, layer.Digest, regOpts)
 		if err != nil {
 			return err
 		}
@@ -538,13 +615,13 @@ func PushModel(name, username, password string, fn func(api.ProgressResponse)) e
 			Completed: completed,
 		})

-		location, err := startUpload(mp, username, password)
+		location, err := startUpload(mp, regOpts)
 		if err != nil {
 			log.Printf("couldn't start upload: %v", err)
 			return err
 		}

-		err = uploadBlob(location, layer, username, password)
+		err = uploadBlob(location, layer, regOpts)
 		if err != nil {
 			log.Printf("error uploading blob: %v", err)
 			return err
@@ -563,7 +640,7 @@ func PushModel(name, username, password string, fn func(api.ProgressResponse)) e
 		Total:     total,
 		Completed: completed,
 	})
-	url := fmt.Sprintf("%s://%s/v2/%s/manifests/%s", mp.ProtocolScheme, mp.Registry, mp.GetNamespaceRepository(), mp.Tag)
+	url := fmt.Sprintf("%s/v2/%s/manifests/%s", mp.Registry, mp.GetNamespaceRepository(), mp.Tag)
 	headers := map[string]string{
 		"Content-Type": "application/vnd.docker.distribution.manifest.v2+json",
 	}
@@ -573,7 +650,7 @@ func PushModel(name, username, password string, fn func(api.ProgressResponse)) e
 		return err
 	}

-	resp, err := makeRequest("PUT", url, headers, bytes.NewReader(manifestJSON), username, password)
+	resp, err := makeRequest("PUT", url, headers, bytes.NewReader(manifestJSON), regOpts)
 	if err != nil {
 		return err
 	}
@@ -594,33 +671,31 @@ func PushModel(name, username, password string, fn func(api.ProgressResponse)) e
 	return nil
 }

-func PullModel(name, username, password string, fn func(api.ProgressResponse)) error {
+func PullModel(name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

 	fn(api.ProgressResponse{Status: "pulling manifest"})

-	manifest, err := pullModelManifest(mp, username, password)
+	manifest, err := pullModelManifest(mp, regOpts)
 	if err != nil {
 		return fmt.Errorf("pull model manifest: %q", err)
 	}

 	var layers []*Layer
-	var total int
-	var completed int
-	for _, layer := range manifest.Layers {
-		layers = append(layers, layer)
-		total += layer.Size
-	}
+	layers = append(layers, manifest.Layers...)
 	layers = append(layers, &manifest.Config)
-	total += manifest.Config.Size

 	for _, layer := range layers {
-		if err := downloadBlob(mp, layer.Digest, username, password, fn); err != nil {
-			fn(api.ProgressResponse{Status: fmt.Sprintf("error downloading: %v", err), Digest: layer.Digest})
+		if err := downloadBlob(mp, layer.Digest, regOpts, fn); err != nil {
 			return err
 		}
+	}

-		completed += layer.Size
+	fn(api.ProgressResponse{Status: "verifying sha256 digest"})
+	for _, layer := range layers {
+		if err := verifyBlob(layer.Digest); err != nil {
+			return err
+		}
 	}

 	fn(api.ProgressResponse{Status: "writing manifest"})
@@ -635,7 +710,7 @@ func PullModel(name, username, password string, fn func(api.ProgressResponse)) e
 		return err
 	}

-	err = os.WriteFile(fp, manifestJSON, 0644)
+	err = os.WriteFile(fp, manifestJSON, 0o644)
 	if err != nil {
 		log.Printf("couldn't write to %s", fp)
 		return err
@@ -646,13 +721,13 @@ func PullModel(name, username, password string, fn func(api.ProgressResponse)) e
 	return nil
 }

-func pullModelManifest(mp ModelPath, username, password string) (*ManifestV2, error) {
-	url := fmt.Sprintf("%s://%s/v2/%s/manifests/%s", mp.ProtocolScheme, mp.Registry, mp.GetNamespaceRepository(), mp.Tag)
+func pullModelManifest(mp ModelPath, regOpts *RegistryOptions) (*ManifestV2, error) {
+	url := fmt.Sprintf("%s/v2/%s/manifests/%s", mp.Registry, mp.GetNamespaceRepository(), mp.Tag)
 	headers := map[string]string{
 		"Accept": "application/vnd.docker.distribution.manifest.v2+json",
 	}

-	resp, err := makeRequest("GET", url, headers, nil, username, password)
+	resp, err := makeRequest("GET", url, headers, nil, regOpts)
 	if err != nil {
 		log.Printf("couldn't get manifest: %v", err)
 		return nil, err
@@ -713,10 +788,10 @@ func GetSHA256Digest(r io.Reader) (string, int) {
 	return fmt.Sprintf("sha256:%x", h.Sum(nil)), int(n)
 }

-func startUpload(mp ModelPath, username string, password string) (string, error) {
-	url := fmt.Sprintf("%s://%s/v2/%s/blobs/uploads/", mp.ProtocolScheme, mp.Registry, mp.GetNamespaceRepository())
+func startUpload(mp ModelPath, regOpts *RegistryOptions) (string, error) {
+	url := fmt.Sprintf("%s/v2/%s/blobs/uploads/", mp.Registry, mp.GetNamespaceRepository())

-	resp, err := makeRequest("POST", url, nil, nil, username, password)
+	resp, err := makeRequest("POST", url, nil, nil, regOpts)
 	if err != nil {
 		log.Printf("couldn't start upload: %v", err)
 		return "", err
@@ -739,10 +814,10 @@ func startUpload(mp ModelPath, username string, password string) (string, error)
 }

 // Function to check if a blob already exists in the Docker registry
-func checkBlobExistence(mp ModelPath, digest string, username string, password string) (bool, error) {
-	url := fmt.Sprintf("%s://%s/v2/%s/blobs/%s", mp.ProtocolScheme, mp.Registry, mp.GetNamespaceRepository(), digest)
+func checkBlobExistence(mp ModelPath, digest string, regOpts *RegistryOptions) (bool, error) {
+	url := fmt.Sprintf("%s/v2/%s/blobs/%s", mp.Registry, mp.GetNamespaceRepository(), digest)

-	resp, err := makeRequest("HEAD", url, nil, nil, username, password)
+	resp, err := makeRequest("HEAD", url, nil, nil, regOpts)
 	if err != nil {
 		log.Printf("couldn't check for blob: %v", err)
 		return false, err
@@ -753,7 +828,7 @@ func checkBlobExistence(mp ModelPath, digest string, username string, password s
 	return resp.StatusCode == http.StatusOK, nil
 }

-func uploadBlob(location string, layer *Layer, username string, password string) error {
+func uploadBlob(location string, layer *Layer, regOpts *RegistryOptions) error {
 	// Create URL
 	url := fmt.Sprintf("%s&digest=%s", location, layer.Digest)

@@ -776,7 +851,7 @@ func uploadBlob(location string, layer *Layer, username string, password string)
 		return err
 	}

-	resp, err := makeRequest("PUT", url, headers, f, username, password)
+	resp, err := makeRequest("PUT", url, headers, f, regOpts)
 	if err != nil {
 		log.Printf("couldn't upload blob: %v", err)
 		return err
@@ -792,7 +867,7 @@ func uploadBlob(location string, layer *Layer, username string, password string)
 	return nil
 }

-func downloadBlob(mp ModelPath, digest string, username, password string, fn func(api.ProgressResponse)) error {
+func downloadBlob(mp ModelPath, digest string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
 	fp, err := GetBlobsPath(digest)
 	if err != nil {
 		return err
@@ -821,12 +896,12 @@ func downloadBlob(mp ModelPath, digest string, username, password string, fn fun
 		size = fi.Size()
 	}

-	url := fmt.Sprintf("%s://%s/v2/%s/blobs/%s", mp.ProtocolScheme, mp.Registry, mp.GetNamespaceRepository(), digest)
+	url := fmt.Sprintf("%s/v2/%s/blobs/%s", mp.Registry, mp.GetNamespaceRepository(), digest)
 	headers := map[string]string{
 		"Range": fmt.Sprintf("bytes=%d-", size),
 	}

-	resp, err := makeRequest("GET", url, headers, nil, username, password)
+	resp, err := makeRequest("GET", url, headers, nil, regOpts)
 	if err != nil {
 		log.Printf("couldn't download blob: %v", err)
 		return err
@@ -862,6 +937,10 @@ func downloadBlob(mp ModelPath, digest string, username, password string, fn fun
 		})

 		if completed >= total {
+			if err := out.Close(); err != nil {
+				return err
+			}
+
 			if err := os.Rename(fp+"-partial", fp); err != nil {
 				fn(api.ProgressResponse{
 					Status:    fmt.Sprintf("error renaming file: %v", err),
@@ -886,7 +965,17 @@ func downloadBlob(mp ModelPath, digest string, username, password string, fn fun
 	return nil
 }

-func makeRequest(method, url string, headers map[string]string, body io.Reader, username, password string) (*http.Response, error) {
+func makeRequest(method, url string, headers map[string]string, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
+	if !strings.HasPrefix(url, "http") {
+		if regOpts.Insecure {
+			url = "http://" + url
+		} else {
+			url = "https://" + url
+		}
+	}
+
+	log.Printf("url = %s", url)
+
 	req, err := http.NewRequest(method, url, body)
 	if err != nil {
 		return nil, err
@@ -897,8 +986,8 @@ func makeRequest(method, url string, headers map[string]string, body io.Reader,
 	}

 	// TODO: better auth
-	if username != "" && password != "" {
-		req.SetBasicAuth(username, password)
+	if regOpts.Username != "" && regOpts.Password != "" {
+		req.SetBasicAuth(regOpts.Username, regOpts.Password)
 	}

 	client := &http.Client{
@@ -917,3 +1006,23 @@ func makeRequest(method, url string, headers map[string]string, body io.Reader,

 	return resp, nil
 }
+
+func verifyBlob(digest string) error {
+	fp, err := GetBlobsPath(digest)
+	if err != nil {
+		return err
+	}
+
+	f, err := os.Open(fp)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	fileDigest, _ := GetSHA256Digest(f)
+	if digest != fileDigest {
+		return fmt.Errorf("digest mismatch: want %s, got %s", digest, fileDigest)
+	}
+
+	return nil
+}
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"runtime"
 	"strings"
 )

@@ -44,7 +45,7 @@ func ParseModelPath(name string) ModelPath {
 		return ModelPath{}
 	}

-	colonParts := strings.Split(name, ":")
+	colonParts := strings.Split(slashParts[len(slashParts)-1], ":")
 	if len(colonParts) == 2 {
 		tag = colonParts[1]
 	} else {
@@ -69,10 +70,13 @@ func (mp ModelPath) GetFullTagname() string {
 }

 func (mp ModelPath) GetShortTagname() string {
-	if mp.Registry == DefaultRegistry && mp.Namespace == DefaultNamespace {
-		return fmt.Sprintf("%s:%s", mp.Repository, mp.Tag)
+	if mp.Registry == DefaultRegistry {
+		if mp.Namespace == DefaultNamespace {
+			return fmt.Sprintf("%s:%s", mp.Repository, mp.Tag)
+		}
+		return fmt.Sprintf("%s/%s:%s", mp.Namespace, mp.Repository, mp.Tag)
 	}
-	return fmt.Sprintf("%s/%s:%s", mp.Namespace, mp.Repository, mp.Tag)
+	return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
 }

 func (mp ModelPath) GetManifestPath(createDir bool) (string, error) {
@@ -106,6 +110,10 @@ func GetBlobsPath(digest string) (string, error) {
 		return "", err
 	}

+	if runtime.GOOS == "windows" {
+		digest = strings.ReplaceAll(digest, ":", "-")
+	}
+
 	path := filepath.Join(home, ".ollama", "models", "blobs", digest)
 	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
 		return "", err
--- a/server/routes.go
+++ b/server/routes.go
@@ -2,6 +2,8 @@ package server

 import (
 	"encoding/json"
+	"errors"
+	"fmt"
 	"io"
 	"log"
 	"net"
@@ -18,16 +20,7 @@ import (
 	"github.com/jmorganca/ollama/llama"
 )

-func cacheDir() string {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		panic(err)
-	}
-
-	return filepath.Join(home, ".ollama")
-}
-
-func generate(c *gin.Context) {
+func GenerateHandler(c *gin.Context) {
 	start := time.Now()

 	var req api.GenerateRequest
@@ -69,7 +62,7 @@ func generate(c *gin.Context) {
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
-		llm.Predict(req.Context, prompt, func(r api.GenerateResponse) {
+		fn := func(r api.GenerateResponse) {
 			r.Model = req.Model
 			r.CreatedAt = time.Now().UTC()
 			if r.Done {
@@ -77,13 +70,17 @@ func generate(c *gin.Context) {
 			}

 			ch <- r
-		})
+		}
+
+		if err := llm.Predict(req.Context, prompt, fn); err != nil {
+			ch <- gin.H{"error": err.Error()}
+		}
 	}()

 	streamResponse(c, ch)
 }

-func pull(c *gin.Context) {
+func PullModelHandler(c *gin.Context) {
 	var req api.PullRequest
 	if err := c.ShouldBindJSON(&req); err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
@@ -97,16 +94,21 @@ func pull(c *gin.Context) {
 			ch <- r
 		}

-		if err := PullModel(req.Name, req.Username, req.Password, fn); err != nil {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-			return
+		regOpts := &RegistryOptions{
+			Insecure: req.Insecure,
+			Username: req.Username,
+			Password: req.Password,
+		}
+
+		if err := PullModel(req.Name, regOpts, fn); err != nil {
+			ch <- gin.H{"error": err.Error()}
 		}
 	}()

 	streamResponse(c, ch)
 }

-func push(c *gin.Context) {
+func PushModelHandler(c *gin.Context) {
 	var req api.PushRequest
 	if err := c.ShouldBindJSON(&req); err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
@@ -120,16 +122,21 @@ func push(c *gin.Context) {
 			ch <- r
 		}

-		if err := PushModel(req.Name, req.Username, req.Password, fn); err != nil {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-			return
+		regOpts := &RegistryOptions{
+			Insecure: req.Insecure,
+			Username: req.Username,
+			Password: req.Password,
+		}
+
+		if err := PushModel(req.Name, regOpts, fn); err != nil {
+			ch <- gin.H{"error": err.Error()}
 		}
 	}()

 	streamResponse(c, ch)
 }

-func create(c *gin.Context) {
+func CreateModelHandler(c *gin.Context) {
 	var req api.CreateRequest
 	if err := c.ShouldBindJSON(&req); err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
@@ -146,15 +153,31 @@ func create(c *gin.Context) {
 		}

 		if err := CreateModel(req.Name, req.Path, fn); err != nil {
-			c.JSON(http.StatusBadRequest, gin.H{"message": err.Error()})
-			return
+			ch <- gin.H{"error": err.Error()}
 		}
 	}()

 	streamResponse(c, ch)
 }

-func list(c *gin.Context) {
+func DeleteModelHandler(c *gin.Context) {
+	var req api.DeleteRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	if err := DeleteModel(req.Name); err != nil {
+		if os.IsNotExist(err) {
+			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
+		} else {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		}
+		return
+	}
+}
+
+func ListModelsHandler(c *gin.Context) {
 	var models []api.ListResponseModel
 	fp, err := GetManifestPath()
 	if err != nil {
@@ -163,6 +186,10 @@ func list(c *gin.Context) {
 	}
 	err = filepath.Walk(fp, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
+			if errors.Is(err, os.ErrNotExist) {
+				log.Printf("manifest file does not exist: %s", fp)
+				return nil
+			}
 			return err
 		}
 		if !info.IsDir() {
@@ -207,11 +234,12 @@ func Serve(ln net.Listener) error {
 		c.String(http.StatusOK, "Ollama is running")
 	})

-	r.POST("/api/pull", pull)
-	r.POST("/api/generate", generate)
-	r.POST("/api/create", create)
-	r.POST("/api/push", push)
-	r.GET("/api/tags", list)
+	r.POST("/api/pull", PullModelHandler)
+	r.POST("/api/generate", GenerateHandler)
+	r.POST("/api/create", CreateModelHandler)
+	r.POST("/api/push", PushModelHandler)
+	r.GET("/api/tags", ListModelsHandler)
+	r.DELETE("/api/delete", DeleteModelHandler)

 	log.Printf("Listening on %s", ln.Addr())
 	s := &http.Server{
--- a/web/app/api/models/route.ts
+++ b/web/app/api/models/route.ts
@@ -1,6 +0,0 @@
-import models from '../../../../models.json'
-import { NextResponse } from 'next/server'
-
-export async function GET() {
-  return NextResponse.json(models)
-}
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -11,18 +11,23 @@ export default async function Home() {
        <Image src='/ollama.png' width={64} height={64} alt='ollamaIcon' />
        <section className='my-12 text-center'>
          <div className='flex flex-col space-y-2'>
-            <h2 className='md:max-w-[18rem] mx-auto my-2 text-3xl tracking-tight'>Portable large language models</h2>
+            <h2 className='md:max-w-md mx-auto my-2 text-3xl tracking-tight'>
+              Get up and running with large language models, locally.
+            </h2>
            <h3 className='md:max-w-xs mx-auto text-base text-neutral-500'>
-              Bundle a model’s weights, configuration, prompts, data and more into self-contained packages that run anywhere.
+              Run Llama 2 and other models on macOS. Customize and create your own.
            </h3>
          </div>
-          <div className='mx-auto flex flex-col space-y-4 mt-12'>
-            <Link href='/download' className='md:mx-10 lg:mx-14 bg-black text-white rounded-full px-4 py-2 focus:outline-none cursor-pointer'>
+          <div className='mx-auto max-w-xs flex flex-col space-y-4 mt-12'>
+            <Link
+              href='/download'
+              className='md:mx-10 lg:mx-14 bg-black text-white rounded-full px-4 py-2 focus:outline-none cursor-pointer'
+            >
              Download
            </Link>
            <p className='text-neutral-500 text-sm '>
-            Available for macOS with Apple Silicon <br />
-            Windows & Linux support coming soon.
+              Available for macOS with Apple Silicon <br />
+              Windows & Linux support coming soon.
            </p>
          </div>
        </section>
Author	SHA1	Message	Date
Patrick Devine	3168f51125	change error handler behavior and fix error when a model isn't found	2023-07-21 22:44:04 -07:00
Michael Yang	37324a0a00	Merge pull request #172 from jmorganca/set-vars-first fix vars.First	2023-07-21 20:55:06 -07:00
Michael Yang	20a5d99f77	fix vars.First	2023-07-21 20:45:32 -07:00
Patrick Devine	3b43cc019a	fix extended tag names (#171 )	2023-07-21 20:27:25 -07:00
Patrick Devine	b8421dce3d	get the proper path for blobs to delete (#168 )	2023-07-21 17:30:40 -07:00
Patrick Devine	9f6e97865c	allow pushing/pulling to insecure registries (#157 )	2023-07-21 15:42:19 -07:00
Bruce MacDonald	f5f0da06d9	Merge pull request #166 from jmorganca/brucemacd/dev-cgo	2023-07-21 22:48:10 +02:00
Bruce MacDonald	52f04e39f2	Note that CGO must be enabled in dev docs	2023-07-21 22:36:36 +02:00
Jeffrey Morgan	3c8f4c03d7	web: tweak homepage text	2023-07-21 09:57:57 -07:00
Bruce MacDonald	7ba1308595	Merge pull request #147 from jmorganca/brucemacd/cli-err-display Improve CLI error display	2023-07-21 16:10:19 +02:00
Jeffrey Morgan	91cd54016c	add basic REST api documentation	2023-07-21 00:47:17 -07:00
Patrick Devine	e7a393de54	add rm command for models (#151 )	2023-07-20 16:09:23 -07:00
Jeffrey Morgan	8454f298ac	fix example `Modelfile`s	2023-07-20 15:46:32 -07:00
Patrick Devine	a3badaf103	add ls alias (#152 )	2023-07-20 15:28:27 -07:00
Michael Yang	50e8e5bdbe	Merge pull request #148 from jmorganca/more-llama-files add llama.cpp mpi, opencl files	2023-07-20 14:26:46 -07:00
Michael Yang	8526e1f5f1	add llama.cpp mpi, opencl files	2023-07-20 14:19:55 -07:00
Michael Yang	0cfdbb95cc	Merge pull request #146 from jmorganca/fix-windows-pull windows: fix model pulling	2023-07-20 13:41:54 -07:00
Michael Yang	6cea2061ec	windows: fix model pulling	2023-07-20 12:35:04 -07:00
Michael Yang	2832801c2a	Merge pull request #91 from jmorganca/fix-stream-errors fix stream errors	2023-07-20 12:21:59 -07:00
Jeffrey Morgan	23a37dc466	clean up `README.md`	2023-07-20 12:21:36 -07:00
Michael Yang	992892866b	Merge pull request #145 from jmorganca/verify-digest verify blob digest	2023-07-20 12:14:21 -07:00
Michael Yang	dde880290c	Merge pull request #131 from jmorganca/update-llama-cpp update llama.cpp to e782c9e735f93ab4767ffc37462c523b73a17ddc	2023-07-20 12:14:10 -07:00
Michael Yang	1f27d7f1b8	fix stream errors	2023-07-20 12:12:08 -07:00
Bruce MacDonald	00aaa05901	remove unused code	2023-07-20 20:57:30 +02:00
Michael Yang	a83eaa7a9f	update llama.cpp to e782c9e735f93ab4767ffc37462c523b73a17ddc	2023-07-20 11:55:56 -07:00
Michael Yang	5156e48c2a	add script to update llama.cpp	2023-07-20 11:54:59 -07:00
Michael Yang	bf198c3918	verify blob digest	2023-07-20 11:53:57 -07:00
Bruce MacDonald	09dc6273e3	suppress error when running list before pulling image	2023-07-20 20:53:09 +02:00
Bruce MacDonald	ebaa33ac28	display gin api errors in cli	2023-07-20 20:45:12 +02:00
Bruce MacDonald	3ec4ebc562	remove unused code	2023-07-20 20:18:00 +02:00
Jeffrey Morgan	6a19724d5f	remove colon from library modelfiles	2023-07-20 09:51:30 -07:00
Jeffrey Morgan	924ce739f9	documentation on the model format	2023-07-20 09:03:41 -07:00
Michael Chiang	e1973e6780	Update icon (#139 )	2023-07-20 08:55:20 -07:00
Jeffrey Morgan	f1b08ef40e	set temperature on `README.md` example	2023-07-20 08:17:09 -07:00
Jeffrey Morgan	31f0cb7742	new `Modelfile` syntax	2023-07-20 07:52:24 -07:00
Jeffrey Morgan	e4b2ccfb23	web: clean up remaining `models.json` usage	2023-07-20 07:51:46 -07:00
Bruce MacDonald	a3d7bb0a30	Merge pull request #136 from jmorganca/brucemacd/remove-models Delete models.json	2023-07-20 16:40:46 +02:00
Bruce MacDonald	77e49f3822	Delete models.json	2023-07-20 16:32:50 +02:00
Jeffrey Morgan	8945b25484	new modelfile syntax on branch	2023-07-20 02:24:21 -07:00