Compare commits


1 Commit

Author: Bruce MacDonald
SHA1: 8f8aac9cd3
Date: 2024-12-05 16:20:43 -08:00

macapp: add error handling for symlink operations

Refactor the installer to handle errors more robustly when creating symlinks.
It checks that paths are valid before trying to use them, safely creates
directories if they don't exist, and clearly reports what went wrong if
something fails. It also adds TypeScript type annotations to catch mistakes
early and makes sure paths are handled correctly in all cases.
22 changed files with 117 additions and 597 deletions

View File

@@ -310,7 +310,8 @@ jobs:
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
- run: go test ./...
- run: go build
- run: go test -v ./...
patches:
needs: [changes]

View File

@@ -67,7 +67,7 @@ type GenerateRequest struct {
Raw bool `json:"raw,omitempty"`
// Format specifies the format to return a response in.
Format json.RawMessage `json:"format,omitempty"`
Format string `json:"format"`
// KeepAlive controls how long the model will stay loaded in memory following
// this request.
@@ -94,7 +94,7 @@ type ChatRequest struct {
Stream *bool `json:"stream,omitempty"`
// Format is the format to return the response in (e.g. "json").
Format json.RawMessage `json:"format,omitempty"`
Format string `json:"format"`
// KeepAlive controls how long the model will stay loaded into memory
// following the request.
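
With this change, Format on GenerateRequest and ChatRequest reverts from json.RawMessage back to a plain string, so callers pass "json" (or leave it empty) rather than an arbitrary schema. A minimal sketch of a caller, assuming the standard ollama Go client; ClientFromEnvironment, the streaming callback, and the model name are not part of this diff:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama3.2", // hypothetical model name
		Prompt: "List three colors as a JSON array.",
		Format: "json", // plain string after this change; previously a json.RawMessage schema
	}

	// Generate streams partial responses to the callback until the request completes.
	if err := client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	}); err != nil {
		log.Fatal(err)
	}
	fmt.Println()
}
```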

View File

@@ -8,7 +8,6 @@ import (
"crypto/ed25519"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"encoding/pem"
"errors"
"fmt"
@@ -1039,7 +1038,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
req := &api.ChatRequest{
Model: opts.Model,
Messages: opts.Messages,
Format: json.RawMessage(opts.Format),
Format: opts.Format,
Options: opts.Options,
}
@@ -1126,7 +1125,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
Prompt: opts.Prompt,
Context: generateContext,
Images: opts.Images,
Format: json.RawMessage(opts.Format),
Format: opts.Format,
System: opts.System,
Options: opts.Options,
KeepAlive: opts.KeepAlive,
@@ -1446,7 +1445,6 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_SCHED_SPREAD"],
envVars["OLLAMA_TMPDIR"],
envVars["OLLAMA_FLASH_ATTENTION"],
envVars["OLLAMA_KV_CACHE_TYPE"],
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],

View File

@@ -10,7 +10,6 @@ import (
"log/slog"
"os"
"slices"
"strings"
"golang.org/x/exp/maps"
)
@@ -61,25 +60,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
addedTokens[t.Content] = t
}
if len(tt.Model.Merges) == 0 {
// noop; merges is empty
} else if err := json.Unmarshal(tt.Model.Merges, &t.Merges); err == nil {
// noop; merges is []string
} else if merges, err := func() ([][]string, error) {
var merges [][]string
if err := json.Unmarshal(tt.Model.Merges, &merges); err != nil {
return nil, err
}
return merges, nil
}(); err == nil {
t.Merges = make([]string, len(merges))
for i := range merges {
t.Merges[i] = strings.Join(merges[i], " ")
}
} else {
return nil, fmt.Errorf("could not parse tokenizer merges. expected []string or [][]string: %w", err)
}
t.Merges = tt.Model.Merges
sha256sum := sha256.New()
for _, pt := range tt.PreTokenizer.PreTokenizers {
@@ -175,9 +156,9 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
type tokenizer struct {
AddedTokens []token `json:"added_tokens"`
Model struct {
Type string `json:"type"`
Vocab map[string]int `json:"vocab"`
Merges json.RawMessage `json:"merges"`
Type string `json:"type"`
Vocab map[string]int `json:"vocab"`
Merges []string `json:"merges"`
} `json:"model"`
PreTokenizer struct {

View File

@@ -191,62 +191,6 @@ func TestParseTokenizer(t *testing.T) {
Pre: "default",
},
},
{
name: "list string merges",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"model": {
"merges": [
"a b",
"c d",
"e f"
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
},
Merges: []string{
"a b",
"c d",
"e f",
},
Pre: "default",
},
},
{
name: "list list string merges",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"model": {
"merges": [
[
"a", "b"
],
[
"c", "d"
],
[
"e", "f"
]
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
},
Merges: []string{
"a b",
"c d",
"e f",
},
Pre: "default",
},
},
}
for _, tt := range cases {

View File

@@ -183,17 +183,3 @@ func (si SystemInfo) GetOptimalThreadCount() int {
return coreCount
}
// For each GPU, check if it does NOT support flash attention
func (l GpuInfoList) FlashAttentionSupported() bool {
for _, gpu := range l {
supportsFA := gpu.Library == "metal" ||
(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
gpu.Library == "rocm"
if !supportsFA {
return false
}
}
return true
}

View File

@@ -151,7 +151,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:
```nginx
```
server {
listen 80;
server_name example.com; # Replace with your domain or IP
@@ -285,28 +285,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
## How does Ollama load models on multiple GPUs?
When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
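
A simplified sketch of the placement rule described here, using illustrative sizes rather than Ollama's actual scheduler types:

```go
package main

import "fmt"

// pickGPUs illustrates the rule above: prefer a single GPU whose free VRAM can
// hold the entire model (best PCI-bus locality), otherwise spread the model
// across every available GPU.
func pickGPUs(modelSize uint64, freeVRAM []uint64) []int {
	for i, free := range freeVRAM {
		if modelSize <= free {
			return []int{i} // fits entirely on one GPU
		}
	}
	all := make([]int, len(freeVRAM))
	for i := range all {
		all[i] = i
	}
	return all
}

func main() {
	const GiB = uint64(1 << 30)
	fmt.Println(pickGPUs(13*GiB, []uint64{24 * GiB, 8 * GiB}))  // [0]
	fmt.Println(pickGPUs(40*GiB, []uint64{24 * GiB, 24 * GiB})) // [0 1]
}
```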
## How can I enable Flash Attention?
Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server.
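
For reference, a minimal sketch of how such a boolean flag can be read server-side; this mirrors the spirit of the envconfig.Bool("OLLAMA_FLASH_ATTENTION") helper that appears later in this diff, and the exact parsing rules here are an assumption:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// flashAttentionEnabled treats common truthy spellings as "on"; Ollama's real
// envconfig helper may accept a different set of values.
func flashAttentionEnabled() bool {
	switch strings.ToLower(strings.TrimSpace(os.Getenv("OLLAMA_FLASH_ATTENTION"))) {
	case "1", "t", "true", "yes", "on":
		return true
	default:
		return false
	}
}

func main() {
	fmt.Println("flash attention requested:", flashAttentionEnabled())
}
```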
## How can I set the quantization type for the K/V cache?
The K/V context cache can be quantized to significantly reduce memory usage when Flash Attention is enabled.
To use quantized K/V cache with Ollama you can set the following environment variable:
- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`.
> Note: Currently this is a global option - meaning all models will run with the specified quantization type.
The currently available K/V cache quantization types are:
- `f16` - high precision and memory usage (default).
- `q8_0` - 8-bit quantization, uses approximately 1/2 the memory of `f16` with a very small loss in precision, this usually has no noticeable impact on the model's quality (recommended if not using f16).
- `q4_0` - 4-bit quantization, uses approximately 1/4 the memory of `f16` with a small-medium loss in precision that may be more noticeable at higher context sizes.
How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
You may need to experiment with different quantization types to find the best balance between memory usage and quality.
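
A rough back-of-the-envelope sketch of how the cache type scales K/V memory, using hypothetical model dimensions; the per-element sizes follow the table above, and the formula mirrors the kvCacheBytesPerElement / GraphSize code removed later in this diff:

```go
package main

import "fmt"

// bytesPerElement follows the sizes described above: f16 = 2 bytes,
// q8_0 ≈ 1 byte, q4_0 ≈ 0.5 bytes per cached element.
func bytesPerElement(cacheType string) float64 {
	switch cacheType {
	case "q8_0":
		return 1
	case "q4_0":
		return 0.5
	default:
		return 2 // f16
	}
}

func main() {
	// Hypothetical model dimensions, for illustration only.
	const (
		contextLen = 8192 // tokens
		blockCount = 32   // transformer layers
		headDim    = 128  // per-head embedding size for K and for V
		headsKV    = 8    // KV heads (GQA)
	)
	elements := float64(contextLen * blockCount * (headDim + headDim) * headsKV)
	for _, t := range []string{"f16", "q8_0", "q4_0"} {
		fmt.Printf("%-5s ~%.2f GiB\n", t, elements*bytesPerElement(t)/(1<<30))
	}
}
```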
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.

View File

@@ -199,8 +199,6 @@ curl http://localhost:11434/v1/embeddings \
- [x] `seed`
- [x] `stop`
- [x] `stream`
- [x] `stream_options`
- [x] `include_usage`
- [x] `temperature`
- [x] `top_p`
- [x] `max_tokens`
@@ -229,8 +227,6 @@ curl http://localhost:11434/v1/embeddings \
- [x] `seed`
- [x] `stop`
- [x] `stream`
- [x] `stream_options`
- [x] `include_usage`
- [x] `temperature`
- [x] `top_p`
- [x] `max_tokens`

View File

@@ -153,8 +153,6 @@ var (
Debug = Bool("OLLAMA_DEBUG")
// FlashAttention enables the experimental flash attention feature.
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
// KvCacheType is the quantization type for the K/V cache.
KvCacheType = String("OLLAMA_KV_CACHE_TYPE")
// NoHistory disables readline history.
NoHistory = Bool("OLLAMA_NOHISTORY")
// NoPrune disables pruning of model blobs on startup.
@@ -236,7 +234,6 @@ func AsMap() map[string]EnvVar {
ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
"OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},

View File

@@ -85,12 +85,9 @@ COMPILER inline get_compiler() {
import "C"
import (
"bytes"
_ "embed"
"encoding/json"
"errors"
"fmt"
"log/slog"
"runtime"
"runtime/cgo"
"slices"
@@ -143,7 +140,7 @@ type ContextParams struct {
c C.struct_llama_context_params
}
func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool) ContextParams {
params := C.llama_context_default_params()
params.n_ctx = C.uint(numCtx)
params.n_batch = C.uint(batchSize)
@@ -152,28 +149,9 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
params.n_threads_batch = params.n_threads
params.embeddings = C.bool(true)
params.flash_attn = C.bool(flashAttention)
params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
return ContextParams{c: params}
}
// kvCacheTypeFromStr converts a string cache type to the corresponding GGML type value
func kvCacheTypeFromStr(s string) C.enum_ggml_type {
if s == "" {
return C.GGML_TYPE_F16
}
switch s {
case "q8_0":
return C.GGML_TYPE_Q8_0
case "q4_0":
return C.GGML_TYPE_Q4_0
default:
return C.GGML_TYPE_F16
}
}
type Context struct {
c *C.struct_llama_context
numThreads int
@@ -702,33 +680,3 @@ func (s *SamplingContext) Sample(llamaContext *Context, idx int) int {
func (s *SamplingContext) Accept(id int, applyGrammar bool) {
C.gpt_sampler_caccept(s.c, C.llama_token(id), C.bool(applyGrammar))
}
type JsonSchema struct {
Defs map[string]any `json:"$defs,omitempty"`
Properties map[string]any `json:"properties,omitempty"`
Required []string `json:"required,omitempty"`
Title string `json:"title,omitempty"`
Type string `json:"type,omitempty"`
}
func (js JsonSchema) AsGrammar() string {
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(js); err != nil {
return ""
}
cStr := C.CString(b.String())
defer C.free(unsafe.Pointer(cStr))
// Allocate buffer for grammar output with reasonable size
const maxLen = 32768 // 32KB
buf := make([]byte, maxLen)
// Call C function to convert schema to grammar
length := C.schema_to_grammar(cStr, (*C.char)(unsafe.Pointer(&buf[0])), C.size_t(maxLen))
if length == 0 {
slog.Warn("unable to convert schema to grammar")
}
return string(buf[:length])
}

View File

@@ -1,70 +1 @@
package llama
import (
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestJsonSchema(t *testing.T) {
testCases := []struct {
name string
schema JsonSchema
expected string
}{
{
name: "empty schema",
schema: JsonSchema{
Type: "object",
},
expected: `array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null`,
},
{
name: "invalid schema with circular reference",
schema: JsonSchema{
Type: "object",
Properties: map[string]any{
"self": map[string]any{
"$ref": "#", // Self reference
},
},
},
expected: "", // Should return empty string for invalid schema
},
{
name: "schema with invalid type",
schema: JsonSchema{
Type: "invalid_type", // Invalid type
Properties: map[string]any{
"foo": map[string]any{
"type": "string",
},
},
},
expected: "", // Should return empty string for invalid schema
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
result := tc.schema.AsGrammar()
if !strings.EqualFold(strings.TrimSpace(result), strings.TrimSpace(tc.expected)) {
if diff := cmp.Diff(tc.expected, result); diff != "" {
t.Fatalf("grammar mismatch (-want +got):\n%s", diff)
}
}
})
}
}

View File

@@ -850,7 +850,6 @@ func (s *Server) loadModel(
lpath multiLPath,
ppath string,
kvSize int,
kvCacheType string,
flashAttention bool,
threads int,
multiUserCache bool,
@@ -863,7 +862,7 @@ func (s *Server) loadModel(
panic(err)
}
ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType)
ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention)
s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
if err != nil {
panic(err)
@@ -904,7 +903,6 @@ func main() {
mainGpu := flag.Int("main-gpu", 0, "Main GPU")
flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := flag.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := flag.Int("port", 8080, "Port to expose the server on")
threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
@@ -972,7 +970,7 @@ func main() {
}
server.ready.Add(1)
go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)
go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
server.cond = sync.NewCond(&server.mu)

View File

@@ -1,13 +1,11 @@
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
#include "sampling.h"
#include "sampling_ext.h"
#include "json-schema-to-grammar.h"
struct gpt_sampler *gpt_sampler_cinit(
const struct llama_model *model, struct gpt_sampler_cparams *params)
{
try
{
try {
gpt_sampler_params sparams;
sparams.top_k = params->top_k;
sparams.top_p = params->top_p;
@@ -26,9 +24,7 @@ struct gpt_sampler *gpt_sampler_cinit(
sparams.seed = params->seed;
sparams.grammar = params->grammar;
return gpt_sampler_init(model, sparams);
}
catch (const std::exception &err)
{
} catch (const std::exception & err) {
return nullptr;
}
}
@@ -58,24 +54,3 @@ void gpt_sampler_caccept(
{
gpt_sampler_accept(sampler, id, apply_grammar);
}
int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len)
{
try
{
nlohmann::json schema = nlohmann::json::parse(json_schema);
std::string grammar_str = json_schema_to_grammar(schema);
size_t len = grammar_str.length();
if (len >= max_len)
{
len = max_len - 1;
}
strncpy(grammar, grammar_str.c_str(), len);
return len;
}
catch (const std::exception &e)
{
strncpy(grammar, "", max_len - 1);
return 0;
}
}

View File

@@ -47,8 +47,6 @@ extern "C"
llama_token id,
bool apply_grammar);
int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len);
#ifdef __cplusplus
}
#endif

View File

@@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
}, offset, nil
}
func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
@@ -372,8 +372,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
layers := llm.Tensors().Layers()
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
switch llm.KV().Architecture() {
case "llama":
@@ -528,34 +527,3 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
return
}
// SupportsKVCacheType checks if the requested cache type is supported
func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
return slices.Contains(validKVCacheTypes, cacheType)
}
// SupportsFlashAttention checks if the model supports flash attention
func (ggml GGML) SupportsFlashAttention() bool {
_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
if isEmbedding {
return false
}
// Check head counts match and are non-zero
headCountK := ggml.KV().EmbeddingHeadCountK()
headCountV := ggml.KV().EmbeddingHeadCountV()
return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}
// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
switch cacheType {
case "q8_0":
return 1 // 1/2 of fp16
case "q4_0":
return 0.5 // 1/4 of fp16
default:
return 2 // f16 (default)
}
}

View File

@@ -123,23 +123,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
slog.Warn("model missing blk.0 layer size")
}
fa := envconfig.FlashAttention() &&
discover.GetGPUInfo().FlashAttentionSupported() &&
ggml.SupportsFlashAttention()
var kvct string
if fa {
requested := strings.ToLower(envconfig.KvCacheType())
if requested != "" && ggml.SupportsKVCacheType(requested) {
kvct = requested
}
}
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
if graphPartialOffload == 0 {
graphPartialOffload = ggml.KV().GQA() * kv / 6
}
@@ -147,6 +131,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
graphFullOffload = graphPartialOffload
}
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
// on metal there's no partial offload overhead
if gpus[0].Library == "metal" {
graphPartialOffload = graphFullOffload

View File

@@ -15,7 +15,6 @@ import (
func TestEstimateGPULayers(t *testing.T) {
t.Setenv("OLLAMA_DEBUG", "1")
t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16
modelName := "dummy"
f, err := os.CreateTemp(t.TempDir(), modelName)

View File

@@ -214,36 +214,15 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
params = append(params, "--threads", strconv.Itoa(defaultThreads))
}
fa := envconfig.FlashAttention()
if fa && !gpus.FlashAttentionSupported() {
slog.Warn("flash attention enabled but not supported by gpu")
fa = false
}
flashAttnEnabled := envconfig.FlashAttention()
if fa && !ggml.SupportsFlashAttention() {
slog.Warn("flash attention enabled but not supported by model")
fa = false
}
kvct := strings.ToLower(envconfig.KvCacheType())
if fa {
slog.Info("enabling flash attention")
params = append(params, "--flash-attn")
// Flash Attention also supports kv cache quantization
// Enable if the requested and kv cache type is supported by the model
if kvct != "" && ggml.SupportsKVCacheType(kvct) {
params = append(params, "--kv-cache-type", kvct)
} else {
slog.Warn("kv cache type not supported by model", "type", kvct)
}
} else if kvct != "" && kvct != "f16" {
slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
}
// mmap has issues with partial offloading on metal
for _, g := range gpus {
// only cuda (compute capability 7+) and metal support flash attention
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
flashAttnEnabled = false
}
// mmap has issues with partial offloading on metal
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
@@ -252,6 +231,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
}
}
if flashAttnEnabled {
params = append(params, "--flash-attn")
}
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
@@ -634,22 +617,27 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\\x7F\x00-\x1F] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
@@ -679,7 +667,7 @@ type completion struct {
type CompletionRequest struct {
Prompt string
Format json.RawMessage
Format string
Images []ImageData
Options *api.Options
}
@@ -744,22 +732,10 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
return fmt.Errorf("unexpected server status: %s", status.ToString())
}
// TODO (parthsareen): Move conversion to grammar with sampling logic
// API should do error handling for invalid formats
if req.Format != nil {
if strings.ToLower(strings.TrimSpace(string(req.Format))) == `"json"` {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(req.Prompt), "json") {
slog.Warn("prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
} else if schema, err := func() (llama.JsonSchema, error) {
var schema llama.JsonSchema
err := json.Unmarshal(req.Format, &schema)
return schema, err
}(); err == nil {
request["grammar"] = schema.AsGrammar()
} else {
slog.Warn(`format is neither a schema or "json"`, "format", req.Format)
if req.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(req.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}

View File

@@ -1,21 +1,35 @@
import * as fs from 'fs'
import { exec as cbExec } from 'child_process'
import { spawn } from 'child_process'
import * as path from 'path'
import { promisify } from 'util'
const app = process && process.type === 'renderer' ? require('@electron/remote').app : require('electron').app
const ollama = app.isPackaged ? path.join(process.resourcesPath, 'ollama') : path.resolve(process.cwd(), '..', 'ollama')
const exec = promisify(cbExec)
const symlinkPath = '/usr/local/bin/ollama'
export function installed() {
export function installed(): boolean {
return fs.existsSync(symlinkPath) && fs.readlinkSync(symlinkPath) === ollama
}
export async function install() {
const command = `do shell script "mkdir -p ${path.dirname(
symlinkPath
)} && ln -F -s \\"${ollama}\\" \\"${symlinkPath}\\"" with administrator privileges`
await exec(`osascript -e '${command}'`)
function validPath(targetPath: string): boolean {
const normalized = path.normalize(targetPath)
return !(/[;&|`$(){}[\]<>]/.test(normalized) || normalized.includes('..'))
}
export async function install(): Promise<void> {
if (!validPath(ollama) || !validPath(symlinkPath)) {
throw new Error('Invalid path format')
}
await fs.promises.mkdir(path.dirname(symlinkPath), { recursive: true })
.catch(err => err.code === 'EEXIST' ? null : Promise.reject(err))
const process = spawn('osascript', [
'-e',
`do shell script "ln -F -s '${path.normalize(ollama)}' '${path.normalize(symlinkPath)}'" with administrator privileges`
])
await new Promise<void>((resolve, reject) => {
process.on('error', reject)
process.on('close', code => code === 0 ? resolve() : reject(new Error(`Failed with code ${code}`)))
})
}

View File

@@ -62,12 +62,7 @@ type Usage struct {
}
type ResponseFormat struct {
Type string `json:"type"`
JsonSchema *JsonSchema `json:"json_schema,omitempty"`
}
type JsonSchema struct {
Schema map[string]any `json:"schema"`
Type string `json:"type"`
}
type EmbedRequest struct {
@@ -75,15 +70,10 @@ type EmbedRequest struct {
Model string `json:"model"`
}
type StreamOptions struct {
IncludeUsage bool `json:"include_usage"`
}
type ChatCompletionRequest struct {
Model string `json:"model"`
Messages []Message `json:"messages"`
Stream bool `json:"stream"`
StreamOptions *StreamOptions `json:"stream_options"`
MaxTokens *int `json:"max_tokens"`
Seed *int `json:"seed"`
Stop any `json:"stop"`
@@ -112,23 +102,21 @@ type ChatCompletionChunk struct {
Model string `json:"model"`
SystemFingerprint string `json:"system_fingerprint"`
Choices []ChunkChoice `json:"choices"`
Usage *Usage `json:"usage,omitempty"`
}
// TODO (https://github.com/ollama/ollama/issues/5259): support []string, []int and [][]int
type CompletionRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
FrequencyPenalty float32 `json:"frequency_penalty"`
MaxTokens *int `json:"max_tokens"`
PresencePenalty float32 `json:"presence_penalty"`
Seed *int `json:"seed"`
Stop any `json:"stop"`
Stream bool `json:"stream"`
StreamOptions *StreamOptions `json:"stream_options"`
Temperature *float32 `json:"temperature"`
TopP float32 `json:"top_p"`
Suffix string `json:"suffix"`
Model string `json:"model"`
Prompt string `json:"prompt"`
FrequencyPenalty float32 `json:"frequency_penalty"`
MaxTokens *int `json:"max_tokens"`
PresencePenalty float32 `json:"presence_penalty"`
Seed *int `json:"seed"`
Stop any `json:"stop"`
Stream bool `json:"stream"`
Temperature *float32 `json:"temperature"`
TopP float32 `json:"top_p"`
Suffix string `json:"suffix"`
}
type Completion struct {
@@ -148,7 +136,6 @@ type CompletionChunk struct {
Choices []CompleteChunkChoice `json:"choices"`
Model string `json:"model"`
SystemFingerprint string `json:"system_fingerprint"`
Usage *Usage `json:"usage,omitempty"`
}
type ToolCall struct {
@@ -205,14 +192,6 @@ func NewError(code int, message string) ErrorResponse {
return ErrorResponse{Error{Type: etype, Message: message}}
}
func toUsage(r api.ChatResponse) Usage {
return Usage{
PromptTokens: r.PromptEvalCount,
CompletionTokens: r.EvalCount,
TotalTokens: r.PromptEvalCount + r.EvalCount,
}
}
func toolCallId() string {
const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
b := make([]byte, 8)
@@ -262,7 +241,11 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
return nil
}(r.DoneReason),
}},
Usage: toUsage(r),
Usage: Usage{
PromptTokens: r.PromptEvalCount,
CompletionTokens: r.EvalCount,
TotalTokens: r.PromptEvalCount + r.EvalCount,
},
}
}
@@ -287,14 +270,6 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
}
}
func toUsageGenerate(r api.GenerateResponse) Usage {
return Usage{
PromptTokens: r.PromptEvalCount,
CompletionTokens: r.EvalCount,
TotalTokens: r.PromptEvalCount + r.EvalCount,
}
}
func toCompletion(id string, r api.GenerateResponse) Completion {
return Completion{
Id: id,
@@ -312,7 +287,11 @@ func toCompletion(id string, r api.GenerateResponse) Completion {
return nil
}(r.DoneReason),
}},
Usage: toUsageGenerate(r),
Usage: Usage{
PromptTokens: r.PromptEvalCount,
CompletionTokens: r.EvalCount,
TotalTokens: r.PromptEvalCount + r.EvalCount,
},
}
}
@@ -503,21 +482,9 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
options["top_p"] = 1.0
}
var format json.RawMessage
if r.ResponseFormat != nil {
switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
// Support the old "json_object" type for OpenAI compatibility
case "json_object":
format = json.RawMessage(`"json"`)
case "json_schema":
if r.ResponseFormat.JsonSchema != nil {
schema, err := json.Marshal(r.ResponseFormat.JsonSchema.Schema)
if err != nil {
return nil, fmt.Errorf("failed to marshal json schema: %w", err)
}
format = schema
}
}
var format string
if r.ResponseFormat != nil && r.ResponseFormat.Type == "json_object" {
format = "json"
}
return &api.ChatRequest{
@@ -586,16 +553,14 @@ type BaseWriter struct {
}
type ChatWriter struct {
stream bool
streamOptions *StreamOptions
id string
stream bool
id string
BaseWriter
}
type CompleteWriter struct {
stream bool
streamOptions *StreamOptions
id string
stream bool
id string
BaseWriter
}
@@ -638,11 +603,7 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
// chat chunk
if w.stream {
c := toChunk(w.id, chatResponse)
if w.streamOptions != nil && w.streamOptions.IncludeUsage {
c.Usage = &Usage{}
}
d, err := json.Marshal(c)
d, err := json.Marshal(toChunk(w.id, chatResponse))
if err != nil {
return 0, err
}
@@ -654,17 +615,6 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
}
if chatResponse.Done {
if w.streamOptions != nil && w.streamOptions.IncludeUsage {
u := toUsage(chatResponse)
d, err := json.Marshal(ChatCompletionChunk{Choices: []ChunkChoice{}, Usage: &u})
if err != nil {
return 0, err
}
_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
if err != nil {
return 0, err
}
}
_, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
if err != nil {
return 0, err
@@ -702,11 +652,7 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
// completion chunk
if w.stream {
c := toCompleteChunk(w.id, generateResponse)
if w.streamOptions != nil && w.streamOptions.IncludeUsage {
c.Usage = &Usage{}
}
d, err := json.Marshal(c)
d, err := json.Marshal(toCompleteChunk(w.id, generateResponse))
if err != nil {
return 0, err
}
@@ -718,17 +664,6 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
}
if generateResponse.Done {
if w.streamOptions != nil && w.streamOptions.IncludeUsage {
u := toUsageGenerate(generateResponse)
d, err := json.Marshal(CompletionChunk{Choices: []CompleteChunkChoice{}, Usage: &u})
if err != nil {
return 0, err
}
_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
if err != nil {
return 0, err
}
}
_, err = w.ResponseWriter.Write([]byte("data: [DONE]\n\n"))
if err != nil {
return 0, err
@@ -891,10 +826,9 @@ func CompletionsMiddleware() gin.HandlerFunc {
c.Request.Body = io.NopCloser(&b)
w := &CompleteWriter{
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
stream: req.Stream,
id: fmt.Sprintf("cmpl-%d", rand.Intn(999)),
streamOptions: req.StreamOptions,
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
stream: req.Stream,
id: fmt.Sprintf("cmpl-%d", rand.Intn(999)),
}
c.Writer = w
@@ -974,10 +908,9 @@ func ChatMiddleware() gin.HandlerFunc {
c.Request.Body = io.NopCloser(&b)
w := &ChatWriter{
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
stream: req.Stream,
id: fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
streamOptions: req.StreamOptions,
BaseWriter: BaseWriter{ResponseWriter: c.Writer},
stream: req.Stream,
id: fmt.Sprintf("chatcmpl-%d", rand.Intn(999)),
}
c.Writer = w

View File

@@ -13,7 +13,6 @@ import (
"time"
"github.com/gin-gonic/gin"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
@@ -108,46 +107,7 @@ func TestChatMiddleware(t *testing.T) {
"presence_penalty": 5.0,
"top_p": 6.0,
},
Format: json.RawMessage(`"json"`),
Stream: &True,
},
},
{
name: "chat handler with streaming usage",
body: `{
"model": "test-model",
"messages": [
{"role": "user", "content": "Hello"}
],
"stream": true,
"stream_options": {"include_usage": true},
"max_tokens": 999,
"seed": 123,
"stop": ["\n", "stop"],
"temperature": 3.0,
"frequency_penalty": 4.0,
"presence_penalty": 5.0,
"top_p": 6.0,
"response_format": {"type": "json_object"}
}`,
req: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "Hello",
},
},
Options: map[string]any{
"num_predict": 999.0, // float because JSON doesn't distinguish between float and int
"seed": 123.0,
"stop": []any{"\n", "stop"},
"temperature": 3.0,
"frequency_penalty": 4.0,
"presence_penalty": 5.0,
"top_p": 6.0,
},
Format: json.RawMessage(`"json"`),
Format: "json",
Stream: &True,
},
},
@@ -356,13 +316,13 @@ func TestChatMiddleware(t *testing.T) {
if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
t.Fatal(err)
}
return
}
if diff := cmp.Diff(&tc.req, capturedRequest); diff != "" {
t.Fatalf("requests did not match: %+v", diff)
if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
t.Fatal("requests did not match")
}
if diff := cmp.Diff(tc.err, errResp); diff != "" {
t.Fatalf("errors did not match for %s:\n%s", tc.name, diff)
if !reflect.DeepEqual(tc.err, errResp) {
t.Fatal("errors did not match")
}
})
}
@@ -402,55 +362,6 @@ func TestCompletionsMiddleware(t *testing.T) {
Stream: &False,
},
},
{
name: "completions handler stream",
body: `{
"model": "test-model",
"prompt": "Hello",
"stream": true,
"temperature": 0.8,
"stop": ["\n", "stop"],
"suffix": "suffix"
}`,
req: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello",
Options: map[string]any{
"frequency_penalty": 0.0,
"presence_penalty": 0.0,
"temperature": 0.8,
"top_p": 1.0,
"stop": []any{"\n", "stop"},
},
Suffix: "suffix",
Stream: &True,
},
},
{
name: "completions handler stream with usage",
body: `{
"model": "test-model",
"prompt": "Hello",
"stream": true,
"stream_options": {"include_usage": true},
"temperature": 0.8,
"stop": ["\n", "stop"],
"suffix": "suffix"
}`,
req: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello",
Options: map[string]any{
"frequency_penalty": 0.0,
"presence_penalty": 0.0,
"temperature": 0.8,
"top_p": 1.0,
"stop": []any{"\n", "stop"},
},
Suffix: "suffix",
Stream: &True,
},
},
{
name: "completions handler error forwarding",
body: `{

View File

@@ -148,7 +148,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
if req.Format != "" && req.Format != "json" {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""})
return
} else if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
return
}