set limit to increment

tune concurrency manager
- higher initial concurrency - lower cooldown after ramping up - lower threshold for ramp up
2024-03-07 17:31:45 -08:00 · 2024-03-07 16:25:17 -08:00 · 2024-03-07 13:57:07 -08:00 · 2024-03-07 11:35:06 -08:00 · 2024-03-07 11:33:49 -08:00 · 2024-03-07 10:51:00 -08:00
63 changed files with 4586 additions and 1232 deletions
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -2,17 +2,22 @@ name: test

 on:
  pull_request:
+    paths:
+      - '**/*'
+      - '!docs/**'
+      - '!examples/**'
+      - '!README.md'

 jobs:
  generate:
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, macos-latest, windows-2019]
        arch: [amd64, arm64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
-          - os: windows-latest
+          - os: windows-2019
            arch: arm64
    runs-on: ${{ matrix.os }}
    env:
@@ -21,10 +26,21 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version: '1.21'
+          go-version: '1.22'
          cache: true
      - run: go get ./...
+      - run: |
+          $gopath=(get-command go).source | split-path -parent
+          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          cd $env:GITHUB_WORKSPACE
+          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          $env:PATH="$gopath;$env:PATH"
+          go generate -x ./...
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        name: "Windows Go Generate"
      - run: go generate -x ./...
+        if: ${{ ! startsWith(matrix.os, 'windows-') }}
+        name: "Unix Go Generate"
      - uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
@@ -46,7 +62,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v4
        with:
-          go-version: '1.21'
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
@@ -62,7 +78,6 @@ jobs:
    strategy:
      matrix:
        rocm-version:
-          - '5.7.1'
          - '6.0'
    runs-on: linux
    container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
@@ -76,7 +91,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v4
        with:
-          go-version: '1.21'
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
@@ -91,26 +106,26 @@ jobs:
  lint:
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, macos-latest, windows-2019]
        arch: [amd64, arm64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
-          - os: windows-latest
+          - os: windows-2019
            arch: arm64
          - os: macos-latest
            arch: amd64
    runs-on: ${{ matrix.os }}
    env:
      GOARCH: ${{ matrix.arch }}
-      CGO_ENABLED: "1"
+      CGO_ENABLED: '1'
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version: '1.21'
+          go-version: '1.22'
          cache: false
      - run: |
          mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
@@ -130,24 +145,24 @@ jobs:
    needs: generate
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, macos-latest, windows-2019]
        arch: [amd64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
-          - os: windows-latest
+          - os: windows-2019
            arch: arm64
    runs-on: ${{ matrix.os }}
    env:
      GOARCH: ${{ matrix.arch }}
-      CGO_ENABLED: "1"
+      CGO_ENABLED: '1'
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version: '1.21'
+          go-version: '1.22'
          cache: true
      - run: go get
      - uses: actions/download-artifact@v4
--- a/30
+++ b/30
@@ -1,6 +1,7 @@
-ARG GOLANG_VERSION=1.21.3
+ARG GOLANG_VERSION=1.22.1
 ARG CMAKE_VERSION=3.22.1
 ARG CUDA_VERSION=11.3.1
+ARG ROCM_VERSION=6.0

 # Copy the minimal context we need to run the generate scripts
 FROM scratch AS llm-code
@@ -28,7 +29,7 @@ WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
 RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

-FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -39,18 +40,14 @@ WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
 RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN mkdir /tmp/scratch && \
+    for dep in $(cat /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
+        cp ${dep} /tmp/scratch/ || exit 1 ; \
+    done && \
+    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
+    mkdir -p /go/src/github.com/jmorganca/ollama/dist/deps/ && \
+    (cd /tmp/scratch/ && tar czvf /go/src/github.com/jmorganca/ollama/dist/deps/rocm-amd64-deps.tgz . )

-FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV LIBRARY_PATH /opt/amdgpu/lib64
-COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG AMDGPU_TARGETS
-RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
 ARG CMAKE_VERSION
@@ -91,8 +88,8 @@ COPY . .
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN go build .
@@ -117,7 +114,7 @@ RUN apt-get update && apt-get install -y ca-certificates
 COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

 # Radeon images are much larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
 RUN update-pciids
 COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
 EXPOSE 11434
@@ -132,6 +129,7 @@ ENV OLLAMA_HOST 0.0.0.0
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_VISIBLE_DEVICES=all

 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
--- a/README.md
+++ b/README.md
@@ -62,6 +62,8 @@ Here are some example models that can be downloaded:
 | Orca Mini          | 3B         | 1.9GB | `ollama run orca-mini`         |
 | Vicuna             | 7B         | 3.8GB | `ollama run vicuna`            |
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
+| Gemma              | 2B         | 1.4GB | `ollama run gemma:2b`          |
+| Gemma              | 7B         | 4.8GB | `ollama run gemma:7b`          |

 > Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.

@@ -258,19 +260,26 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Web & Desktop

 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
+- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
 - [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
 - [Open WebUI](https://github.com/open-webui/open-webui)
 - [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-agi/blob/main/docs/config-ollama.md)
+- [big-AGI](https://github.com/enricoros/big-AGI/blob/main/docs/config-local-ollama.md)
 - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
 - [Amica](https://github.com/semperai/amica)
 - [chatd](https://github.com/BruceMacD/chatd)
 - [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
 - [MindMac](https://mindmac.app)
 - [NextJS Web Interface for Ollama](https://github.com/jakobhoeg/nextjs-ollama-llm-ui)
+- [Msty](https://msty.app)
+- [Chatbox](https://github.com/Bin-Huang/Chatbox)
+- [WinForm Ollama Copilot](https://github.com/tgraupmann/WinForm_Ollama_Copilot)
+- [NextChat](https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web) with [Get Started Doc](https://docs.nextchat.dev/models/ollama)
+- [Odin Runes](https://github.com/leonid20000/OdinRunes)
+- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)

 ### Terminal

@@ -301,6 +310,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
+- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LangChain4j](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-ollama)
 - [LiteLLM](https://github.com/BerriAI/litellm)
@@ -315,8 +325,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChainDart](https://github.com/davidmigloz/langchain_dart)
 - [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
 - [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
+- [Elixir LangChain](https://github.com/brainlid/langchain)
 - [Ollama for R - rollama](https://github.com/JBGruber/rollama)
 - [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
+- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)

 ### Mobile

@@ -330,6 +342,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Continue](https://github.com/continuedev/continue)
 - [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
 - [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
+- [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin)
 - [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
 - [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
 - [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
@@ -337,6 +350,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
+- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
+- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
+- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
--- a/api/client.go
+++ b/api/client.go
@@ -21,7 +21,7 @@ import (

 type Client struct {
 	base *url.URL
-	http http.Client
+	http *http.Client
 }

 func checkError(resp *http.Response, body []byte) error {
@@ -66,30 +66,13 @@ func ClientFromEnvironment() (*Client, error) {
 		}
 	}

-	client := Client{
+	return &Client{
 		base: &url.URL{
 			Scheme: scheme,
 			Host:   net.JoinHostPort(host, port),
 		},
-	}
-
-	mockRequest, err := http.NewRequest(http.MethodHead, client.base.String(), nil)
-	if err != nil {
-		return nil, err
-	}
-
-	proxyURL, err := http.ProxyFromEnvironment(mockRequest)
-	if err != nil {
-		return nil, err
-	}
-
-	client.http = http.Client{
-		Transport: &http.Transport{
-			Proxy: http.ProxyURL(proxyURL),
-		},
-	}
-
-	return &client, nil
+		http: http.DefaultClient,
+	}, nil
 }

 func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
--- a/api/types.go
+++ b/api/types.go
@@ -83,7 +83,7 @@ type Metrics struct {
 	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }

-// Options specfied in GenerateRequest, if you add a new option here add it to the API docs also
+// Options specified in GenerateRequest, if you add a new option here add it to the API docs also
 type Options struct {
 	Runner

@@ -121,7 +121,6 @@ type Runner struct {
 	VocabOnly          bool    `json:"vocab_only,omitempty"`
 	UseMMap            bool    `json:"use_mmap,omitempty"`
 	UseMLock           bool    `json:"use_mlock,omitempty"`
-	EmbeddingOnly      bool    `json:"embedding_only,omitempty"`
 	RopeFrequencyBase  float32 `json:"rope_frequency_base,omitempty"`
 	RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
 	NumThread          int     `json:"num_thread,omitempty"`
@@ -395,7 +394,6 @@ func DefaultOptions() Options {
 			UseMLock:           false,
 			UseMMap:            true,
 			UseNUMA:            false,
-			EmbeddingOnly:      true,
 		},
 	}
 }
--- a/app/lifecycle/updater.go
+++ b/app/lifecycle/updater.go
@@ -34,20 +34,6 @@ type UpdateResponse struct {
 	UpdateVersion string `json:"version"`
 }

-func getClient(req *http.Request) http.Client {
-	proxyURL, err := http.ProxyFromEnvironment(req)
-	if err != nil {
-		slog.Warn(fmt.Sprintf("failed to handle proxy: %s", err))
-		return http.Client{}
-	}
-
-	return http.Client{
-		Transport: &http.Transport{
-			Proxy: http.ProxyURL(proxyURL),
-		},
-	}
-}
-
 func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
 	var updateResp UpdateResponse

@@ -83,10 +69,9 @@ func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
 	}
 	req.Header.Set("Authorization", signature)
 	req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
-	client := getClient(req)

 	slog.Debug("checking for available update", "requestURL", requestURL)
-	resp, err := client.Do(req)
+	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		slog.Warn(fmt.Sprintf("failed to check for update: %s", err))
 		return false, updateResp
@@ -101,6 +86,11 @@ func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
 	if err != nil {
 		slog.Warn(fmt.Sprintf("failed to read body response: %s", err))
 	}
+
+	if resp.StatusCode != 200 {
+		slog.Info(fmt.Sprintf("check update error %d - %.96s", resp.StatusCode, string(body)))
+		return false, updateResp
+	}
 	err = json.Unmarshal(body, &updateResp)
 	if err != nil {
 		slog.Warn(fmt.Sprintf("malformed response checking for update: %s", err))
@@ -119,8 +109,8 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
 	if err != nil {
 		return err
 	}
-	client := getClient(req)
-	resp, err := client.Do(req)
+
+	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return fmt.Errorf("error checking update: %w", err)
 	}
@@ -151,7 +141,7 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
 	cleanupOldDownloads()

 	req.Method = http.MethodGet
-	resp, err = client.Do(req)
+	resp, err = http.DefaultClient.Do(req)
 	if err != nil {
 		return fmt.Errorf("error checking update: %w", err)
 	}
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -49,9 +49,6 @@ SetupLogging=yes
 CloseApplications=yes
 RestartApplications=no

-; Make sure they can at least download llama2 as a minimum
-ExtraDiskSpaceRequired=3826806784
-
 ; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
 WizardSmallImageFile=.\assets\setup.bmp

@@ -94,6 +91,14 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\windeps\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
+; Assumes v5.7, may need adjustments for v6
+#if GetEnv("HIP_PATH") != ""
+  Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
+  Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
+  ; amdhip64.dll dependency comes from the driver and must be installed already
+  Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
+#endif
+

 [Icons]
 Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
@@ -116,7 +121,8 @@ Filename: "{cmd}"; Parameters: "/c timeout 5"; Flags: runhidden
 Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Ollama"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"
-Type: filesandordirs; Name: "{%USERPROFILE}\.ollama"
+Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\models"
+Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\history"
 ; NOTE: if the user has a custom OLLAMA_MODELS it will be preserved

 [Messages]
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1,6 +1,7 @@
 package cmd

 import (
+	"archive/zip"
 	"bytes"
 	"context"
 	"crypto/ed25519"
@@ -87,22 +88,82 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 				path = filepath.Join(filepath.Dir(filename), path)
 			}

-			bin, err := os.Open(path)
+			fi, err := os.Stat(path)
 			if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
 				continue
 			} else if err != nil {
 				return err
 			}
-			defer bin.Close()

-			hash := sha256.New()
-			if _, err := io.Copy(hash, bin); err != nil {
-				return err
+			// TODO make this work w/ adapters
+			if fi.IsDir() {
+				tf, err := os.CreateTemp("", "ollama-tf")
+				if err != nil {
+					return err
+				}
+				defer os.RemoveAll(tf.Name())
+
+				zf := zip.NewWriter(tf)
+
+				files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors"))
+				if err != nil {
+					return err
+				}
+
+				if len(files) == 0 {
+					return fmt.Errorf("no safetensors files were found in '%s'", path)
+				}
+
+				// add the safetensor config file + tokenizer
+				files = append(files, filepath.Join(path, "config.json"))
+				files = append(files, filepath.Join(path, "added_tokens.json"))
+				files = append(files, filepath.Join(path, "tokenizer.model"))
+
+				for _, fn := range files {
+					f, err := os.Open(fn)
+					if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") {
+						continue
+					} else if err != nil {
+						return err
+					}
+
+					fi, err := f.Stat()
+					if err != nil {
+						return err
+					}
+
+					h, err := zip.FileInfoHeader(fi)
+					if err != nil {
+						return err
+					}
+
+					h.Name = filepath.Base(fn)
+					h.Method = zip.Store
+
+					w, err := zf.CreateHeader(h)
+					if err != nil {
+						return err
+					}
+
+					_, err = io.Copy(w, f)
+					if err != nil {
+						return err
+					}
+
+				}
+
+				if err := zf.Close(); err != nil {
+					return err
+				}
+
+				if err := tf.Close(); err != nil {
+					return err
+				}
+				path = tf.Name()
 			}
-			bin.Seek(0, io.SeekStart)

-			digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
-			if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
+			digest, err := createBlob(cmd, client, path)
+			if err != nil {
 				return err
 			}

@@ -141,6 +202,26 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	return nil
 }

+func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
+	bin, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer bin.Close()
+
+	hash := sha256.New()
+	if _, err := io.Copy(hash, bin); err != nil {
+		return "", err
+	}
+	bin.Seek(0, io.SeekStart)
+
+	digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
+	if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
+		return "", err
+	}
+	return digest, nil
+}
+
 func RunHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -718,39 +799,36 @@ func initializeKeypair() error {
 	_, err = os.Stat(privKeyPath)
 	if os.IsNotExist(err) {
 		fmt.Printf("Couldn't find '%s'. Generating new private key.\n", privKeyPath)
-		_, privKey, err := ed25519.GenerateKey(rand.Reader)
+		cryptoPublicKey, cryptoPrivateKey, err := ed25519.GenerateKey(rand.Reader)
 		if err != nil {
 			return err
 		}

-		privKeyBytes, err := format.OpenSSHPrivateKey(privKey, "")
+		privateKeyBytes, err := ssh.MarshalPrivateKey(cryptoPrivateKey, "")
 		if err != nil {
 			return err
 		}

-		err = os.MkdirAll(filepath.Dir(privKeyPath), 0o755)
-		if err != nil {
+		if err := os.MkdirAll(filepath.Dir(privKeyPath), 0o755); err != nil {
 			return fmt.Errorf("could not create directory %w", err)
 		}

-		err = os.WriteFile(privKeyPath, pem.EncodeToMemory(privKeyBytes), 0o600)
+		if err := os.WriteFile(privKeyPath, pem.EncodeToMemory(privateKeyBytes), 0o600); err != nil {
+			return err
+		}
+
+		sshPublicKey, err := ssh.NewPublicKey(cryptoPublicKey)
 		if err != nil {
 			return err
 		}

-		sshPrivateKey, err := ssh.NewSignerFromKey(privKey)
-		if err != nil {
+		publicKeyBytes := ssh.MarshalAuthorizedKey(sshPublicKey)
+
+		if err := os.WriteFile(pubKeyPath, publicKeyBytes, 0o644); err != nil {
 			return err
 		}

-		pubKeyData := ssh.MarshalAuthorizedKey(sshPrivateKey.PublicKey())
-
-		err = os.WriteFile(pubKeyPath, pubKeyData, 0o644)
-		if err != nil {
-			return err
-		}
-
-		fmt.Printf("Your new public key is: \n\n%s\n", string(pubKeyData))
+		fmt.Printf("Your new public key is: \n\n%s\n", publicKeyBytes)
 	}
 	return nil
 }
@@ -809,6 +887,14 @@ func versionHandler(cmd *cobra.Command, _ []string) {
 	}
 }

+func appendHostEnvDocs(cmd *cobra.Command) {
+	const hostEnvDocs = `
+Environment Variables:
+      OLLAMA_HOST        The host:port or base URL of the Ollama server (e.g. http://localhost:11434)
+`
+	cmd.SetUsageTemplate(cmd.UsageTemplate() + hostEnvDocs)
+}
+
 func NewCLI() *cobra.Command {
 	log.SetFlags(log.LstdFlags | log.Lshortfile)
 	cobra.EnableCommandSorting = false
@@ -874,7 +960,6 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
-
 	serveCmd := &cobra.Command{
 		Use:     "serve",
 		Aliases: []string{"start"},
@@ -882,6 +967,13 @@ func NewCLI() *cobra.Command {
 		Args:    cobra.ExactArgs(0),
 		RunE:    RunServer,
 	}
+	serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
+Environment Variables:
+
+    OLLAMA_HOST       The host:port to bind to (default "127.0.0.1:11434")
+    OLLAMA_ORIGINS    A comma separated list of allowed origins.
+    OLLAMA_MODELS     The path to the models directory (default is "~/.ollama/models")
+`)

 	pullCmd := &cobra.Command{
 		Use:     "pull MODEL",
@@ -910,7 +1002,6 @@ func NewCLI() *cobra.Command {
 		PreRunE: checkServerHeartbeat,
 		RunE:    ListHandler,
 	}
-
 	copyCmd := &cobra.Command{
 		Use:     "cp SOURCE TARGET",
 		Short:   "Copy a model",
@@ -927,6 +1018,19 @@ func NewCLI() *cobra.Command {
 		RunE:    DeleteHandler,
 	}

+	for _, cmd := range []*cobra.Command{
+		createCmd,
+		showCmd,
+		runCmd,
+		pullCmd,
+		pushCmd,
+		listCmd,
+		copyCmd,
+		deleteCmd,
+	} {
+		appendHostEnvDocs(cmd)
+	}
+
 	rootCmd.AddCommand(
 		serveCmd,
 		createCmd,
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -0,0 +1,331 @@
+package convert
+
+import (
+	"bytes"
+	"cmp"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"regexp"
+	"slices"
+
+	"github.com/mitchellh/mapstructure"
+	"google.golang.org/protobuf/proto"
+
+	"github.com/jmorganca/ollama/convert/sentencepiece"
+	"github.com/jmorganca/ollama/llm"
+)
+
+type Params struct {
+	Architectures    []string `json:"architectures"`
+	VocabSize        int      `json:"vocab_size"`
+	HiddenSize       int      `json:"hidden_size"`       // n_embd
+	HiddenLayers     int      `json:"num_hidden_layers"` // n_layer
+	ContextSize      int      `json:"max_position_embeddings"`
+	IntermediateSize int      `json:"intermediate_size"`
+	AttentionHeads   int      `json:"num_attention_heads"` // n_head
+	KeyValHeads      int      `json:"num_key_value_heads"`
+	NormEPS          float64  `json:"rms_norm_eps"`
+	RopeFreqBase     float64  `json:"rope_theta"`
+	BoSTokenID       int      `json:"bos_token_id"`
+	EoSTokenID       int      `json:"eos_token_id"`
+}
+
+type MetaData struct {
+	Type    string `mapstructure:"dtype"`
+	Shape   []int  `mapstructure:"shape"`
+	Offsets []int  `mapstructure:"data_offsets"`
+}
+
+func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
+	f, err := os.Open(fn)
+	if err != nil {
+		return []llm.Tensor{}, 0, err
+	}
+	defer f.Close()
+
+	var jsonSize uint64
+	binary.Read(f, binary.LittleEndian, &jsonSize)
+
+	buf := make([]byte, jsonSize)
+	_, err = io.ReadFull(f, buf)
+	if err != nil {
+		return []llm.Tensor{}, 0, err
+	}
+
+	d := json.NewDecoder(bytes.NewBuffer(buf))
+	d.UseNumber()
+	var parsed map[string]interface{}
+	if err = d.Decode(&parsed); err != nil {
+		return []llm.Tensor{}, 0, err
+	}
+
+	var keys []string
+	for k := range parsed {
+		keys = append(keys, k)
+	}
+
+	slices.Sort(keys)
+
+	slog.Info("converting layers")
+
+	var tensors []llm.Tensor
+	for _, k := range keys {
+		vals := parsed[k].(map[string]interface{})
+		var data MetaData
+		if err = mapstructure.Decode(vals, &data); err != nil {
+			return []llm.Tensor{}, 0, err
+		}
+
+		var size uint64
+		var kind uint32
+		switch len(data.Shape) {
+		case 0:
+			// metadata
+			continue
+		case 1:
+			// convert to float32
+			kind = 0
+			size = uint64(data.Shape[0] * 4)
+		case 2:
+			// convert to float16
+			kind = 1
+			size = uint64(data.Shape[0] * data.Shape[1] * 2)
+		}
+
+		ggufName, err := GetTensorName(k)
+		if err != nil {
+			slog.Error("%v", err)
+			return []llm.Tensor{}, 0, err
+		}
+
+		shape := [4]uint64{0, 0, 0, 0}
+		for cnt, s := range data.Shape {
+			shape[cnt] = uint64(s)
+		}
+
+		t := llm.Tensor{
+			Name:          ggufName,
+			Kind:          kind,
+			Offset:        offset,
+			Shape:         shape,
+			FileName:      fn,
+			OffsetPadding: 8 + jsonSize,
+			FileOffsets:   []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},
+		}
+		slog.Debug(fmt.Sprintf("%v", t))
+		tensors = append(tensors, t)
+		offset += size
+	}
+	return tensors, offset, nil
+}
+
+func GetSafeTensors(dirpath string) ([]llm.Tensor, error) {
+	var tensors []llm.Tensor
+	files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
+	if err != nil {
+		return []llm.Tensor{}, err
+	}
+
+	var offset uint64
+	for _, f := range files {
+		var t []llm.Tensor
+		var err error
+		t, offset, err = ReadSafeTensors(f, offset)
+		if err != nil {
+			slog.Error("%v", err)
+			return []llm.Tensor{}, err
+		}
+		tensors = append(tensors, t...)
+	}
+	return tensors, nil
+}
+
+func GetParams(dirpath string) (*Params, error) {
+	f, err := os.Open(filepath.Join(dirpath, "config.json"))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var params Params
+
+	d := json.NewDecoder(f)
+	err = d.Decode(&params)
+	if err != nil {
+		return nil, err
+	}
+
+	return &params, nil
+}
+
+// Details on gguf's tokenizer can be found at:
+// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#tokenizer
+type Vocab struct {
+	Tokens []string
+	Scores []float32
+	Types  []int32
+}
+
+func LoadTokens(dirpath string) (*Vocab, error) {
+	slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model")))
+	in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model"))
+	if err != nil {
+		return nil, err
+	}
+
+	// To regenerate sentencepiece from the protobufs use:
+	// protoc -I=./ --go_out=./ sentencepiece_model.proto
+	modelProto := &sentencepiece.ModelProto{}
+	if err := proto.Unmarshal(in, modelProto); err != nil {
+		return nil, err
+	}
+
+	v := &Vocab{
+		Tokens: make([]string, 0),
+		Scores: make([]float32, 0),
+		Types:  make([]int32, 0),
+	}
+
+	pieces := modelProto.GetPieces()
+	for _, p := range pieces {
+		v.Tokens = append(v.Tokens, p.GetPiece())
+		v.Scores = append(v.Scores, p.GetScore())
+		t := p.GetType()
+		v.Types = append(v.Types, int32(t))
+	}
+
+	slog.Info(fmt.Sprintf("vocab size: %d", len(v.Tokens)))
+
+	// add any additional tokens
+	addIn, err := os.ReadFile(filepath.Join(dirpath, "added_tokens.json"))
+	if os.IsNotExist(err) {
+		return v, nil
+	} else if err != nil {
+		return nil, err
+	}
+
+	slog.Info("reading user defined tokens")
+
+	var extraTokenData map[string]int
+	if err := json.Unmarshal(addIn, &extraTokenData); err != nil {
+		return nil, err
+	}
+
+	type token struct {
+		key string
+		pos int
+	}
+
+	extraTokens := make([]token, 0)
+	for k, id := range extraTokenData {
+		extraTokens = append(extraTokens, token{k, id})
+	}
+
+	slices.SortFunc(extraTokens, func(a, b token) int {
+		return cmp.Compare(a.pos, b.pos)
+	})
+
+	numToks := len(v.Tokens)
+
+	for cnt, t := range extraTokens {
+		// the token id should match the specific index for the total number of tokens
+		if t.pos != cnt+numToks {
+			return nil, fmt.Errorf("token ID '%d' for '%s' doesn't match total token size", t.pos, t.key)
+		}
+		v.Tokens = append(v.Tokens, t.key)
+		v.Scores = append(v.Scores, -1000.0)
+		v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
+	}
+	slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
+
+	return v, nil
+}
+
+func GetTensorName(n string) (string, error) {
+	tMap := map[string]string{
+		"model.embed_tokens.weight":                           "token_embd.weight",
+		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
+		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
+		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
+		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
+		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
+		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
+		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
+		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
+		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
+		"lm_head.weight":    "output.weight",
+		"model.norm.weight": "output_norm.weight",
+	}
+
+	v, ok := tMap[n]
+	if ok {
+		return v, nil
+	}
+
+	// quick hack to rename the layers to gguf format
+	for k, v := range tMap {
+		re := regexp.MustCompile(k)
+		newName := re.ReplaceAllString(n, v)
+		if newName != n {
+			return newName, nil
+		}
+	}
+
+	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
+}
+
+func WriteGGUF(name string, tensors []llm.Tensor, params *Params, vocab *Vocab) (string, error) {
+	c := llm.ContainerGGUF{
+		ByteOrder: binary.LittleEndian,
+	}
+
+	m := llm.NewGGUFModel(&c)
+	m.Tensors = tensors
+	m.KV["general.architecture"] = "llama"
+	m.KV["general.name"] = name
+	m.KV["llama.context_length"] = uint32(params.ContextSize)
+	m.KV["llama.embedding_length"] = uint32(params.HiddenSize)
+	m.KV["llama.block_count"] = uint32(params.HiddenLayers)
+	m.KV["llama.feed_forward_length"] = uint32(params.IntermediateSize)
+	m.KV["llama.rope.dimension_count"] = uint32(128)
+	m.KV["llama.attention.head_count"] = uint32(params.AttentionHeads)
+	m.KV["llama.attention.head_count_kv"] = uint32(params.KeyValHeads)
+	m.KV["llama.attention.layer_norm_rms_epsilon"] = float32(params.NormEPS)
+	m.KV["llama.rope.freq_base"] = float32(params.RopeFreqBase)
+	m.KV["general.file_type"] = uint32(1)
+	m.KV["tokenizer.ggml.model"] = "llama"
+
+	m.KV["tokenizer.ggml.tokens"] = vocab.Tokens
+	m.KV["tokenizer.ggml.scores"] = vocab.Scores
+	m.KV["tokenizer.ggml.token_type"] = vocab.Types
+
+	m.KV["tokenizer.ggml.bos_token_id"] = uint32(params.BoSTokenID)
+	m.KV["tokenizer.ggml.eos_token_id"] = uint32(params.EoSTokenID)
+	m.KV["tokenizer.ggml.unknown_token_id"] = uint32(0)
+	m.KV["tokenizer.ggml.add_bos_token"] = true
+	m.KV["tokenizer.ggml.add_eos_token"] = false
+
+	// llamacpp sets the chat template, however we don't need to set it since we pass it in through a layer
+	// m.KV["tokenizer.chat_template"] = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" // XXX removeme
+
+	c.V3.NumTensor = uint64(len(tensors))
+	c.V3.NumKV = uint64(len(m.KV))
+
+	f, err := os.CreateTemp("", "ollama-gguf")
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	err = m.Encode(f)
+	if err != nil {
+		return "", err
+	}
+
+	return f.Name(), nil
+}
--- a/convert/sentencepiece/sentencepiece_model.pb.go
+++ b/convert/sentencepiece/sentencepiece_model.pb.go
--- a/convert/sentencepiece_model.proto
+++ b/convert/sentencepiece_model.proto
@@ -0,0 +1,333 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.!
+
+syntax = "proto2";
+
+// TODO(taku): Needs to use LITE RUNTIME in OSS release.
+option optimize_for = LITE_RUNTIME;
+option go_package = "./sentencepiece";
+
+package sentencepiece;
+
+// TrainerSpec encodes a various parameters for SentencePiece training.
+// Next id: 55
+message TrainerSpec {
+  ///////////////////////////////////////////////////////////////////
+  // General parameters
+  //
+  // Input corpus files.
+  //  Trainer accepts the following two formats:
+  //  A) Monolingual: plain text, one sentence per line.
+  //  B) Bilingual:   TSV, source sentence <tab> target sentence
+  //  When bilingual data is passed, shared vocabulary model is built.
+  //  Note that the input file must be raw corpus, not a preprocessed corpus.
+  //  Trainer only loads the first `input_sentence_size` sentences specified
+  //  with this parameter.
+  repeated string input = 1;
+
+  // Input corpus format:
+  // "text": one-sentence-per-line text format (default)
+  // "tsv":  sentence <tab> freq
+  optional string input_format = 7;
+
+  // Output model file prefix.
+  // <model_prefix>.model and <model_prefix>.vocab are generated.
+  optional string model_prefix = 2;
+
+  // Model type. only have UNIGRAM now.
+  enum ModelType {
+    UNIGRAM = 1;  // Unigram language model with dynamic algorithm
+    BPE = 2;      // Byte Pair Encoding
+    WORD = 3;     // Delimitered by whitespace.
+    CHAR = 4;     // tokenizes into character sequence
+  }
+  optional ModelType model_type = 3 [default = UNIGRAM];
+
+  // Vocabulary size. 8k is the default size.
+  optional int32 vocab_size = 4 [default = 8000];
+
+  // List of the languages this model can accept.
+  // Since the model is language-agnostic, this field is used as a reference.
+  repeated string accept_language = 5;
+
+  // Size of self-test samples, which are encoded in the model file.
+  optional int32 self_test_sample_size = 6 [default = 0];
+
+  // Whether to use DP version of sentencepiece. Use it with TSV input format
+  // (requires precomputed word tab counts to work).
+  optional bool enable_differential_privacy = 50 [default = false];
+  // Set these parameters if you need DP version of sentencepiece.
+  // std of noise to add.
+  optional float differential_privacy_noise_level = 51 [default = 0.0];
+  // Clipping threshold to apply after adding noise. All the words with
+  // frequency less than this value are dropped.
+  optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
+
+  ///////////////////////////////////////////////////////////////////
+  // Training parameters.
+  //
+  // Uses characters which cover the corpus with the ratio of `chars_coverage`.
+  // This parameter determines the set of basic Alphabet of sentence piece.
+  // 1.0 - `chars_coverage` characters are treated as UNK.
+  // See also required_chars field.
+  optional float character_coverage = 10 [default = 0.9995];
+
+  // Maximum size of sentences the trainer loads from `input` parameter.
+  // Trainer simply loads the `input` files in sequence.
+  // It is better to shuffle the input corpus randomly.
+  optional uint64 input_sentence_size = 11 [default = 0];
+  optional bool shuffle_input_sentence = 19 [default = true];
+
+  // Maximum size of sentences to make seed sentence pieces.
+  // Extended suffix array is constructed to extract frequent
+  // sub-strings from the corpus. This uses 20N working space,
+  // where N is the size of corpus.
+  optional int32 mining_sentence_size = 12 [deprecated = true];
+
+  // Maximum size of sentences to train sentence pieces.
+  optional int32 training_sentence_size = 13 [deprecated = true];
+
+  // The size of seed sentencepieces.
+  // `seed_sentencepiece_size` must be larger than `vocab_size`.
+  optional int32 seed_sentencepiece_size = 14 [default = 1000000];
+
+  // In every EM sub-iterations, keeps top
+  // `shrinking_factor` * `current sentencepieces size` with respect to
+  // the loss of the sentence piece. This value should be smaller than 1.0.
+  optional float shrinking_factor = 15 [default = 0.75];
+
+  // The maximum sentence length in byte. The sentences with the length
+  // larger than `max_sentence_length` is simply ignored.
+  // Longer input tends to bring the following risks:
+  //  * Overflow during EM training (unigram language model only)
+  //  * Performance drop because of O(n log n) cost in BPE.
+  optional int32 max_sentence_length = 18 [default = 4192];
+
+  // Number of threads in the training.
+  optional int32 num_threads = 16 [default = 16];
+
+  // Number of EM sub iterations.
+  optional int32 num_sub_iterations = 17 [default = 2];
+
+  ///////////////////////////////////////////////////////////////////
+  // SentencePiece parameters which control the shapes of sentence piece.
+  //
+  // Maximum length of sentencepiece.
+  optional int32 max_sentencepiece_length = 20 [default = 16];
+
+  // Uses Unicode script to split sentence pieces.
+  // When `split_by_unicode_script` is true, we do not allow sentence piece to
+  // include multiple Unicode scripts, e.g. "F1" is not a valid piece.
+  // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
+  // as one script type, since Japanese word can consist of multiple scripts.
+  // This exception is always applied regardless of the accept-language
+  // parameter.
+  optional bool split_by_unicode_script = 21 [default = true];
+
+  // When `split_by_number` is true, put a boundary between number and
+  // non-number transition. If we want to treat "F1" is one token, set this flag
+  // to be false.
+  optional bool split_by_number = 23 [default = true];
+
+  // Use a white space to split sentence pieces.
+  // When `split_by_whitespace` is false, we may have the piece containing
+  // a white space in the middle. e.g., "in_the".
+  optional bool split_by_whitespace = 22 [default = true];
+
+  // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
+  // hello_. When `treat_whitespace_as_suffix` is true,
+  // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
+  // of sentence.
+  optional bool treat_whitespace_as_suffix = 24 [default = false];
+
+  // Allows pieces that only contain whitespaces instead of appearing only as
+  // prefix or suffix of other pieces.
+  optional bool allow_whitespace_only_pieces = 26 [default = false];
+
+  // Split all digits (0-9) into separate pieces.
+  optional bool split_digits = 25 [default = false];
+
+  // Defines the pre-tokenization delimiter.
+  // When specified, no pieces crossing this delimiter is not included
+  // in the vocab. Then the delimiter string is virtually ignored
+  // during the training. This field can allows constraints on the vocabulary
+  // selection. Note that this field is available on unigram mode.
+  optional string pretokenization_delimiter = 53 [ default = ""];
+
+  ///////////////////////////////////////////////////////////////////
+  // Vocabulary management
+  //
+  // Defines control symbols used as an indicator to
+  // change the behavior of the decoder. <s> and </s> are pre-defined.
+  // We can use this field to encode various meta information,
+  // including language indicator in multilingual model.
+  // These symbols are not visible to users, but visible to
+  // the decoder. Note that when the input sentence contains control symbols,
+  // they are not treated as one token, but segmented into normal pieces.
+  // Control symbols must be inserted independently from the segmentation.
+  repeated string control_symbols = 30;
+
+  // Defines user defined symbols.
+  // These symbols are added with extremely high score
+  // so they are always treated as one unique symbol in any context.
+  // Typical usage of user_defined_symbols is placeholder for named entities.
+  repeated string user_defined_symbols = 31;
+
+  // Defines required characters. Each UTF8 character in this string is included
+  // in the character set regardless of character_coverage value. Unlike
+  // user_defined_symbols, these characters have scores based on the frequency
+  // on input sentences, and the model can form subwords using characters
+  // in this field.
+  optional string required_chars = 36;
+
+  // Decomposes unknown pieces into UTF-8 bytes.
+  optional bool byte_fallback = 35 [default = false];
+
+  // When creating the vocabulary file, defines whether or not to additionally
+  // output the score for each piece.
+  optional bool vocabulary_output_piece_score = 32 [default = true];
+
+  // `vocab_size` is treated as hard limit. Crash if
+  // the model can not produce the vocab of size `vocab_size`,
+  // When `hard_vocab_limit` is false, vocab_size is treated
+  // as soft limit. Note that when model_type=char,
+  // always assumes hard_vocab_limit = false.
+  optional bool hard_vocab_limit = 33 [default = true];
+
+  // use all symbols for vocab extraction. This flag is valid
+  // if model type is either CHAR or WORD
+  optional bool use_all_vocab = 34 [default = false];
+
+  ///////////////////////////////////////////////////////////////////
+  // Reserved special meta tokens.
+  // * -1 is not used.
+  // * unk_id must not be -1.
+  // Id must starts with 0 and be contigous.
+  optional int32 unk_id = 40 [default = 0];   // <unk>
+  optional int32 bos_id = 41 [default = 1];   // <s>
+  optional int32 eos_id = 42 [default = 2];   // </s>
+  optional int32 pad_id = 43 [default = -1];  // <pad> (padding)
+  optional string unk_piece = 45 [default = "<unk>"];
+  optional string bos_piece = 46 [default = "<s>"];
+  optional string eos_piece = 47 [default = "</s>"];
+  optional string pad_piece = 48 [default = "<pad>"];
+
+  // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
+  // since this character can be useful both for user and
+  // developer. We can easily figure out that <unk> is emitted.
+  optional string unk_surface = 44 [default = " \xE2\x81\x87 "];
+
+  // Increase bit depth to allow unigram model training on large
+  // (>10M sentences) corpora. A Side-effect of enabling this flag
+  // is increased memory usage.
+  optional bool train_extremely_large_corpus = 49 [default = false];
+
+ // Path to a seed sentencepieces file, with one tab-separated
+  // seed sentencepiece <tab> frequency per line.
+  optional string seed_sentencepieces_file = 54 [default = ""];
+
+  // Customized extensions: the range of field numbers
+  // are open to third-party extensions.
+  extensions 200 to max;
+}
+
+// NormalizerSpec encodes a various parameters for string normalizaiton
+message NormalizerSpec {
+  // name of normalization rule.
+  optional string name = 1;
+
+  // Pre-compiled normalization rule created by
+  // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
+  // Usually this field is set by Builder::GetNormalizerSpec() method.
+  optional bytes precompiled_charsmap = 2;
+
+  // Adds dummy whitespace at the beginning of text in order to
+  // treat "world" in "world" and "hello world" in the same way.
+  optional bool add_dummy_prefix = 3 [default = true];
+
+  // Removes leading, trailing, and duplicate internal whitespace.
+  optional bool remove_extra_whitespaces = 4 [default = true];
+
+  // Replaces whitespace with meta symbol.
+  // This field must be true to train sentence piece model.
+  optional bool escape_whitespaces = 5 [default = true];
+
+  // Custom normalization rule file in TSV format.
+  // https://github.com/google/sentencepiece/blob/master/doc/normalization.md
+  // This field is only used in SentencePieceTrainer::Train() method, which
+  // compiles the rule into the binary rule stored in `precompiled_charsmap`.
+  optional string normalization_rule_tsv = 6;
+
+  // Customized extensions: the range of field numbers
+  // are open to third-party extensions.
+  extensions 200 to max;
+}
+
+// Proto to store samples for self-testing.
+message SelfTestData {
+  message Sample {
+    optional string input = 1;
+    optional string expected = 2;
+  }
+  repeated Sample samples = 1;
+
+  // Customized extensions: the range of field numbers
+  // are open to third-party extensions.
+  extensions 200 to max;
+}
+
+// ModelProto stores model parameters.
+// SentencePieceProcessor is supposed to be self-contained.
+// All settings/parameters which may change the behavior must be encoded
+// in ModelProto.
+message ModelProto {
+  message SentencePiece {
+    enum Type {
+      NORMAL = 1;        // normal symbol
+      UNKNOWN = 2;       // unknown symbol. only <unk> for now.
+      CONTROL = 3;       // control symbols. </s>, <s>, <2ja> etc.
+      USER_DEFINED = 4;  // user defined symbols.
+                         // Typical usage of USER_DEFINED symbol
+                         // is placeholder.
+      BYTE = 6;          // byte symbols. Used when `byte_fallback` is true.
+      UNUSED = 5;        // this piece is not used.
+    }
+    optional string piece = 1;  // piece must not be empty.
+    optional float score = 2;
+    optional Type type = 3 [default = NORMAL];
+
+    // Customized extensions: the range of field numbers
+    // are open to third-party extensions.
+    extensions 200 to max;
+  }
+
+  // Sentence pieces with scores.
+  repeated SentencePiece pieces = 1;
+
+  // Spec used to generate this model file.
+  optional TrainerSpec trainer_spec = 2;
+
+  // Spec for text normalization.
+  optional NormalizerSpec normalizer_spec = 3;
+
+  // Stores sample input and its expected segmentation to verify the model.
+  optional SelfTestData self_test_data = 4;
+
+  // Spec for text de-normalization.
+  optional NormalizerSpec denormalizer_spec = 5;
+
+  // Customized extensions: the range of field numbers
+  // are open to third-party extensions.
+  extensions 200 to max;
+}
--- a/docs/api.md
+++ b/docs/api.md
@@ -54,7 +54,7 @@ Advanced parameters (optional):

 #### JSON mode

-Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#generate-request-json-mode) below.
+Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.

 > Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.

@@ -256,9 +256,9 @@ For reproducible outputs, set `temperature` to 0 and `seed` to a number:
 ```shell
 curl http://localhost:11434/api/generate -d '{
  "model": "mistral",
-  "prompt": "[INST] why is the sky blue? [/INST]",
+  "prompt": "Why is the sky blue?",
  "options": {
-    "seed": 101,
+    "seed": 123,
    "temperature": 0
  }
 }'
@@ -321,7 +321,6 @@ curl http://localhost:11434/api/generate -d '{
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
-    "embedding_only": false,
    "rope_frequency_base": 1.1,
    "rope_frequency_scale": 0.8,
    "num_thread": 8
--- a/docs/development.md
+++ b/docs/development.md
@@ -3,7 +3,7 @@
 Install required tools:

 - cmake version 3.24 or higher
- go version 1.21 or higher
+- go version 1.22 or higher
 - gcc version 11.4.0 or higher

 ```bash
@@ -42,15 +42,15 @@ Now you can run `ollama`:

 #### Linux CUDA (NVIDIA)

-*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_

 Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-development and runtime packages. 
+development and runtime packages.

 Typically the build scripts will auto-detect CUDA, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler.  You can customize
+libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
 set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")

 Then generate dependencies:
@@ -67,15 +67,15 @@ go build .

 #### Linux ROCm (AMD)

-*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_

-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) developement packages first, as well as `cmake` and `golang`.
+Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.

 Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `ROCM_PATH` to the location of the ROCm
 install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
-CLBlast install (typically `/usr/lib/cmake/CLBlast`).  You can also customize
+CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
 the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)

 ```
@@ -88,17 +88,17 @@ Then build the binary:
 go build .
 ```

-ROCm requires elevated privileges to access the GPU at runtime.  On most distros you can add your user account to the `render` group, or run as root.
+ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

 #### Advanced CPU Settings

 By default, running `go generate ./...` will compile a few different variations
 of the LLM library based on common CPU families and vector math capabilities,
 including a lowest-common-denominator which should run on almost any 64 bit CPU
-somewhat slowly.  At runtime, Ollama will auto-detect the optimal variation to
-load.  If you would like to build a CPU-based build customized for your
+somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
+load. If you would like to build a CPU-based build customized for your
 processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
-like to use.  For example, to compile an optimized binary for an Intel i9-9880H,
+like to use. For example, to compile an optimized binary for an Intel i9-9880H,
 you might use:

 ```
@@ -108,8 +108,7 @@ go build .

 #### Containerized Linux Build

-If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.  The resulting binary is placed in `./dist`
-
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`

 ### Windows

@@ -117,8 +116,8 @@ Note: The windows build for Ollama is still under development.

 Install required tools:

- MSVC toolchain - C/C++ and cmake as minimal requirements
- go version 1.21 or higher
+- MSVC toolchain - C/C++ and cmake as minimal requirements - You must build from a "Developer Shell" with the environment variables set
+- go version 1.22 or higher
 - MinGW (pick one variant) with GCC.
  - <https://www.mingw-w64.org/>
  - <https://www.msys2.org/>
@@ -133,6 +132,6 @@ go build .

 #### Windows CUDA (NVIDIA)

-In addition to the common Windows development tools described above, install:
+In addition to the common Windows development tools described above, install CUDA **AFTER** you install MSVC.

 - [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -113,9 +113,9 @@ If a different directory needs to be used, set the environment variable `OLLAMA_

 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.

-## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
+## Does Ollama send my prompts and answers back to ollama.com?

-No, Ollama runs entirely locally, and conversation data will never leave your machine.
+No. Ollama runs locally, and conversation data does not leave your machine.

 ## How can I use Ollama in Visual Studio Code?

--- a/docs/import.md
+++ b/docs/import.md
@@ -124,7 +124,10 @@ ollama run example "What is your favourite condiment?"
 Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps:

 1. Create [an account](https://ollama.com/signup)
-2. Run `cat ~/.ollama/id_ed25519.pub` (or `type %USERPROFILE%\.ollama\id_ed25519.pub` on Windows) to view your Ollama public key. Copy this to the clipboard.
+2. Copy your Ollama public key:
+  - macOS: `cat ~/.ollama/id_ed25519.pub`
+  - Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
+  - Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
 3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)

 Next, copy your model to your username's namespace:
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -10,6 +10,14 @@ Install Ollama running this one-liner:
 curl -fsSL https://ollama.com/install.sh | sh
 ```

+## AMD Radeon GPU support
+
+While AMD has contributed the `amdgpu` driver upstream to the official linux
+kernel source, the version is older and may not support all ROCm features. We
+recommend you install the latest driver from
+https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+GPU.
+
 ## Manual install

 ### Download the `ollama` binary
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -67,6 +67,43 @@ You can see what features your CPU has with the following.
 cat /proc/cpuinfo| grep flags  | head -1
 ```

+## AMD Radeon GPU Support
+
+Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
+some cases you can force the system to try to use a close GPU type.  For example
+The Radeon RX 5400 is `gfx1034` (also known as 10.3.4) however, ROCm does not
+support this patch-level, the closest support is `gfx1030`.  You can use the
+environment variable `HSA_OVERRIDE_GFX_VERSION` with `x.y.z` syntax.  So for
+example, to force the system to run on the RX 5400, you would set
+`HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the server.
+
+At this time, the known supported GPU types are the following: (This may change from
+release to release)
+- gfx900
+- gfx906
+- gfx908
+- gfx90a
+- gfx940
+- gfx941
+- gfx942
+- gfx1030
+- gfx1100
+- gfx1101
+- gfx1102
+
+This will not work for all unsupported GPUs.  Reach out on [Discord](https://discord.gg/ollama)
+or file an [issue](https://github.com/ollama/ollama/issues) for additional help.
+
+
+## Installing older versions on Linux
+
+If you run into problems on Linux and want to install an older version you can tell the install script
+which version to install.
+
+```sh
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
+```
+
 ## Known issues

 * N/A
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@@ -42,12 +42,12 @@ text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
 all_splits = text_splitter.split_documents(data)
 ```

-It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install GPT4All chromadb`
+It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install chromadb`

 ```python
 from langchain.embeddings import OllamaEmbeddings
 from langchain.vectorstores import Chroma
-oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="llama2")
+oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")
 vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)
 ```

@@ -66,7 +66,7 @@ The next thing is to send the question and the relevant parts of the docs to the
 ```python
 from langchain.chains import RetrievalQA
 qachain=RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever())
-qachain({"query": question})
+qachain.invoke({"query": question})
 ```

 The answer received from this chain was:
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -4,7 +4,7 @@ Welcome to the Ollama Windows preview.

 No more WSL required!

-Ollama now runs as a native Windows application, including NVIDIA GPU support.
+Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
 After installing Ollama Windows Preview, Ollama will run in the background and
 the `ollama` command line is available in `cmd`, `powershell` or your favorite
 terminal application. As usual the Ollama [api](./api.md) will be served on
@@ -21,6 +21,7 @@ Logs will often be helpful in dianosing the problem (see

 * Windows 10 or newer, Home or Pro
 * NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
+* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card

 ## API Access

--- a/examples/modelfile-tweetwriter/readme.md
+++ b/examples/modelfile-tweetwriter/readme.md
@@ -1,23 +0,0 @@
-# Example Modelfile - Tweetwriter
-
-This simple examples shows what you can do without any code, simply relying on a Modelfile. The file has two instructions:
-
-1. FROM - The From instructions defines the parent model to use for this one. If you choose a model from the library, you can enter just the model name. For all other models, you need to specify the namespace as well. You could also use a local file. Just include the relative path to the converted, quantized model weights file. To learn more about creating that file, see the `import.md` file in the docs folder of this repository.
-2. SYSTEM - This defines the system prompt for the model and overrides the system prompt from the parent model.
-
-## Running the Example
-
-1. Create the model:
-
-   ```bash
-   ollama create tweetwriter
-   ```
-
-2. Enter a topic to generate a tweet about.
-3. Show the Modelfile in the REPL.
-
-   ```bash
-   /show modelfile
-   ```
-
-   Notice that the FROM and SYSTEM match what was in the file. But there is also a TEMPLATE and PARAMETER. These are inherited from the parent model.
--- a/format/openssh.go
+++ b/format/openssh.go
@@ -1,102 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Code originally from https://go-review.googlesource.com/c/crypto/+/218620
-
-// TODO: replace with upstream once the above change is merged and released.
-
-package format
-
-import (
-	"crypto"
-	"crypto/ed25519"
-	"crypto/rand"
-	"encoding/binary"
-	"encoding/pem"
-	"fmt"
-
-	"golang.org/x/crypto/ssh"
-)
-
-const privateKeyAuthMagic = "openssh-key-v1\x00"
-
-type openSSHEncryptedPrivateKey struct {
-	CipherName string
-	KDFName    string
-	KDFOptions string
-	KeysCount  uint32
-	PubKey     []byte
-	KeyBlocks  []byte
-}
-
-type openSSHPrivateKey struct {
-	Check1  uint32
-	Check2  uint32
-	Keytype string
-	Rest    []byte `ssh:"rest"`
-}
-
-type openSSHEd25519PrivateKey struct {
-	Pub     []byte
-	Priv    []byte
-	Comment string
-	Pad     []byte `ssh:"rest"`
-}
-
-func OpenSSHPrivateKey(key crypto.PrivateKey, comment string) (*pem.Block, error) {
-	var check uint32
-	if err := binary.Read(rand.Reader, binary.BigEndian, &check); err != nil {
-		return nil, err
-	}
-
-	var pk1 openSSHPrivateKey
-	pk1.Check1 = check
-	pk1.Check2 = check
-
-	var w openSSHEncryptedPrivateKey
-	w.KeysCount = 1
-
-	if k, ok := key.(*ed25519.PrivateKey); ok {
-		key = *k
-	}
-
-	switch k := key.(type) {
-	case ed25519.PrivateKey:
-		pub, priv := k[32:], k
-		key := openSSHEd25519PrivateKey{
-			Pub:     pub,
-			Priv:    priv,
-			Comment: comment,
-		}
-
-		pk1.Keytype = ssh.KeyAlgoED25519
-		pk1.Rest = ssh.Marshal(key)
-
-		w.PubKey = ssh.Marshal(struct {
-			KeyType string
-			Pub     []byte
-		}{
-			ssh.KeyAlgoED25519, pub,
-		})
-	default:
-		return nil, fmt.Errorf("ssh: unknown key type %T", k)
-	}
-
-	w.KeyBlocks = openSSHPadding(ssh.Marshal(pk1), 8)
-
-	w.CipherName, w.KDFName, w.KDFOptions = "none", "none", ""
-
-	return &pem.Block{
-		Type:  "OPENSSH PRIVATE KEY",
-		Bytes: append([]byte(privateKeyAuthMagic), ssh.Marshal(w)...),
-	}, nil
-}
-
-func openSSHPadding(block []byte, blocksize int) []byte {
-	for i, j := 0, len(block); (j+i)%blocksize != 0; i++ {
-		block = append(block, byte(i+1))
-	}
-
-	return block
-}
--- a/go.mod
+++ b/go.mod
@@ -1,37 +1,45 @@
 module github.com/jmorganca/ollama

-go 1.21
+go 1.22
+
+toolchain go1.22.0

 require (
+	github.com/containerd/console v1.0.3
+	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/emirpasic/gods v1.18.1
 	github.com/gin-gonic/gin v1.9.1
+	github.com/golang/protobuf v1.5.0
+	github.com/google/uuid v1.0.0
+	github.com/mitchellh/mapstructure v1.5.0
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.8.4
+	github.com/x448/float16 v0.8.4
 	golang.org/x/sync v0.3.0
 )

+require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
+
 require (
-	github.com/containerd/console v1.0.3 // indirect
-	github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa // indirect
+	github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
+	github.com/chewxy/hm v1.0.0 // indirect
+	github.com/chewxy/math32 v1.0.8 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
-	github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 // indirect
-	github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 // indirect
-	github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 // indirect
-	github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 // indirect
-	github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 // indirect
-	github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f // indirect
-	github.com/getlantern/systray v1.2.2 // indirect
-	github.com/go-stack/stack v1.8.0 // indirect
-	github.com/google/uuid v1.0.0 // indirect
+	github.com/gogo/protobuf v1.3.2 // indirect
+	github.com/google/flatbuffers v1.12.0 // indirect
 	github.com/mattn/go-runewidth v0.0.14 // indirect
-	github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect
-	github.com/pborman/uuid v1.2.1 // indirect
+	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
+	github.com/xtgo/set v1.0.0 // indirect
+	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
+	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
+	gonum.org/v1/gonum v0.8.2 // indirect
+	gorgonia.org/vecf32 v0.9.0 // indirect
+	gorgonia.org/vecf64 v0.9.0 // indirect
 )

-
 require (
 	github.com/bytedance/sonic v1.9.1 // indirect
 	github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
@@ -50,7 +58,6 @@ require (
 	github.com/mattn/go-isatty v0.0.19 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
-	github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
 	github.com/pelletier/go-toml/v2 v2.0.8 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
@@ -62,6 +69,6 @@ require (
 	golang.org/x/sys v0.13.0
 	golang.org/x/term v0.13.0
 	golang.org/x/text v0.13.0 // indirect
-	google.golang.org/protobuf v1.30.0 // indirect
+	google.golang.org/protobuf v1.30.0
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -1,36 +1,40 @@
+cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
+github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc h1:zvQ6w7KwtQWgMQiewOF9tFtundRMVZFSAksNV6ogzuY=
+github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ=
 github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
 github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
 github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
+github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
 github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
+github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
+github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
+github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
+github.com/chewxy/math32 v1.0.8 h1:fU5E4Ec4Z+5RtRAi3TovSxUjQPkgRh+HbP7tKB2OFbM=
+github.com/chewxy/math32 v1.0.8/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
+github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
+github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
 github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
 github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
-github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa h1:Wg+722vs7a2zQH5lR9QWYsVbplKeffaQFIs5FTdfNNo=
-github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa/go.mod h1:6Arca19mRx58CA7OWEd7Wu1NpC1rd3uDnNs6s1pj/DI=
 github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
+github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
+github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
 github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
+github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
+github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
+github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
+github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
+github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
 github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
 github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
-github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 h1:NRUJuo3v3WGC/g5YiyF790gut6oQr5f3FBI88Wv0dx4=
-github.com/getlantern/context v0.0.0-20190109183933-c447772a6520/go.mod h1:L+mq6/vvYHKjCX2oez0CgEAJmbq1fbb/oNJIWQkBybY=
-github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 h1:6uJ+sZ/e03gkbqZ0kUG6mfKoqDb4XMAzMIwlajq19So=
-github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7/go.mod h1:l+xpFBrCtDLpK9qNjxs+cHU6+BAdlBaxHqikB6Lku3A=
-github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 h1:guBYzEaLz0Vfc/jv0czrr2z7qyzTOGC9hiQ0VC+hKjk=
-github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7/go.mod h1:zx/1xUUeYPy3Pcmet8OSXLbF47l+3y6hIPpyLWoR9oc=
-github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 h1:micT5vkcr9tOVk1FiH8SWKID8ultN44Z+yzd2y/Vyb0=
-github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7/go.mod h1:dD3CgOrwlzca8ed61CsZouQS5h5jIzkK9ZWrTcf0s+o=
-github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 h1:XYzSdCbkzOC0FDNrgJqGRo8PCMFOBFL9py72DRs7bmc=
-github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55/go.mod h1:6mmzY2kW1TOOrVy+r41Za2MxXM+hhqTtY3oBKd2AgFA=
-github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f h1:wrYrQttPS8FHIRSlsrcuKazukx/xqO/PpLZzZXsF+EA=
-github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f/go.mod h1:D5ao98qkA6pxftxoqzibIBBrLSUli+kYnJqrgBf9cIA=
-github.com/getlantern/systray v1.2.2 h1:dCEHtfmvkJG7HZ8lS/sLklTH4RKUcIsKrAD9sThoEBE=
-github.com/getlantern/systray v1.2.2/go.mod h1:pXFOI1wwqwYXEhLPm9ZGjS2u/vVELeIgNMY5HvhHhcE=
 github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g=
 github.com/gin-contrib/cors v1.4.0/go.mod h1:bs9pNM0x/UsmHPBWT2xZz9ROh8xYjYkiURUfmBoMlcs=
 github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
@@ -40,6 +44,7 @@ github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
 github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
 github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
 github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
+github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
 github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs=
 github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
 github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
@@ -49,12 +54,34 @@ github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91
 github.com/go-playground/validator/v10 v10.10.0/go.mod h1:74x4gJWsvQexRdW8Pn3dXSGrTK4nAUsbPlLADvpJkos=
 github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
 github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
-github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk=
-github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
 github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
 github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
 github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
+github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
+github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
+github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
+github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
+github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
+github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
+github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
+github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
+github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
+github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
+github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
+github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
+github.com/golang/protobuf v1.5.0 h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4=
 github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
+github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
+github.com/google/flatbuffers v1.12.0 h1:/PtAHvnBY4Kqnx/xCQ3OIV9uYcSFGScBsWI3Oogeh6w=
+github.com/google/flatbuffers v1.12.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
+github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
+github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
 github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
@@ -65,6 +92,9 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
+github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
+github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
 github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk=
 github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
@@ -79,14 +109,14 @@ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY=
 github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q=
 github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4=
-github.com/lxn/walk v0.0.0-20210112085537-c389da54e794/go.mod h1:E23UucZGqpuUANJooIbHWCufXvOcT6E7Stq81gU+CSQ=
-github.com/lxn/win v0.0.0-20210218163916-a377121e959e/go.mod h1:KxxjdtRkfNoYDCUP5ryK7XJJNTnpC8atvtmTheChOtk=
 github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
 github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
 github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
 github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
 github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
+github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -94,25 +124,23 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
 github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
-github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c h1:rp5dCmg/yLR3mgFuSOe4oEnDDmGLROTvMragMUXpTQw=
-github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwUbLaax7L0S3Tw4hpejzu63ZrrQiUe6W0hcy0=
-github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
-github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
-github.com/pborman/uuid v1.2.1 h1:+ZZIw58t/ozdjRaXh/3awHfmWRbzYxJoAdNJxe/3pvw=
-github.com/pborman/uuid v1.2.1/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
+github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
+github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9/go.mod h1:nR7l3gM6ubiOm+mCkmmUyIBUcBAyiUmW6dQrDZhugFE=
 github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
 github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
 github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
 github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
 github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
 github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
 github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8=
 github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog=
 github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
 github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
 github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
@@ -120,6 +148,8 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
+github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
@@ -136,44 +166,127 @@ github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6
 github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
 github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
 github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
+github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
+github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
+github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
+github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
+github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 h1:lGdhQUN/cnWdSH3291CUuxSEqc+AsGTiDxPP3r2J0l4=
+go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E=
 golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
 golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
 golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
+golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
+golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
+golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
+golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
+golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
+golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
 golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
 golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
+golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
+golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
 golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
-golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
 golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek=
 golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
 golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
+golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
+golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
+golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
+gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM=
+gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0=
+gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc=
+gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
+gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
+google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
+google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
+google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
+google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
+google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
+google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f h1:Yv4xsIx7HZOoyUGSJ2ksDyWE2qIBXROsZKt2ny3hCGM=
+google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
+google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
+google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
+google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
+google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
+google.golang.org/grpc v1.32.0 h1:zWTV+LMdc3kaiJMSTOFz2UgSBgx8RNQoTGiZu3fR9S0=
+google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak=
+google.golang.org/grpc/cmd/protoc-gen-go-grpc v0.0.0-20200910201057-6591123024b3/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw=
+google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
+google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
+google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
+google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
+google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
+google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
+google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
 google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
 google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
 google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
-gopkg.in/Knetic/govaluate.v3 v3.0.0/go.mod h1:csKLBORsPbafmSCGTEh3U7Ozmsuq8ZSIlKk1bcqph0E=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
@@ -184,4 +297,10 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
 gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg=
+gorgonia.org/vecf32 v0.9.0/go.mod h1:NCc+5D2oxddRL11hd+pCB1PEyXWOyiQxfZ/1wwhOXCA=
+gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A=
+gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA=
+honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
+honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
 rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
--- a/gpu/amd.go
+++ b/gpu/amd.go
@@ -1,101 +0,0 @@
-package gpu
-
-import (
-	"bufio"
-	"errors"
-	"fmt"
-	"io"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-)
-
-// TODO - windows vs. non-windows vs darwin
-
-// Discovery logic for AMD/ROCm GPUs
-
-const (
-	DriverVersionFile     = "/sys/module/amdgpu/version"
-	GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
-	// TODO probably break these down per GPU to make the logic simpler
-	GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
-	GPUUsedMemoryFileGlob  = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
-)
-
-func AMDDetected() bool {
-	// Some driver versions (older?) don't have a version file, so just lookup the parent dir
-	sysfsDir := filepath.Dir(DriverVersionFile)
-	_, err := os.Stat(sysfsDir)
-	if errors.Is(err, os.ErrNotExist) {
-		slog.Debug("amd driver not detected " + sysfsDir)
-		return false
-	} else if err != nil {
-		slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
-		return false
-	}
-	return true
-}
-
-func AMDDriverVersion() (string, error) {
-	_, err := os.Stat(DriverVersionFile)
-	if err != nil {
-		return "", fmt.Errorf("amdgpu file stat error: %s %w", DriverVersionFile, err)
-	}
-	fp, err := os.Open(DriverVersionFile)
-	if err != nil {
-		return "", err
-	}
-	defer fp.Close()
-	verString, err := io.ReadAll(fp)
-	if err != nil {
-		return "", err
-	}
-	return strings.TrimSpace(string(verString)), nil
-}
-
-func AMDGFXVersions() []Version {
-	res := []Version{}
-	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
-	for _, match := range matches {
-		fp, err := os.Open(match)
-		if err != nil {
-			slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
-			continue
-		}
-		defer fp.Close()
-
-		scanner := bufio.NewScanner(fp)
-		// optionally, resize scanner's capacity for lines over 64K, see next example
-		for scanner.Scan() {
-			line := strings.TrimSpace(scanner.Text())
-			if strings.HasPrefix(line, "gfx_target_version") {
-				ver := strings.Fields(line)
-				if len(ver) != 2 || len(ver[1]) < 5 {
-					slog.Debug("malformed " + line)
-					continue
-				}
-				l := len(ver[1])
-				patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
-				minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
-				major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
-				if err1 != nil || err2 != nil || err3 != nil {
-					slog.Debug("malformed int " + line)
-					continue
-				}
-
-				res = append(res, Version{
-					Major: uint(major),
-					Minor: uint(minor),
-					Patch: uint(patch),
-				})
-			}
-		}
-	}
-	return res
-}
-
-func (v Version) ToGFXString() string {
-	return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
-}
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -0,0 +1,58 @@
+//go:build linux || windows
+
+package gpu
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
+func rocmLibUsable(libDir string) bool {
+	slog.Debug("evaluating potential rocm lib dir " + libDir)
+	for _, g := range ROCmLibGlobs {
+		res, _ := filepath.Glob(filepath.Join(libDir, g))
+		if len(res) == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func GetSupportedGFX(libDir string) ([]string, error) {
+	var ret []string
+	files, err := filepath.Glob(filepath.Join(libDir, "rocblas", "library", "TensileLibrary_lazy_gfx*.dat"))
+	if err != nil {
+		return nil, err
+	}
+	for _, file := range files {
+		ret = append(ret, strings.TrimSuffix(strings.TrimPrefix(filepath.Base(file), "TensileLibrary_lazy_"), ".dat"))
+	}
+	return ret, nil
+}
+
+func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
+	// Set the visible devices if not already set
+	// TODO - does sort order matter?
+	devices := []string{}
+	for i := range ids {
+		slog.Debug(fmt.Sprintf("i=%d", i))
+		if _, skipped := skip[i]; skipped {
+			slog.Debug("skipped")
+			continue
+		}
+		devices = append(devices, strconv.Itoa(i))
+	}
+	slog.Debug(fmt.Sprintf("devices=%v", devices))
+
+	val := strings.Join(devices, ",")
+	err := os.Setenv("HIP_VISIBLE_DEVICES", val)
+	if err != nil {
+		slog.Warn(fmt.Sprintf("failed to set env: %s", err))
+	}
+	slog.Debug("HIP_VISIBLE_DEVICES=" + val)
+}
--- a/gpu/amd_hip_windows.go
+++ b/gpu/amd_hip_windows.go
@@ -0,0 +1,141 @@
+package gpu
+
+import (
+	"fmt"
+	"log/slog"
+	"strconv"
+	"syscall"
+	"unsafe"
+
+	"golang.org/x/sys/windows"
+)
+
+const (
+	hipSuccess       = 0
+	hipErrorNoDevice = 100
+)
+
+type hipDevicePropMinimal struct {
+	Name        [256]byte
+	unused1     [140]byte
+	GcnArchName [256]byte // gfx####
+	iGPU        int       // Doesn't seem to actually report correctly
+	unused2     [128]byte
+}
+
+// Wrap the amdhip64.dll library for GPU discovery
+type HipLib struct {
+	dll                    windows.Handle
+	hipGetDeviceCount      uintptr
+	hipGetDeviceProperties uintptr
+	hipMemGetInfo          uintptr
+	hipSetDevice           uintptr
+	hipDriverGetVersion    uintptr
+}
+
+func NewHipLib() (*HipLib, error) {
+	h, err := windows.LoadLibrary("amdhip64.dll")
+	if err != nil {
+		return nil, fmt.Errorf("unable to load amdhip64.dll: %w", err)
+	}
+	hl := &HipLib{}
+	hl.dll = h
+	hl.hipGetDeviceCount, err = windows.GetProcAddress(hl.dll, "hipGetDeviceCount")
+	if err != nil {
+		return nil, err
+	}
+	hl.hipGetDeviceProperties, err = windows.GetProcAddress(hl.dll, "hipGetDeviceProperties")
+	if err != nil {
+		return nil, err
+	}
+	hl.hipMemGetInfo, err = windows.GetProcAddress(hl.dll, "hipMemGetInfo")
+	if err != nil {
+		return nil, err
+	}
+	hl.hipSetDevice, err = windows.GetProcAddress(hl.dll, "hipSetDevice")
+	if err != nil {
+		return nil, err
+	}
+	hl.hipDriverGetVersion, err = windows.GetProcAddress(hl.dll, "hipDriverGetVersion")
+	if err != nil {
+		return nil, err
+	}
+	return hl, nil
+}
+
+// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup
+// so we have to unload/reset the library after we do our initial discovery
+// to make sure our updates to that variable are processed by llama.cpp
+func (hl *HipLib) Release() {
+	err := windows.FreeLibrary(hl.dll)
+	if err != nil {
+		slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err))
+	}
+	hl.dll = 0
+}
+
+func (hl *HipLib) AMDDriverVersion() (string, error) {
+	if hl.dll == 0 {
+		return "", fmt.Errorf("dll has been unloaded")
+	}
+	var version int
+	status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
+	if status != hipSuccess {
+		return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
+	}
+	return strconv.Itoa(version), nil
+}
+
+func (hl *HipLib) HipGetDeviceCount() int {
+	if hl.dll == 0 {
+		slog.Error("dll has been unloaded")
+		return 0
+	}
+	var count int
+	status, _, err := syscall.SyscallN(hl.hipGetDeviceCount, uintptr(unsafe.Pointer(&count)))
+	if status == hipErrorNoDevice {
+		slog.Info("AMD ROCm reports no devices found")
+		return 0
+	}
+	if status != hipSuccess {
+		slog.Warn(fmt.Sprintf("failed call to hipGetDeviceCount: %d %s", status, err))
+	}
+	return count
+}
+
+func (hl *HipLib) HipSetDevice(device int) error {
+	if hl.dll == 0 {
+		return fmt.Errorf("dll has been unloaded")
+	}
+	status, _, err := syscall.SyscallN(hl.hipSetDevice, uintptr(device))
+	if status != hipSuccess {
+		return fmt.Errorf("failed call to hipSetDevice: %d %s", status, err)
+	}
+	return nil
+}
+
+func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, error) {
+	if hl.dll == 0 {
+		return nil, fmt.Errorf("dll has been unloaded")
+	}
+	var props hipDevicePropMinimal
+	status, _, err := syscall.SyscallN(hl.hipGetDeviceProperties, uintptr(unsafe.Pointer(&props)), uintptr(device))
+	if status != hipSuccess {
+		return nil, fmt.Errorf("failed call to hipGetDeviceProperties: %d %s", status, err)
+	}
+	return &props, nil
+}
+
+// free, total, err
+func (hl *HipLib) HipMemGetInfo() (uint64, uint64, error) {
+	if hl.dll == 0 {
+		return 0, 0, fmt.Errorf("dll has been unloaded")
+	}
+	var totalMemory uint64
+	var freeMemory uint64
+	status, _, err := syscall.SyscallN(hl.hipMemGetInfo, uintptr(unsafe.Pointer(&freeMemory)), uintptr(unsafe.Pointer(&totalMemory)))
+	if status != hipSuccess {
+		return 0, 0, fmt.Errorf("failed call to hipMemGetInfo: %d %s", status, err)
+	}
+	return freeMemory, totalMemory, nil
+}
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -0,0 +1,411 @@
+package gpu
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"slices"
+	"strconv"
+	"strings"
+
+	"github.com/jmorganca/ollama/version"
+)
+
+// Discovery logic for AMD/ROCm GPUs
+
+const (
+	curlMsg               = "curl -fsSL https://github.com/ollama/ollama/releases/download/v%s/rocm-amd64-deps.tgz | tar -zxf - -C %s"
+	DriverVersionFile     = "/sys/module/amdgpu/version"
+	AMDNodesSysfsDir      = "/sys/class/kfd/kfd/topology/nodes/"
+	GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"
+
+	// Prefix with the node dir
+	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
+	GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
+	RocmStandardLocation   = "/opt/rocm/lib"
+)
+
+var (
+	// Used to validate if the given ROCm lib is usable
+	ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
+)
+
+// Gather GPU information from the amdgpu driver if any supported GPUs are detected
+// HIP_VISIBLE_DEVICES will be set if we detect a mix of unsupported and supported devices
+// and the user hasn't already set this variable
+func AMDGetGPUInfo(resp *GpuInfo) {
+	// TODO - DRY this out with windows
+	if !AMDDetected() {
+		return
+	}
+	skip := map[int]interface{}{}
+
+	// Opportunistic logging of driver version to aid in troubleshooting
+	ver, err := AMDDriverVersion()
+	if err == nil {
+		slog.Info("AMD Driver: " + ver)
+	} else {
+		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
+		slog.Warn(fmt.Sprintf("ollama recommends running the https://www.amd.com/en/support/linux-drivers: %s", err))
+	}
+
+	// If the user has specified exactly which GPUs to use, look up their memory
+	visibleDevices := os.Getenv("HIP_VISIBLE_DEVICES")
+	if visibleDevices != "" {
+		ids := []int{}
+		for _, idStr := range strings.Split(visibleDevices, ",") {
+			id, err := strconv.Atoi(idStr)
+			if err != nil {
+				slog.Warn(fmt.Sprintf("malformed HIP_VISIBLE_DEVICES=%s %s", visibleDevices, err))
+			} else {
+				ids = append(ids, id)
+			}
+		}
+		amdProcMemLookup(resp, nil, ids)
+		return
+	}
+
+	// Gather GFX version information from all detected cards
+	gfx := AMDGFXVersions()
+	verStrings := []string{}
+	for i, v := range gfx {
+		verStrings = append(verStrings, v.ToGFXString())
+		if v.Major == 0 {
+			// Silently skip CPUs
+			skip[i] = struct{}{}
+			continue
+		}
+		if v.Major < 9 {
+			// TODO consider this a build-time setting if we can support 8xx family GPUs
+			slog.Warn(fmt.Sprintf("amdgpu [%d] too old %s", i, v.ToGFXString()))
+			skip[i] = struct{}{}
+		}
+	}
+	slog.Info(fmt.Sprintf("detected amdgpu versions %v", verStrings))
+
+	// Abort if all GPUs are skipped
+	if len(skip) >= len(gfx) {
+		slog.Info("all detected amdgpus are skipped, falling back to CPU")
+		return
+	}
+
+	// If we got this far, then we have at least 1 GPU that's a ROCm candidate, so make sure we have a lib
+	libDir, err := AMDValidateLibDir()
+	if err != nil {
+		slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
+		return
+	}
+
+	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
+	if gfxOverride == "" {
+		supported, err := GetSupportedGFX(libDir)
+		if err != nil {
+			slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
+			return
+		}
+		slog.Debug(fmt.Sprintf("rocm supported GPU types %v", supported))
+
+		for i, v := range gfx {
+			if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
+				slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
+				// TODO - consider discrete markdown just for ROCM troubleshooting?
+				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
+				skip[i] = struct{}{}
+			} else {
+				slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
+			}
+		}
+	} else {
+		slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+	}
+
+	if len(skip) >= len(gfx) {
+		slog.Info("all detected amdgpus are skipped, falling back to CPU")
+		return
+	}
+
+	ids := make([]int, len(gfx))
+	i := 0
+	for k := range gfx {
+		ids[i] = k
+		i++
+	}
+	amdProcMemLookup(resp, skip, ids)
+	if resp.memInfo.DeviceCount == 0 {
+		return
+	}
+	if len(skip) > 0 {
+		amdSetVisibleDevices(ids, skip)
+	}
+}
+
+// Walk the sysfs nodes for the available GPUs and gather information from them
+// skipping over any devices in the skip map
+func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
+	resp.memInfo.DeviceCount = 0
+	resp.memInfo.TotalMemory = 0
+	resp.memInfo.FreeMemory = 0
+	if len(ids) == 0 {
+		slog.Debug("discovering all amdgpu devices")
+		entries, err := os.ReadDir(AMDNodesSysfsDir)
+		if err != nil {
+			slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
+			return
+		}
+		for _, node := range entries {
+			if !node.IsDir() {
+				continue
+			}
+			id, err := strconv.Atoi(node.Name())
+			if err != nil {
+				slog.Warn("malformed amdgpu sysfs node id " + node.Name())
+				continue
+			}
+			ids = append(ids, id)
+		}
+	}
+	slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
+
+	for _, id := range ids {
+		if _, skipped := skip[id]; skipped {
+			continue
+		}
+		totalMemory := uint64(0)
+		usedMemory := uint64(0)
+		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
+		propFiles, err := filepath.Glob(propGlob)
+		if err != nil {
+			slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
+		}
+		// 1 or more memory banks - sum the values of all of them
+		for _, propFile := range propFiles {
+			fp, err := os.Open(propFile)
+			if err != nil {
+				slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", propFile, err))
+				continue
+			}
+			defer fp.Close()
+			scanner := bufio.NewScanner(fp)
+			for scanner.Scan() {
+				line := strings.TrimSpace(scanner.Text())
+				if strings.HasPrefix(line, "size_in_bytes") {
+					ver := strings.Fields(line)
+					if len(ver) != 2 {
+						slog.Warn("malformed " + line)
+						continue
+					}
+					bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
+					if err != nil {
+						slog.Warn("malformed int " + line)
+						continue
+					}
+					totalMemory += bankSizeInBytes
+				}
+			}
+		}
+		if totalMemory == 0 {
+			continue
+		}
+		usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
+		usedFiles, err := filepath.Glob(usedGlob)
+		if err != nil {
+			slog.Warn(fmt.Sprintf("error looking up used GPU memory: %s %s", usedGlob, err))
+			continue
+		}
+		for _, usedFile := range usedFiles {
+			fp, err := os.Open(usedFile)
+			if err != nil {
+				slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", usedFile, err))
+				continue
+			}
+			defer fp.Close()
+			data, err := io.ReadAll(fp)
+			if err != nil {
+				slog.Warn(fmt.Sprintf("failed to read sysfs node file %s: %s", usedFile, err))
+				continue
+			}
+			used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
+			if err != nil {
+				slog.Warn(fmt.Sprintf("malformed used memory %s: %s", string(data), err))
+				continue
+			}
+			usedMemory += used
+		}
+		slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
+		slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory  %d", id, (totalMemory - usedMemory)))
+		resp.memInfo.DeviceCount++
+		resp.memInfo.TotalMemory += totalMemory
+		resp.memInfo.FreeMemory += (totalMemory - usedMemory)
+	}
+	if resp.memInfo.DeviceCount > 0 {
+		resp.Library = "rocm"
+	}
+}
+
+// Quick check for AMD driver so we can skip amdgpu discovery if not present
+func AMDDetected() bool {
+	// Some driver versions (older?) don't have a version file, so just lookup the parent dir
+	sysfsDir := filepath.Dir(DriverVersionFile)
+	_, err := os.Stat(sysfsDir)
+	if errors.Is(err, os.ErrNotExist) {
+		slog.Debug("amdgpu driver not detected " + sysfsDir)
+		return false
+	} else if err != nil {
+		slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
+		return false
+	}
+	return true
+}
+
+func setupLink(source, target string) error {
+	if err := os.RemoveAll(target); err != nil {
+		return fmt.Errorf("failed to remove old rocm directory %s %w", target, err)
+	}
+	if err := os.Symlink(source, target); err != nil {
+		return fmt.Errorf("failed to create link %s => %s %w", source, target, err)
+	}
+	slog.Debug(fmt.Sprintf("host rocm linked %s => %s", source, target))
+	return nil
+}
+
+// Ensure the AMD rocm lib dir is wired up
+// Prefer to use host installed ROCm, as long as it meets our minimum requirements
+// failing that, tell the user how to download it on their own
+func AMDValidateLibDir() (string, error) {
+	// We rely on the rpath compiled into our library to find rocm
+	// so we establish a symlink to wherever we find it on the system
+	// to $AssetsDir/rocm
+
+	// If we already have a rocm dependency wired, nothing more to do
+	assetsDir, err := AssetsDir()
+	if err != nil {
+		return "", fmt.Errorf("unable to lookup lib dir: %w", err)
+	}
+	// Versioned directory
+	rocmTargetDir := filepath.Join(assetsDir, "rocm")
+	if rocmLibUsable(rocmTargetDir) {
+		return rocmTargetDir, nil
+	}
+	// Parent dir (unversioned)
+	commonRocmDir := filepath.Join(filepath.Dir(assetsDir), "rocm")
+	if rocmLibUsable(commonRocmDir) {
+		return rocmTargetDir, setupLink(commonRocmDir, rocmTargetDir)
+	}
+
+	// Prefer explicit HIP env var
+	hipPath := os.Getenv("HIP_PATH")
+	if hipPath != "" {
+		hipLibDir := filepath.Join(hipPath, "lib")
+		if rocmLibUsable(hipLibDir) {
+			slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
+			return rocmTargetDir, setupLink(hipLibDir, rocmTargetDir)
+		}
+	}
+
+	// Scan the library path for potential matches
+	ldPaths := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
+	for _, ldPath := range ldPaths {
+		d, err := filepath.Abs(ldPath)
+		if err != nil {
+			continue
+		}
+		if rocmLibUsable(d) {
+			return rocmTargetDir, setupLink(d, rocmTargetDir)
+		}
+	}
+
+	// Well known location(s)
+	if rocmLibUsable("/opt/rocm/lib") {
+		return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
+	}
+	err = os.MkdirAll(rocmTargetDir, 0755)
+	if err != nil {
+		return "", fmt.Errorf("failed to create empty rocm dir %s %w", rocmTargetDir, err)
+	}
+
+	// If we still haven't found a usable rocm, the user will have to download it on their own
+	slog.Warn("amdgpu detected, but no compatible rocm library found.  Either install rocm v6, or run the following")
+	slog.Warn(fmt.Sprintf(curlMsg, version.Version, rocmTargetDir))
+	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
+}
+
+func AMDDriverVersion() (string, error) {
+	_, err := os.Stat(DriverVersionFile)
+	if err != nil {
+		return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
+	}
+	fp, err := os.Open(DriverVersionFile)
+	if err != nil {
+		return "", err
+	}
+	defer fp.Close()
+	verString, err := io.ReadAll(fp)
+	if err != nil {
+		return "", err
+	}
+	return strings.TrimSpace(string(verString)), nil
+}
+
+func AMDGFXVersions() map[int]Version {
+	res := map[int]Version{}
+	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
+	for _, match := range matches {
+		fp, err := os.Open(match)
+		if err != nil {
+			slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
+			continue
+		}
+		defer fp.Close()
+		i, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
+		if err != nil {
+			slog.Debug(fmt.Sprintf("failed to parse node ID %s", err))
+			continue
+		}
+
+		scanner := bufio.NewScanner(fp)
+		for scanner.Scan() {
+			line := strings.TrimSpace(scanner.Text())
+			if strings.HasPrefix(line, "gfx_target_version") {
+				ver := strings.Fields(line)
+				if len(ver) != 2 || len(ver[1]) < 5 {
+
+					if ver[1] == "0" {
+						// Silently skip the CPU
+						continue
+					} else {
+						slog.Debug("malformed " + line)
+					}
+					res[i] = Version{
+						Major: 0,
+						Minor: 0,
+						Patch: 0,
+					}
+					continue
+				}
+				l := len(ver[1])
+				patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
+				minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
+				major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
+				if err1 != nil || err2 != nil || err3 != nil {
+					slog.Debug("malformed int " + line)
+					continue
+				}
+
+				res[i] = Version{
+					Major: uint(major),
+					Minor: uint(minor),
+					Patch: uint(patch),
+				}
+			}
+		}
+	}
+	return res
+}
+
+func (v Version) ToGFXString() string {
+	return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
+}
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -0,0 +1,190 @@
+package gpu
+
+import (
+	"bytes"
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"slices"
+	"strings"
+)
+
+const (
+	RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
+
+	// TODO  We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
+	iGPUName = "AMD Radeon(TM) Graphics"
+)
+
+var (
+	// Used to validate if the given ROCm lib is usable
+	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
+)
+
+func AMDGetGPUInfo(resp *GpuInfo) {
+	hl, err := NewHipLib()
+	if err != nil {
+		slog.Debug(err.Error())
+		return
+	}
+	defer hl.Release()
+	skip := map[int]interface{}{}
+	ids := []int{}
+	resp.memInfo.DeviceCount = 0
+	resp.memInfo.TotalMemory = 0
+	resp.memInfo.FreeMemory = 0
+
+	ver, err := hl.AMDDriverVersion()
+	if err == nil {
+		slog.Info("AMD Driver: " + ver)
+	} else {
+		// For now this is benign, but we may eventually need to fail compatibility checks
+		slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err))
+	}
+
+	// Note: the HIP library automatically handles HIP_VISIBLE_DEVICES
+	count := hl.HipGetDeviceCount()
+	if count == 0 {
+		return
+	}
+	libDir, err := AMDValidateLibDir()
+	if err != nil {
+		slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
+		return
+	}
+
+	var supported []string
+	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
+	if gfxOverride == "" {
+		supported, err = GetSupportedGFX(libDir)
+		if err != nil {
+			slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
+			return
+		}
+	} else {
+		slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+	}
+
+	slog.Info(fmt.Sprintf("detected %d hip devices", count))
+	for i := 0; i < count; i++ {
+		ids = append(ids, i)
+		err = hl.HipSetDevice(i)
+		if err != nil {
+			slog.Warn(fmt.Sprintf("[%d] %s", i, err))
+			skip[i] = struct{}{}
+			continue
+		}
+
+		props, err := hl.HipGetDeviceProperties(i)
+		if err != nil {
+			slog.Warn(fmt.Sprintf("[%d] %s", i, err))
+			skip[i] = struct{}{}
+			continue
+		}
+		n := bytes.IndexByte(props.Name[:], 0)
+		name := string(props.Name[:n])
+		slog.Info(fmt.Sprintf("[%d] Name: %s", i, name))
+		n = bytes.IndexByte(props.GcnArchName[:], 0)
+		gfx := string(props.GcnArchName[:n])
+		slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx))
+		//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
+		// TODO  Why isn't props.iGPU accurate!?
+		if strings.EqualFold(name, iGPUName) {
+			slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i))
+			skip[i] = struct{}{}
+			continue
+		}
+		if gfxOverride == "" {
+			if !slices.Contains[[]string, string](supported, gfx) {
+				slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported))
+				// TODO - consider discrete markdown just for ROCM troubleshooting?
+				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
+				skip[i] = struct{}{}
+				continue
+			} else {
+				slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx))
+			}
+		}
+
+		totalMemory, freeMemory, err := hl.HipMemGetInfo()
+		if err != nil {
+			slog.Warn(fmt.Sprintf("[%d] %s", i, err))
+			continue
+		}
+
+		// TODO according to docs, freeMem may lie on windows!
+		slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory))
+		slog.Info(fmt.Sprintf("[%d] Free Mem:  %d", i, freeMemory))
+		resp.memInfo.DeviceCount++
+		resp.memInfo.TotalMemory += totalMemory
+		resp.memInfo.FreeMemory += freeMemory
+	}
+	if resp.memInfo.DeviceCount > 0 {
+		resp.Library = "rocm"
+	}
+	// Abort if all GPUs are skipped
+	if len(skip) >= count {
+		slog.Info("all detected amdgpus are skipped, falling back to CPU")
+		return
+	}
+	if len(skip) > 0 {
+		amdSetVisibleDevices(ids, skip)
+	}
+	UpdatePath(libDir)
+}
+
+func AMDValidateLibDir() (string, error) {
+	// On windows non-admins typically can't create links
+	// so instead of trying to rely on rpath and a link in
+	// $LibDir/rocm, we instead rely on setting PATH to point
+	// to the location of the ROCm library
+
+	// Installer payload location
+	exe, err := os.Executable()
+	if err == nil {
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		if rocmLibUsable(rocmTargetDir) {
+			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
+			return rocmTargetDir, nil
+		}
+	}
+
+	// If we already have a rocm dependency wired, nothing more to do
+	libDir, err := AssetsDir()
+	if err != nil {
+		return "", fmt.Errorf("unable to lookup lib dir: %w", err)
+	}
+	rocmTargetDir := filepath.Join(libDir, "rocm")
+	if rocmLibUsable(rocmTargetDir) {
+		return rocmTargetDir, nil
+	}
+
+	// Prefer explicit HIP env var
+	hipPath := os.Getenv("HIP_PATH")
+	if hipPath != "" {
+		hipLibDir := filepath.Join(hipPath, "bin")
+		if rocmLibUsable(hipLibDir) {
+			slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
+			return hipLibDir, nil
+		}
+	}
+
+	// Well known location(s)
+	if rocmLibUsable(RocmStandardLocation) {
+		return RocmStandardLocation, nil
+	}
+
+	// Installer payload (if we're running from some other location)
+	localAppData := os.Getenv("LOCALAPPDATA")
+	appDir := filepath.Join(localAppData, "Programs", "Ollama")
+	rocmTargetDir = filepath.Join(appDir, "rocm")
+	if rocmLibUsable(rocmTargetDir) {
+		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
+		return rocmTargetDir, nil
+	}
+
+	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
+	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm v6")
+	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
+}
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -0,0 +1,60 @@
+package gpu
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+
+	"github.com/jmorganca/ollama/version"
+)
+
+func AssetsDir() (string, error) {
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", err
+	}
+	baseDir := filepath.Join(home, ".ollama", "assets")
+	libDirs, err := os.ReadDir(baseDir)
+	if err == nil {
+		for _, d := range libDirs {
+			if d.Name() == version.Version {
+				continue
+			}
+			// Special case the rocm dependencies, which are handled by the installer
+			if d.Name() == "rocm" {
+				continue
+			}
+			slog.Debug("stale lib detected, cleaning up " + d.Name())
+			err = os.RemoveAll(filepath.Join(baseDir, d.Name()))
+			if err != nil {
+				slog.Warn(fmt.Sprintf("unable to clean up stale library %s: %s", filepath.Join(baseDir, d.Name()), err))
+			}
+		}
+	}
+	return filepath.Join(baseDir, version.Version), nil
+}
+
+func UpdatePath(dir string) {
+	if runtime.GOOS == "windows" {
+		tmpDir := filepath.Dir(dir)
+		pathComponents := strings.Split(os.Getenv("PATH"), ";")
+		i := 0
+		for _, comp := range pathComponents {
+			if strings.EqualFold(comp, dir) {
+				return
+			}
+			// Remove any other prior paths to our temp dir
+			if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
+				pathComponents[i] = comp
+				i++
+			}
+		}
+		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
+		slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
+		os.Setenv("PATH", newPath)
+	}
+	// linux and darwin rely on rpath
+}
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -24,7 +24,6 @@ import (

 type handles struct {
 	cuda *C.cuda_handle_t
-	rocm *C.rocm_handle_t
 }

 var gpuMutex sync.Mutex
@@ -54,39 +53,23 @@ var CudaWindowsGlobs = []string{
 	"c:\\Windows\\System32\\nvml.dll",
 }

-var RocmLinuxGlobs = []string{
-	"/opt/rocm*/lib*/librocm_smi64.so*",
-}
-
-var RocmWindowsGlobs = []string{
-	"c:\\Windows\\System32\\rocm_smi64.dll",
-}
-
 // Note: gpuMutex must already be held
 func initGPUHandles() {

 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

-	gpuHandles = &handles{nil, nil}
+	gpuHandles = &handles{nil}
 	var cudaMgmtName string
 	var cudaMgmtPatterns []string
-	var rocmMgmtName string
-	var rocmMgmtPatterns []string
 	switch runtime.GOOS {
 	case "windows":
 		cudaMgmtName = "nvml.dll"
 		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
 		copy(cudaMgmtPatterns, CudaWindowsGlobs)
-		rocmMgmtName = "rocm_smi64.dll"
-		rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
-		copy(rocmMgmtPatterns, RocmWindowsGlobs)
 	case "linux":
 		cudaMgmtName = "libnvidia-ml.so"
 		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
 		copy(cudaMgmtPatterns, CudaLinuxGlobs)
-		rocmMgmtName = "librocm_smi64.so"
-		rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
-		copy(rocmMgmtPatterns, RocmLinuxGlobs)
 	default:
 		return
 	}
@@ -101,16 +84,6 @@ func initGPUHandles() {
 			return
 		}
 	}
-
-	rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
-	if len(rocmLibPaths) > 0 {
-		rocm := LoadROCMMgmt(rocmLibPaths)
-		if rocm != nil {
-			slog.Info("Radeon GPU detected")
-			gpuHandles.rocm = rocm
-			return
-		}
-	}
 }

 func GetGPUInfo() GpuInfo {
@@ -149,66 +122,10 @@ func GetGPUInfo() GpuInfo {
 				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
-	} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
-		ver, err := AMDDriverVersion()
-		if err == nil {
-			slog.Info("AMD Driver: " + ver)
-		} else {
-			// For now this is benign, but we may eventually need to fail compatibility checks
-			slog.Debug("error looking up amd driver version: %s", err)
-		}
-		gfx := AMDGFXVersions()
-		tooOld := false
-		for _, v := range gfx {
-			if v.Major < 9 {
-				slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
-				tooOld = true
-				break
-			}
-
-			// TODO - remap gfx strings for unsupporetd minor/patch versions to supported for the same major
-			// e.g. gfx1034 works if we map it to gfx1030 at runtime
-
-		}
-		if !tooOld {
-			// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
-			C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
-			if memInfo.err != nil {
-				slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
-				C.free(unsafe.Pointer(memInfo.err))
-			} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
-				// Only one GPU detected and it appears to be an integrated GPU - skip it
-				slog.Info("ROCm unsupported integrated GPU detected")
-			} else if memInfo.count > 0 {
-				if memInfo.igpu_index >= 0 {
-					// We have multiple GPUs reported, and one of them is an integrated GPU
-					// so we have to set the env var to bypass it
-					// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
-					val := os.Getenv("ROCR_VISIBLE_DEVICES")
-					if val == "" {
-						devices := []string{}
-						for i := 0; i < int(memInfo.count); i++ {
-							if i == int(memInfo.igpu_index) {
-								continue
-							}
-							devices = append(devices, strconv.Itoa(i))
-						}
-						val = strings.Join(devices, ",")
-						os.Setenv("ROCR_VISIBLE_DEVICES", val)
-					}
-					slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
-				}
-				resp.Library = "rocm"
-				var version C.rocm_version_resp_t
-				C.rocm_get_version(*gpuHandles.rocm, &version)
-				verString := C.GoString(version.str)
-				if version.status == 0 {
-					resp.Variant = "v" + verString
-				} else {
-					slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
-				}
-				C.free(unsafe.Pointer(version.str))
-			}
+	} else {
+		AMDGetGPUInfo(&resp)
+		if resp.Library != "" {
+			return resp
 		}
 	}
 	if resp.Library == "" {
@@ -242,6 +159,15 @@ func getCPUMem() (memInfo, error) {
 }

 func CheckVRAM() (int64, error) {
+	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
+	if userLimit != "" {
+		avail, err := strconv.ParseInt(userLimit, 10, 64)
+		if err != nil {
+			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
+		}
+		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
+		return avail, nil
+	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
 		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
@@ -329,23 +255,6 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 	return nil
 }

-func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
-	var resp C.rocm_init_resp_t
-	resp.rh.verbose = getVerboseState()
-	for _, libPath := range rocmLibPaths {
-		lib := C.CString(libPath)
-		defer C.free(unsafe.Pointer(lib))
-		C.rocm_init(lib, &resp)
-		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
-			C.free(unsafe.Pointer(resp.err))
-		} else {
-			return &resp.rh
-		}
-	}
-	return nil
-}
-
 func getVerboseState() C.uint16_t {
 	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
 		return C.uint16_t(1)
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -2,32 +2,38 @@

 package gpu

+/*
+#cgo CFLAGS: -x objective-c
+#cgo LDFLAGS: -framework Foundation -framework CoreGraphics -framework Metal
+#include "gpu_info_darwin.h"
+*/
 import "C"
 import (
+	"fmt"
+	"log/slog"
+	"os"
 	"runtime"
-
-	"github.com/pbnjay/memory"
+	"strconv"
 )

 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
+	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
+	if userLimit != "" {
+		avail, err := strconv.ParseInt(userLimit, 10, 64)
+		if err != nil {
+			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
+		}
+		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
+		return avail, nil
+	}
+
 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
-
-	// on macOS, there's already buffer for available vram (see below) so just return the total
-	systemMemory := int64(memory.TotalMemory())
-
-	// macOS limits how much memory is available to the GPU based on the amount of system memory
-	// TODO: handle case where iogpu.wired_limit_mb is set to a higher value
-	if systemMemory <= 36*1024*1024*1024 {
-		systemMemory = systemMemory * 2 / 3
-	} else {
-		systemMemory = systemMemory * 3 / 4
-	}
-
-	return systemMemory, nil
+	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
+	return recommendedMaxVRAM, nil
 }

 func GetGPUInfo() GpuInfo {
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -53,7 +53,6 @@ void cpu_check_ram(mem_info_t *resp);
 #endif

 #include "gpu_info_cuda.h"
-#include "gpu_info_rocm.h"

 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -124,31 +124,31 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
      // When in verbose mode, report more information about
      // the card we discover, but don't fail on error
      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
+      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
      }
      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
+      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
      }
      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
+      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
      }
      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
+      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
      }
      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
-      if (ret != RSMI_STATUS_SUCCESS) {
+      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
@@ -156,7 +156,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
    }

    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
+    LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);

    resp->total += memInfo.total;
    resp->free += memInfo.free;
--- a/gpu/gpu_info_darwin.h
+++ b/gpu/gpu_info_darwin.h
@@ -0,0 +1,3 @@
+#import <Metal/Metal.h>
+#include <stdint.h>
+uint64_t getRecommendedMaxVRAM();
--- a/gpu/gpu_info_darwin.m
+++ b/gpu/gpu_info_darwin.m
@@ -0,0 +1,11 @@
+//go:build darwin
+#include "gpu_info_darwin.h"
+
+uint64_t getRecommendedMaxVRAM()
+{
+	id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+	uint64_t result = device.recommendedMaxWorkingSetSize;
+	CFRelease(device);
+	return result;
+}
+
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -1,198 +0,0 @@
-#ifndef __APPLE__
-
-#include "gpu_info_rocm.h"
-
-#include <string.h>
-
-void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
-  rsmi_status_t ret;
-  resp->err = NULL;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i;
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-      {"rsmi_init", (void *)&resp->rh.rsmi_init},
-      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
-      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
-      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
-      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
-      {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
-      {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
-      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
-      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
-      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
-      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
-      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
-      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
-      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
-      {NULL, NULL},
-  };
-
-  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
-  if (!resp->rh.handle) {
-    char *msg = LOAD_ERR();
-    snprintf(buf, buflen,
-             "Unable to load %s library to query for Radeon GPUs: %s\n",
-             rocm_lib_path, msg);
-    free(msg);
-    resp->err = strdup(buf);
-    return;
-  }
-
-  // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
-
-  for (i = 0; l[i].s != NULL; i++) {
-    // TODO once we've squashed the remaining corner cases remove this log
-    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
-
-    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
-    if (!l[i].p) {
-      resp->rh.handle = NULL;
-      char *msg = LOAD_ERR();
-      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
-      UNLOAD_LIBRARY(resp->rh.handle);
-      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
-               msg);
-      free(msg);
-      resp->err = strdup(buf);
-      return;
-    }
-  }
-
-  ret = (*resp->rh.rsmi_init)(0);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->rh.handle);
-    resp->rh.handle = NULL;
-    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
-    resp->err = strdup(buf);
-  }
-
-  return;
-}
-
-void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
-  resp->err = NULL;
-  resp->igpu_index = -1;
-  uint64_t totalMem = 0;
-  uint64_t usedMem = 0;
-  rsmi_status_t ret;
-  const int buflen = 256;
-  char buf[buflen + 1];
-  int i;
-
-  if (h.handle == NULL) {
-    resp->err = strdup("rocm handle not initialized");
-    return;
-  }
-
-  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "unable to get device count: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
-
-  resp->total = 0;
-  resp->free = 0;
-  for (i = 0; i < resp->count; i++) {
-    if (h.verbose) {
-      // When in verbose mode, report more information about
-      // the card we discover, but don't fail on error
-      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
-        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
-      } else {
-        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
-      }
-      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
-        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
-      } else {
-        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
-      }
-      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
-        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
-      } else {
-        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
-      }
-      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
-        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
-      } else {
-        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
-      }
-      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
-        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
-      } else {
-        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
-      }
-      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
-        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
-      } else {
-        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
-      }
-      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
-      if (ret != RSMI_STATUS_SUCCESS) {
-        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
-      } else {
-        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
-      }
-    }
-
-    // Get total memory - used memory for available memory
-    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
-    if (ret != RSMI_STATUS_SUCCESS) {
-      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
-      resp->err = strdup(buf);
-      return;
-    }
-    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
-    if (ret != RSMI_STATUS_SUCCESS) {
-      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
-      resp->err = strdup(buf);
-      return;
-    }
-    LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
-    LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
-    if (totalMem < 1024 * 1024 * 1024) {
-      // Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
-      LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
-      resp->igpu_index = i;
-    } else {
-      resp->total += totalMem;
-      resp->free += totalMem - usedMem;
-    }
-  }
-}
-
-void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
-  const int buflen = 256;
-  char buf[buflen + 1];
-  if (h.handle == NULL) {
-    resp->str = strdup("rocm handle not initialized");
-    resp->status = 1;
-    return;
-  }
-  rsmi_version_t ver;
-  rsmi_status_t ret;
-  ret = h.rsmi_version_get(&ver);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
-    resp->status = 1;
-  } else {
-    snprintf(buf, buflen, "%d", ver.major);
-    resp->status = 0;
-  }
-  resp->str = strdup(buf);
-}
-
-#endif  // __APPLE__
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -1,59 +0,0 @@
-#ifndef __APPLE__
-#ifndef __GPU_INFO_ROCM_H__
-#define __GPU_INFO_ROCM_H__
-#include "gpu_info.h"
-
-// Just enough typedef's to dlopen/dlsym for memory information
-typedef enum rsmi_status_return {
-  RSMI_STATUS_SUCCESS = 0,
-  // Other values omitted for now...
-} rsmi_status_t;
-
-typedef enum rsmi_memory_type {
-  RSMI_MEM_TYPE_VRAM = 0,
-  RSMI_MEM_TYPE_VIS_VRAM,
-  RSMI_MEM_TYPE_GTT,
-} rsmi_memory_type_t;
-
- typedef struct {
-     uint32_t major;     
-     uint32_t minor;     
-     uint32_t patch;     
-     const char *build;  
- } rsmi_version_t;
-
-typedef struct rocm_handle {
-  void *handle;
-  uint16_t verbose;
-  rsmi_status_t (*rsmi_init)(uint64_t);
-  rsmi_status_t (*rsmi_shut_down)(void);
-  rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
-  rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
-  rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
-  rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
-  rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
-  rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
-  rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);		
-  rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);		
-  rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);		
-  rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);		
-  rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);		
-  rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);		
-} rocm_handle_t;
-
-typedef struct rocm_init_resp {
-  char *err;  // If err is non-null handle is invalid
-  rocm_handle_t rh;
-} rocm_init_resp_t;
-
-typedef struct rocm_version_resp {
-  rsmi_status_t status;
-  char *str; // Contains version or error string if status != 0 
-} rocm_version_resp_t;
-
-void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
-void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
-void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
-
-#endif  // __GPU_INFO_ROCM_H__
-#endif  // __APPLE__
--- a/llm/dyn_ext_server.c
+++ b/llm/dyn_ext_server.c
@@ -14,17 +14,14 @@
 #define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
 #define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
 #define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
-inline char *LOAD_ERR() {
-  LPSTR messageBuffer = NULL;
-  size_t size = FormatMessageA(
-      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
-          FORMAT_MESSAGE_IGNORE_INSERTS,
-      NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
-      (LPSTR)&messageBuffer, 0, NULL);
-  char *resp = strdup(messageBuffer);
-  LocalFree(messageBuffer);
-  return resp;
-}
+#define LOAD_ERR() ({\
+  LPSTR messageBuffer = NULL; \
+  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
+                                 NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
+  char *resp = strdup(messageBuffer); \
+  LocalFree(messageBuffer); \
+  resp; \
+})
 #else
 #include <dlfcn.h>
 #define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -28,13 +28,13 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
-	"runtime"
 	"strings"
 	"sync"
 	"time"
 	"unsafe"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/gpu"
 )

 type dynExtServer struct {
@@ -72,7 +72,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
 	}
-	updatePath(filepath.Dir(library))
+	gpu.UpdatePath(filepath.Dir(library))
 	libPath := C.CString(library)
 	defer C.free(unsafe.Pointer(libPath))
 	resp := newExtServerResp(512)
@@ -106,7 +106,12 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	sparams.memory_f16 = C.bool(opts.F16KV)
 	sparams.use_mlock = C.bool(opts.UseMLock)
 	sparams.use_mmap = C.bool(opts.UseMMap)
-	sparams.numa = C.bool(opts.UseNUMA)
+
+	if opts.UseNUMA {
+		sparams.numa = C.int(1)
+	} else {
+		sparams.numa = C.int(0)
+	}

 	sparams.lora_adapters = nil
 	for i := 0; i < len(adapters); i++ {
@@ -143,6 +148,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	}

 	slog.Info("Initializing llama server")
+	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
 	initResp := newExtServerResp(128)
 	defer freeExtServerResp(initResp)
 	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
@@ -360,25 +366,3 @@ func (llm *dynExtServer) Close() {
 	C.dyn_llama_server_stop(llm.s)
 	mutex.Unlock()
 }
-
-func updatePath(dir string) {
-	if runtime.GOOS == "windows" {
-		tmpDir := filepath.Dir(dir)
-		pathComponents := strings.Split(os.Getenv("PATH"), ";")
-		i := 0
-		for _, comp := range pathComponents {
-			if strings.EqualFold(comp, dir) {
-				return
-			}
-			// Remove any other prior paths to our temp dir
-			if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
-				pathComponents[i] = comp
-				i++
-			}
-		}
-		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
-		slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
-		os.Setenv("PATH", newPath)
-	}
-	// linux and darwin rely on rpath
-}
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -80,7 +80,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
    params.main_gpu = sparams->main_gpu;
    params.use_mlock = sparams->use_mlock;
    params.use_mmap = sparams->use_mmap;
-    params.numa = sparams->numa;
+    params.numa = (ggml_numa_strategy)sparams->numa;
    params.embedding = sparams->embedding;
    if (sparams->model != NULL) {
      params.model = sparams->model;
@@ -111,7 +111,8 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
    }
 #endif

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    // load the model
    if (!llama->load_model(params)) {
@@ -145,9 +146,9 @@ void llama_server_start() {
      llama->queue_tasks.on_new_task(std::bind(
        &llama_server_context::process_single_task, llama, std::placeholders::_1));
      llama->queue_tasks.on_finish_multitask(std::bind(
-          &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
-      llama->queue_tasks.on_all_tasks_finished(std::bind(
-          &llama_server_context::run_on_all_tasks_finished, llama));
+        &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+      llama->queue_tasks.on_run_slots(std::bind(
+        &llama_server_context::update_slots, llama));
      llama->queue_results.on_multitask_update(std::bind(
          &llama_server_queue::update_multitask,
          &llama->queue_tasks,
@@ -208,7 +209,6 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
 void llama_server_completion_next_result(const int task_id,
                                         ext_server_task_result_t *resp) {
  assert(llama != NULL && resp != NULL);
-  std::string msg;
  resp->id = -1;
  resp->stop = false;
  resp->error = false;
--- a/llm/ext_server/ext_server.h
+++ b/llm/ext_server/ext_server.h
@@ -41,7 +41,7 @@ typedef struct ext_server_params {
  int32_t main_gpu;      // the GPU that is used for scratch and small tensors
  bool use_mlock;        // force system to keep model in RAM
  bool use_mmap;         // use mmap if possible
-  bool numa;             // attempt optimizations that help on some NUMA systems
+  int numa;              // attempt optimizations that help on some NUMA systems
  bool embedding;        // get only sentence embedding
  ext_server_lora_adapter_t *lora_adapters;
  char *mmproj;
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -1,4 +1,4 @@
-# common logic accross linux and darwin
+# common logic across linux and darwin

 init_vars() {
    case "${GOARCH}" in
@@ -101,7 +101,7 @@ compress_libs() {
    pids=""
    rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
-        gzip --best -f ${lib} &
+        gzip -n --best -f ${lib} &
        pids+=" $!"
    done
    echo 
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -179,17 +179,21 @@ fi

 if [ -d "${ROCM_PATH}" ]; then
    echo "ROCm libraries detected - building dynamic ROCm library"
-    if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
-        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
+    if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
+        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

-    # Note: the ROCM libs and runtime library files are too large to embed, so we depend on
-    #       them being present at runtime on the host
+    # Record the ROCM dependencies
+    rm -f "${BUILD_DIR}/lib/deps.txt"
+    touch "${BUILD_DIR}/lib/deps.txt"
+    for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
+        echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
+    done
    compress_libs
 fi

--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -2,19 +2,52 @@

 $ErrorActionPreference = "Stop"

+function amdGPUs {
+    if ($env:AMDGPU_TARGETS) {
+        return $env:AMDGPU_TARGETS
+    }
+    # TODO - load from some common data file for linux + windows build consistency
+    $GPU_LIST = @(
+        "gfx900"
+        "gfx906:xnack-"
+        "gfx908:xnack-"
+        "gfx90a:xnack+"
+        "gfx90a:xnack-"
+        "gfx1010"
+        "gfx1012"
+        "gfx1030"
+        "gfx1100"
+        "gfx1101"
+        "gfx1102"
+    )
+    $GPU_LIST -join ';'
+}
+
 function init_vars {
+    # Verify the environment is a Developer Shell for MSVC 2019
+    write-host $env:VSINSTALLDIR
+    if (($env:VSINSTALLDIR -eq $null)) {
+        Write-Error "`r`nBUILD ERROR - YOUR DEVELOPMENT ENVIRONMENT IS NOT SET UP CORRECTLY`r`nTo build Ollama you must run from an MSVC Developer Shell`r`nSee .\docs\development.md for instructions to set up your dev environment"
+        exit 1
+    }
    $script:SRC_DIR = $(resolve-path "..\..\")
    $script:llamacppDir = "../llama.cpp"
-    $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off",  "-A", "x64")
+    $script:cmakeDefs = @(
+        "-DBUILD_SHARED_LIBS=on",
+        "-DLLAMA_NATIVE=off"
+        )
    $script:cmakeTargets = @("ext_server")
    $script:ARCH = "amd64" # arm not yet supported.
    if ($env:CGO_CFLAGS -contains "-g") {
-        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
+        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
        $script:config = "RelWithDebInfo"
    } else {
-        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
+        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release")
        $script:config = "Release"
    }
+    if ($null -ne $env:CMAKE_SYSTEM_VERSION) {
+        $script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}")
+    }
    # Try to find the CUDA dir
    if ($env:CUDA_LIB_DIR -eq $null) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
@@ -154,9 +187,10 @@ apply_patches
 # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver

-$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
+$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")

-$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+init_vars
+$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
 $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
 write-host "Building LCD CPU"
 build
@@ -164,7 +198,8 @@ install
 sign
 compress_libs

-$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+init_vars
+$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
 $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
 write-host "Building AVX CPU"
 build
@@ -172,7 +207,8 @@ install
 sign
 compress_libs

-$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
+init_vars
+$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
 $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
 write-host "Building AVX2 CPU"
 build
@@ -189,18 +225,51 @@ if ($null -ne $script:CUDA_LIB_DIR) {
    }
    init_vars
    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+    write-host "Building CUDA"
    build
    install
    sign
    compress_libs
 }
-# TODO - actually implement ROCm support on windows
-$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"

-rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
-md "${script:buildDir}/lib" -ea 0 > $null
-echo $null >> "${script:buildDir}/lib/.generated"
+if ($null -ne $env:HIP_PATH) {
+    $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
+    if ($null -ne $script:ROCM_VERSION) {
+        $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
+    }
+
+    init_vars
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
+    $script:cmakeDefs += @(
+        "-G", "Ninja", 
+        "-DCMAKE_C_COMPILER=clang.exe",
+        "-DCMAKE_CXX_COMPILER=clang++.exe",
+        "-DLLAMA_HIPBLAS=on",
+        "-DLLAMA_AVX=on",
+        "-DLLAMA_AVX2=off",
+        "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
+        "-DAMDGPU_TARGETS=$(amdGPUs)",
+        "-DGPU_TARGETS=$(amdGPUs)"
+        )
+
+    # Make sure the ROCm binary dir is first in the path
+    $env:PATH="$env:HIP_PATH\bin;$env:VSINSTALLDIR\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja;$env:PATH"
+
+    # We have to clobber the LIB var from the developer shell for clang to work properly
+    $env:LIB=""
+
+    write-host "Building ROCm"
+    build
+    # Ninja doesn't prefix with config name
+    ${script:config}=""
+    install
+    if ($null -ne $script:DUMPBIN) {
+        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
+    }
+    sign
+    compress_libs
+}

 cleanup
 write-host "`ngo generate completed"
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -31,6 +31,11 @@ const (
 	fileTypeQ5_K_S
 	fileTypeQ5_K_M
 	fileTypeQ6_K
+	fileTypeIQ2_XXS
+	fileTypeIQ2_XS
+	fileTypeQ2_K_S
+	fileTypeQ3_K_XS
+	fileTypeIQ3_XXS
 )

 func fileType(fileType uint32) string {
@@ -69,6 +74,16 @@ func fileType(fileType uint32) string {
 		return "Q5_K_M"
 	case fileTypeQ6_K:
 		return "Q6_K"
+	case fileTypeIQ2_XXS:
+		return "IQ2_XXS"
+	case fileTypeIQ2_XS:
+		return "IQ2_XS"
+	case fileTypeQ2_K_S:
+		return "Q2_K_S"
+	case fileTypeQ3_K_XS:
+		return "Q3_K_XS"
+	case fileTypeIQ3_XXS:
+		return "IQ3_XXS"
 	default:
 		return "unknown"
 	}
@@ -148,9 +163,9 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	case FILE_MAGIC_GGLA:
 		c = &containerLORA{}
 	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{bo: binary.LittleEndian}
+		c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
 	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{bo: binary.BigEndian}
+		c = &ContainerGGUF{ByteOrder: binary.BigEndian}
 	default:
 		return nil, errors.New("invalid file magic")
 	}
--- a/llm/gguf.go
+++ b/llm/gguf.go
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -19,7 +19,7 @@ type LLM interface {
 	Close()
 }

-func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -120,15 +120,15 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)

 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
-	return newLlmServer(info, workDir, model, adapters, projectors, opts)
+	return newLlmServer(info, model, adapters, projectors, opts)
 }

 // Give any native cgo implementations an opportunity to initialize
-func Init(workdir string) error {
-	return nativeInit(workdir)
+func Init() error {
+	return nativeInit()
 }

-func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)

 	// Check to see if the user has requested a specific library instead of auto-detecting
@@ -147,7 +147,7 @@ func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projecto
 	_, err := os.Stat(dynLibs[0])
 	if err != nil {
 		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
-		err = nativeInit(workDir)
+		err = nativeInit()
 		if err != nil {
 			return nil, err
 		}
--- a/llm/patches/01-cache.diff
+++ b/llm/patches/01-cache.diff
@@ -1,9 +1,9 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index d86d7e04..2694e92e 100644
+index 2b2f4a0f..afac49af 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -901,13 +901,15 @@ struct llama_server_context
-                 slot.sent_count += result.text_to_send.size();
+@@ -997,13 +997,15 @@ struct llama_server_context
+                 slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
             }
 -            slot.add_token_string(result);
--- a/llm/patches/02-cudaleaks.diff
+++ b/llm/patches/02-cudaleaks.diff
@@ -1,30 +1,29 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 3102762c..568ac1d0 100644
+index 2b2f4a0f..25857bdd 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -307,6 +307,10 @@ struct llama_client_slot
-     }
- };
+@@ -31,6 +31,10 @@
+ #include <atomic>
+ #include <signal.h>
 
 +#ifdef GGML_USE_CUBLAS
 +extern "C" GGML_CALL void ggml_free_cublas(void);
 +#endif
 +
- struct llama_server_context
- {
-     llama_model *model = nullptr;
-@@ -353,6 +357,10 @@ struct llama_server_context
+ using json = nlohmann::json;
+ 
+ struct server_params {
+@@ -363,6 +367,9 @@ struct llama_server_context
             llama_free_model(model);
             model = nullptr;
         }
 +#ifdef GGML_USE_CUBLAS
 +        ggml_free_cublas();
 +#endif
-+
     }
 
     bool load_model(const gpt_params &params_)
-@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
+@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
     sigemptyset (&sigint_action.sa_mask);
     sigint_action.sa_flags = 0;
     sigaction(SIGINT, &sigint_action, NULL);
@@ -32,16 +31,11 @@ index 3102762c..568ac1d0 100644
 #elif defined (_WIN32)
     auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
         return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
-     llama_backend_free();
-     return 0;
- }
-+
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 96976f24..3543920e 100644
+index 0c6501e9..75c12723 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -39,6 +39,7 @@
+@@ -43,6 +43,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@@ -49,30 +43,30 @@ index 96976f24..3543920e 100644
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
 
-+static bool g_cublas_initialized = false;
-+
- GGML_CALL void ggml_init_cublas() {
+-GGML_CALL void ggml_init_cublas() {
 -    static bool initialized = false;
+static bool g_cublas_initialized = false;
 
 -    if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
 +    if (!g_cublas_initialized) {
 
 #ifdef __HIP_PLATFORM_AMD__
         // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
 #endif
 
         if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
 -            initialized = true;
 +            g_cublas_initialized = true;
             g_cublas_loaded = false;
+             fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
             return;
-         }
-@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
@@ -81,7 +75,7 @@ index 96976f24..3543920e 100644
         g_cublas_loaded = true;
     }
 }
-@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
     }
     return device_count;
 }
@@ -89,28 +83,32 @@ index 96976f24..3543920e 100644
 +extern "C" GGML_CALL void ggml_free_cublas(void);
 +GGML_CALL void ggml_free_cublas(void) {
 +    for (int id = 0; id < g_device_count; ++id) {
-+#if !defined(GGML_USE_HIPBLAS)
-+        CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
-+        g_cuda_pool_size[id] = 0;
-+        g_cuda_pool_addr[id] = 0;
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+        if (g_device_caps[id].vmm) {
+            CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+            g_cuda_pool_size[id] = 0;
+            g_cuda_pool_addr[id] = 0;
+        }
 +#endif
+        // TODO: free legacy non-vmm memory
+        // destroy cublas handle
 +        CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
 +        g_cublas_handles[id] = nullptr;
 +    }
+
 +    g_cublas_initialized = false;
 +}
-\ No newline at end of file
 diff --git a/ggml-cuda.h b/ggml-cuda.h
-index b1ebd61d..b4c80c2c 100644
+index b1ebd61d..6dd58ddf 100644
 --- a/ggml-cuda.h
 +++ b/ggml-cuda.h
-@@ -20,6 +20,9 @@ extern "C" {
- // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
- GGML_API GGML_CALL void   ggml_init_cublas(void);
+@@ -23,6 +23,9 @@ GGML_API GGML_CALL void   ggml_init_cublas(void);
+ // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+ GGML_API GGML_CALL bool   ggml_cublas_loaded(void);
 
 +// Release CUDA resources
 +GGML_API GGML_CALL void   ggml_free_cublas(void);
 +
- // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
- GGML_API GGML_CALL bool   ggml_cublas_loaded(void);
+ GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
+ GGML_API GGML_CALL void   ggml_cuda_host_free(void * ptr);
 
--- a/llm/patches/02-shutdown.diff
+++ b/llm/patches/02-shutdown.diff
@@ -1,96 +0,0 @@
-diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index a0b46970..7800c6e7 100644
--- a/examples/server/server.cpp
-+++ b/examples/server/server.cpp
-@@ -28,6 +28,7 @@
- #include <chrono>
- #include <condition_variable>
- #include <atomic>
-+#include <signal.h>
- 
- using json = nlohmann::json;
- 
-@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
-     }
- }
- 
-+std::function<void(int)> shutdown_handler;
-+inline void signal_handler(int signal) { shutdown_handler(signal); }
-+
- int main(int argc, char **argv)
- {
- #if SERVER_VERBOSE != 1
-@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
-         std::placeholders::_2,
-         std::placeholders::_3
-     ));
-    llama.queue_tasks.start_loop();
- 
-+    shutdown_handler = [&](int) {
-+        llama.queue_tasks.terminate();
-+    };
-+
-+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-+    struct sigaction sigint_action;
-+    sigint_action.sa_handler = signal_handler;
-+    sigemptyset (&sigint_action.sa_mask);
-+    sigint_action.sa_flags = 0;
-+    sigaction(SIGINT, &sigint_action, NULL);
-+#elif defined (_WIN32)
-+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-+    };
-+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-+#endif
-+    llama.queue_tasks.start_loop();
-+    svr.stop();
-     t.join();
- 
-     llama_backend_free();
-diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
-index 54854896..0ee670db 100644
--- a/examples/server/utils.hpp
-+++ b/examples/server/utils.hpp
-@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
- struct llama_server_queue {
-     int id = 0;
-     std::mutex mutex_tasks;
-+    bool running;
-     // queues
-     std::vector<task_server> queue_tasks;
-     std::vector<task_server> queue_tasks_deferred;
-@@ -278,9 +279,18 @@ struct llama_server_queue {
-         queue_tasks_deferred.clear();
-     }
- 
-    // Start the main loop. This call is blocking
-    [[noreturn]]
-+    // end the start_loop routine
-+    void terminate() {
-+        {
-+            std::unique_lock<std::mutex> lock(mutex_tasks);
-+            running = false;
-+        }
-+        condition_tasks.notify_all();
-+    }
-+
-+    // Start the main loop.
-     void start_loop() {
-+        running = true;
-         while (true) {
-             // new task arrived
-             LOG_VERBOSE("have new task", {});
-@@ -324,8 +334,12 @@ struct llama_server_queue {
-             {
-                 std::unique_lock<std::mutex> lock(mutex_tasks);
-                 if (queue_tasks.empty()) {
-+                    if (!running) {
-+                        LOG_VERBOSE("ending start_loop", {});
-+                        return;
-+                    }
-                     condition_tasks.wait(lock, [&]{
-                        return !queue_tasks.empty();
-+                        return (!queue_tasks.empty() || !running);
-                     });
-                 }
-             }
--- a/llm/payload_common.go
+++ b/llm/payload_common.go
@@ -103,10 +103,14 @@ func rocmDynLibPresent() bool {
 	return false
 }

-func nativeInit(workdir string) error {
+func nativeInit() error {
 	slog.Info("Extracting dynamic libraries...")
+	assetsDir, err := gpu.AssetsDir()
+	if err != nil {
+		return err
+	}
 	if runtime.GOOS == "darwin" {
-		err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
+		err := extractPayloadFiles(assetsDir, "llama.cpp/ggml-metal.metal")
 		if err != nil {
 			if err == payloadMissing {
 				// TODO perhaps consider this a hard failure on arm macs?
@@ -115,10 +119,10 @@ func nativeInit(workdir string) error {
 			}
 			return err
 		}
-		os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
+		os.Setenv("GGML_METAL_PATH_RESOURCES", assetsDir)
 	}

-	libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*")
+	libs, err := extractDynamicLibs(assetsDir, "llama.cpp/build/*/*/*/lib/*")
 	if err != nil {
 		if err == payloadMissing {
 			slog.Info(fmt.Sprintf("%s", payloadMissing))
@@ -149,17 +153,13 @@ func nativeInit(workdir string) error {
 	return nil
 }

-func extractDynamicLibs(workDir, glob string) ([]string, error) {
+func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
 	files, err := fs.Glob(libEmbed, glob)
 	if err != nil || len(files) == 0 {
 		return nil, payloadMissing
 	}
 	libs := []string{}

-	// TODO consider making this idempotent with some sort of persistent directory (where we store models probably)
-	// and tracking by version so we don't reexpand the files every time
-	// Also maybe consider lazy loading only what is needed
-
 	g := new(errgroup.Group)
 	for _, file := range files {
 		pathComps := strings.Split(file, "/")
@@ -172,14 +172,14 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
 		g.Go(func() error {
 			// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
 			// Include the variant in the path to avoid conflicts between multiple server libs
-			targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
+			targetDir := filepath.Join(assetsDir, pathComps[pathComponentCount-3])
 			srcFile, err := libEmbed.Open(file)
 			if err != nil {
 				return fmt.Errorf("read payload %s: %v", file, err)
 			}
 			defer srcFile.Close()
 			if err := os.MkdirAll(targetDir, 0o755); err != nil {
-				return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
+				return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err)
 			}
 			src := io.Reader(srcFile)
 			filename := file
@@ -196,19 +196,13 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
 				libs = append(libs, destFile)
 			}

-			_, err = os.Stat(destFile)
-			switch {
-			case errors.Is(err, os.ErrNotExist):
-				destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-				if err != nil {
-					return fmt.Errorf("write payload %s: %v", file, err)
-				}
-				defer destFile.Close()
-				if _, err := io.Copy(destFile, src); err != nil {
-					return fmt.Errorf("copy payload %s: %v", file, err)
-				}
-			case err != nil:
-				return fmt.Errorf("stat payload %s: %v", file, err)
+			destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+			if err != nil {
+				return fmt.Errorf("write payload %s: %v", file, err)
+			}
+			defer destFp.Close()
+			if _, err := io.Copy(destFp, src); err != nil {
+				return fmt.Errorf("copy payload %s: %v", file, err)
 			}
 			return nil
 		})
@@ -216,7 +210,7 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
 	return libs, g.Wait()
 }

-func extractPayloadFiles(workDir, glob string) error {
+func extractPayloadFiles(assetsDir, glob string) error {
 	files, err := fs.Glob(libEmbed, glob)
 	if err != nil || len(files) == 0 {
 		return payloadMissing
@@ -228,8 +222,8 @@ func extractPayloadFiles(workDir, glob string) error {
 			return fmt.Errorf("read payload %s: %v", file, err)
 		}
 		defer srcFile.Close()
-		if err := os.MkdirAll(workDir, 0o755); err != nil {
-			return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
+		if err := os.MkdirAll(assetsDir, 0o755); err != nil {
+			return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err)
 		}
 		src := io.Reader(srcFile)
 		filename := file
@@ -241,20 +235,22 @@ func extractPayloadFiles(workDir, glob string) error {
 			filename = strings.TrimSuffix(filename, ".gz")
 		}

-		destFile := filepath.Join(workDir, filepath.Base(filename))
+		destFile := filepath.Join(assetsDir, filepath.Base(filename))
 		_, err = os.Stat(destFile)
 		switch {
 		case errors.Is(err, os.ErrNotExist):
-			destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+			destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
 			if err != nil {
 				return fmt.Errorf("write payload %s: %v", file, err)
 			}
-			defer destFile.Close()
-			if _, err := io.Copy(destFile, src); err != nil {
+			defer destFp.Close()
+			if _, err := io.Copy(destFp, src); err != nil {
 				return fmt.Errorf("copy payload %s: %v", file, err)
 			}
 		case err != nil:
 			return fmt.Errorf("stat payload %s: %v", file, err)
+		case err == nil:
+			slog.Debug("payload already exists: " + destFile)
 		}
 	}
 	return nil
--- a/llm/payload_linux.go
+++ b/llm/payload_linux.go
@@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed llama.cpp/build/linux/*/*/lib/*.so*
+//go:embed llama.cpp/build/linux/*/*/lib/*
 var libEmbed embed.FS
--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
@@ -5,13 +5,15 @@ set -eu
 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"

+IMAGE_NAME=${IMAGE_NAME:-"ollama/ollama"}
+BUILD_PLATFORM=${BUILD_PLATFORM:-"linux/arm64,linux/amd64"}
 docker build \
    --load \
-    --platform=linux/arm64,linux/amd64 \
+    --platform=${BUILD_PLATFORM} \
    --build-arg=VERSION \
    --build-arg=GOFLAGS \
    -f Dockerfile \
-    -t ollama/ollama:$VERSION \
+    -t ${IMAGE_NAME}:$VERSION \
    .

 docker build \
@@ -21,5 +23,12 @@ docker build \
    --build-arg=GOFLAGS \
    --target runtime-rocm \
    -f Dockerfile \
-    -t ollama/ollama:$VERSION-rocm \
+    -t ${IMAGE_NAME}:$VERSION-rocm \
    .
+
+docker tag ${IMAGE_NAME}:$VERSION ${IMAGE_NAME}:latest
+docker tag ${IMAGE_NAME}:$VERSION-rocm ${IMAGE_NAME}:rocm
+
+echo "To release, run:"
+echo "  docker push ${IMAGE_NAME}:$VERSION && docker push ${IMAGE_NAME}:latest"
+echo "  docker push ${IMAGE_NAME}:$VERSION-rocm && docker push ${IMAGE_NAME}:rocm"
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -22,5 +22,6 @@ for TARGETARCH in ${BUILD_ARCH}; do
        .
    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
    docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
+    docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/
    docker rm builder-$TARGETARCH
 done
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -53,13 +53,14 @@ function buildOllama() {
    write-host "Building ollama CLI"
    & go generate ./...
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & go build "-ldflags=-w -s ""-X=github.com/jmorganca/ollama/version.Version=$script:VERSION"" ""-X=github.com/jmorganca/ollama/server.mode=release""" .
+    & go build -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    if ("${env:KEY_CONTAINER}") {
        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    }
+    New-Item -ItemType Directory -Path .\dist -Force
    cp .\ollama.exe .\dist\ollama-windows-amd64.exe
 }

@@ -67,7 +68,7 @@ function buildApp() {
    write-host "Building Ollama App"
    cd "${script:SRC_DIR}\app"
    & windres -l 0 -o ollama.syso ollama.rc
-    & go build "-ldflags=-H windowsgui -w -s ""-X=github.com/jmorganca/ollama/version.Version=$script:VERSION"" ""-X=github.com/jmorganca/ollama/server.mode=release""" .
+    & go build -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    if ("${env:KEY_CONTAINER}") {
        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
@@ -129,4 +130,4 @@ try {
 } finally {
    set-location $script:SRC_DIR
    $env:PKG_VERSION=""
-}
+}
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -72,7 +72,7 @@ $SUDO install -o0 -g0 -m755 -d $BINDIR
 $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama

 install_success() { 
-    status 'The Ollama API is now available at 0.0.0.0:11434.'
+    status 'The Ollama API is now available at 127.0.0.1:11434.'
    status 'Install complete. Run "ollama" from the command line.'
 }
 trap install_success EXIT
@@ -88,6 +88,10 @@ configure_systemd() {
        status "Adding ollama user to render group..."
        $SUDO usermod -a -G render ollama
    fi
+    if getent group video >/dev/null 2>&1; then
+        status "Adding ollama user to video group..."
+        $SUDO usermod -a -G video ollama
+    fi

    status "Adding current user to ollama group..."
    $SUDO usermod -a -G ollama $(whoami)
--- a/server/download.go
+++ b/server/download.go
@@ -12,6 +12,7 @@ import (
 	"net/url"
 	"os"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -20,6 +21,7 @@ import (
 	"time"

 	"golang.org/x/sync/errgroup"
+	"golang.org/x/sync/semaphore"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/format"
@@ -138,30 +140,29 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 }

 func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *registryOptions) {
-	b.err = b.run(ctx, requestURL, opts)
-}
-
-func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *registryOptions) error {
 	defer blobDownloadManager.Delete(b.Digest)
 	ctx, b.CancelFunc = context.WithCancel(ctx)

 	file, err := os.OpenFile(b.Name+"-partial", os.O_CREATE|os.O_RDWR, 0o644)
 	if err != nil {
-		return err
+		b.err = err
+		return
 	}
 	defer file.Close()

 	_ = file.Truncate(b.Total)

-	g, inner := errgroup.WithContext(ctx)
-	g.SetLimit(numDownloadParts)
+	limit := int64(runtime.NumCPU())
+	g, inner := NewLimitGroup(ctx, numDownloadParts, limit)
+	go watchDelta(inner, g, &b.Completed, limit)
+
 	for i := range b.Parts {
 		part := b.Parts[i]
 		if part.Completed == part.Size {
 			continue
 		}

-		g.Go(func() error {
+		g.Go(inner, func() error {
 			var err error
 			for try := 0; try < maxRetries; try++ {
 				w := io.NewOffsetWriter(file, part.StartsAt())
@@ -188,26 +189,29 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 	}

 	if err := g.Wait(); err != nil {
-		return err
+		b.err = err
+		return
 	}

 	// explicitly close the file so we can rename it
 	if err := file.Close(); err != nil {
-		return err
+		b.err = err
+		return
 	}

 	for i := range b.Parts {
 		if err := os.Remove(file.Name() + "-" + strconv.Itoa(i)); err != nil {
-			return err
+			b.err = err
+			return
 		}
 	}

 	if err := os.Rename(file.Name(), b.Name); err != nil {
-		return err
+		b.err = err
+		return
 	}

 	b.done = true
-	return nil
 }

 func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *registryOptions) error {
@@ -377,3 +381,87 @@ func downloadBlob(ctx context.Context, opts downloadOpts) error {

 	return download.Wait(ctx, opts.fn)
 }
+
+type LimitGroup struct {
+	*errgroup.Group
+	*semaphore.Weighted
+	size, limit int64
+}
+
+func NewLimitGroup(ctx context.Context, size, limit int64) (*LimitGroup, context.Context) {
+	g, ctx := errgroup.WithContext(ctx)
+	return &LimitGroup{
+		Group:    g,
+		Weighted: semaphore.NewWeighted(size),
+		size:     size,
+		limit:    limit,
+	}, ctx
+}
+
+func (g *LimitGroup) Go(ctx context.Context, fn func() error) {
+	var weight int64 = 1
+	if g.limit > 0 {
+		weight = g.size / g.limit
+	}
+
+	_ = g.Acquire(ctx, weight)
+	if ctx.Err() != nil {
+		return
+	}
+
+	g.Group.Go(func() error {
+		defer g.Release(weight)
+		return fn()
+	})
+}
+
+func (g *LimitGroup) SetLimit(limit int64) {
+	if limit > g.limit {
+		g.limit = limit
+	}
+}
+
+func watchDelta(ctx context.Context, g *LimitGroup, c *atomic.Int64, limit int64) {
+	var maxDelta float64
+	var buckets []int64
+
+	// 3s ramp up period
+	nextUpdate := time.Now().Add(3 * time.Second)
+
+	ticker := time.NewTicker(time.Second)
+	for {
+		select {
+		case <-ticker.C:
+			buckets = append(buckets, c.Load())
+			if len(buckets) < 2 {
+				continue
+			} else if len(buckets) > 10 {
+				buckets = buckets[1:]
+			}
+
+			delta := float64((buckets[len(buckets)-1] - buckets[0])) / float64(len(buckets))
+			slog.Debug("", "limit", limit, "delta", format.HumanBytes(int64(delta)), "max_delta", format.HumanBytes(int64(maxDelta)))
+
+			if time.Now().Before(nextUpdate) {
+				// quiet period; do not update ccy if recently updated
+				continue
+			} else if maxDelta > 0 {
+				x := delta / maxDelta
+				if x < 1 {
+					continue
+				}
+
+				limit += min(int64(x), int64(runtime.NumCPU()))
+				slog.Debug("setting", "limit", limit)
+				g.SetLimit(limit)
+			}
+
+			// 3s cooldown period
+			nextUpdate = time.Now().Add(2 * time.Second)
+			maxDelta = delta
+
+		case <-ctx.Done():
+			return
+		}
+	}
+}
--- a/server/images.go
+++ b/server/images.go
@@ -1,6 +1,7 @@
 package server

 import (
+	"archive/zip"
 	"bytes"
 	"context"
 	"crypto/sha256"
@@ -23,6 +24,7 @@ import (
 	"golang.org/x/exp/slices"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/convert"
 	"github.com/jmorganca/ollama/llm"
 	"github.com/jmorganca/ollama/parser"
 	"github.com/jmorganca/ollama/version"
@@ -52,6 +54,10 @@ type Model struct {
 	Messages       []Message
 }

+func (m *Model) IsEmbedding() bool {
+	return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
+}
+
 type Message struct {
 	Role    string `json:"role"`
 	Content string `json:"content"`
@@ -312,7 +318,24 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 				c.Args = blobPath
 			}

-			bin, err := os.Open(realpath(modelFileDir, c.Args))
+			pathName := realpath(modelFileDir, c.Args)
+
+			ggufName, err := convertSafetensors(name, pathName)
+			if err != nil {
+				switch {
+				case errors.Is(err, zip.ErrFormat):
+					// it's not a safetensor archive
+				default:
+					return err
+				}
+			}
+
+			if ggufName != "" {
+				pathName = ggufName
+				defer os.RemoveAll(ggufName)
+			}
+
+			bin, err := os.Open(pathName)
 			if err != nil {
 				// not a file on disk so must be a model reference
 				modelpath := ParseModelPath(c.Args)
@@ -588,6 +611,73 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 	return nil
 }

+func convertSafetensors(name, fn string) (string, error) {
+	r, err := zip.OpenReader(fn)
+	if err != nil {
+		return "", err
+	}
+	defer r.Close()
+
+	tempDir, err := os.MkdirTemp("", "ollama-convert")
+	if err != nil {
+		return "", err
+	}
+	defer os.RemoveAll(tempDir)
+
+	for _, f := range r.File {
+		fpath := filepath.Join(tempDir, f.Name)
+		outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
+		if err != nil {
+			return "", err
+		}
+
+		rc, err := f.Open()
+		if err != nil {
+			return "", err
+		}
+
+		_, err = io.Copy(outFile, rc)
+		if err != nil {
+			return "", err
+		}
+
+		outFile.Close()
+		rc.Close()
+	}
+
+	params, err := convert.GetParams(tempDir)
+	if err != nil {
+		return "", err
+	}
+
+	SupportedArchs := []string{
+		"MistralForCausalLM",
+	}
+
+	for _, arch := range params.Architectures {
+		if !slices.Contains(SupportedArchs, arch) {
+			return "", fmt.Errorf("this safetensors model is not yet supported")
+		}
+	}
+
+	t, err := convert.GetSafeTensors(tempDir)
+	if err != nil {
+		return "", err
+	}
+
+	vocab, err := convert.LoadTokens(tempDir)
+	if err != nil {
+		return "", err
+	}
+
+	fn, err = convert.WriteGGUF(name, t, params, vocab)
+	if err != nil {
+		return "", err
+	}
+
+	return fn, nil
+}
+
 func CopyModel(src, dest string) error {
 	srcModelPath := ParseModelPath(src)
 	srcPath, err := srcModelPath.GetManifestPath()
@@ -1103,18 +1193,7 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 		req.ContentLength = contentLength
 	}

-	proxyURL, err := http.ProxyFromEnvironment(req)
-	if err != nil {
-		return nil, err
-	}
-
-	client := http.Client{
-		Transport: &http.Transport{
-			Proxy: http.ProxyURL(proxyURL),
-		},
-	}
-
-	resp, err := client.Do(req)
+	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
 	}
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -121,13 +121,15 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
 				p = prompt{}
 			}

-			p.Prompt = msg.Content
-
+			var sb strings.Builder
 			for range msg.Images {
-				p.Prompt += fmt.Sprintf(" [img-%d]", imgId)
+				fmt.Fprintf(&sb, "[img-%d] ", imgId)
 				p.images = append(p.images, imgId)
 				imgId += 1
 			}
+
+			sb.WriteString(msg.Content)
+			p.Prompt = sb.String()
 		case "assistant":
 			if p.Response != "" {
 				prompts = append(prompts, p)
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -155,7 +155,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("base64")}},
 			},
 			window: 1024,
-			want:   "You are a Wizard. Hello [img-0]",
+			want:   "You are a Wizard. [img-0] Hello",
 		},
 		{
 			name:     "images truncated",
@@ -165,7 +165,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("img1"), []byte("img2")}},
 			},
 			window: 1024,
-			want:   "You are a Wizard. Hello [img-1]",
+			want:   "You are a Wizard. [img-0] [img-1] Hello",
 		},
 		{
 			name:     "empty list",
@@ -198,7 +198,7 @@ func TestChatPrompt(t *testing.T) {
 			}

 			if got != tc.want {
-				t.Errorf("got = %v, want %v", got, tc.want)
+				t.Errorf("got: %q, want: %q", got, tc.want)
 			}
 		})
 	}
--- a/server/routes.go
+++ b/server/routes.go
@@ -66,8 +66,6 @@ var defaultSessionDuration = 5 * time.Minute

 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
 func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
-	workDir := c.GetString("workDir")
-
 	needLoad := loaded.runner == nil || // is there a model loaded?
 		loaded.ModelPath != model.ModelPath || // has the base model changed?
 		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@@ -82,7 +80,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
 			loaded.Options = nil
 		}

-		llmRunner, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+		llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -191,6 +189,11 @@ func GenerateHandler(c *gin.Context) {
 		return
 	}

+	if model.IsEmbedding() {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "embedding models do not support generate"})
+		return
+	}
+
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
 		if errors.Is(err, api.ErrInvalidOpts) {
@@ -245,6 +248,19 @@ func GenerateHandler(c *gin.Context) {
 		slog.Debug("generate handler", "system", req.System)

 		var sb strings.Builder
+		for i := range req.Images {
+			fmt.Fprintf(&sb, "[img-%d] ", i)
+		}
+
+		sb.WriteString(req.Prompt)
+
+		p, err := Prompt(req.Template, req.System, sb.String(), "", true)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
+		}
+
+		sb.Reset()
 		if req.Context != nil {
 			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
 			if err != nil {
@@ -255,18 +271,6 @@ func GenerateHandler(c *gin.Context) {
 			sb.WriteString(prev)
 		}

-		// write image tags
-		// TODO: limit the number of images to fit in the context similar to the chat endpoint
-		for i := range req.Images {
-			req.Prompt += fmt.Sprintf(" [img-%d]", i)
-		}
-
-		p, err := Prompt(req.Template, req.System, req.Prompt, "", true)
-		if err != nil {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-			return
-		}
-
 		sb.WriteString(p)

 		prompt = sb.String()
@@ -379,7 +383,7 @@ func GenerateHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func EmbeddingHandler(c *gin.Context) {
+func EmbeddingsHandler(c *gin.Context) {
 	loaded.mu.Lock()
 	defer loaded.mu.Unlock()

@@ -432,8 +436,9 @@ func EmbeddingHandler(c *gin.Context) {
 		return
 	}

-	if !loaded.Options.EmbeddingOnly {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "embedding option must be set to true"})
+	// an empty request loads the model
+	if req.Prompt == "" {
+		c.JSON(http.StatusOK, api.EmbeddingResponse{Embedding: []float64{}})
 		return
 	}

@@ -942,7 +947,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r.POST("/api/pull", PullModelHandler)
 	r.POST("/api/generate", GenerateHandler)
 	r.POST("/api/chat", ChatHandler)
-	r.POST("/api/embeddings", EmbeddingHandler)
+	r.POST("/api/embeddings", EmbeddingsHandler)
 	r.POST("/api/create", CreateModelHandler)
 	r.POST("/api/push", PushModelHandler)
 	r.POST("/api/copy", CopyModelHandler)
@@ -1028,7 +1033,7 @@ func Serve(ln net.Listener) error {
 		os.Exit(0)
 	}()

-	if err := llm.Init(s.WorkDir); err != nil {
+	if err := llm.Init(); err != nil {
 		return fmt.Errorf("unable to initialize llm library %w", err)
 	}
 	if runtime.GOOS == "linux" { // TODO - windows too
@@ -1143,6 +1148,11 @@ func ChatHandler(c *gin.Context) {
 		return
 	}

+	if model.IsEmbedding() {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "embedding models do not support chat"})
+		return
+	}
+
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
 		if errors.Is(err, api.ErrInvalidOpts) {
--- a/server/upload.go
+++ b/server/upload.go
@@ -12,14 +12,12 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"strings"
 	"sync"
 	"sync/atomic"
 	"time"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/format"
-	"golang.org/x/sync/errgroup"
 )

 var blobUploadManager sync.Map
@@ -138,14 +136,17 @@ func (b *blobUpload) Run(ctx context.Context, opts *registryOptions) {
 	}
 	defer b.file.Close()

-	g, inner := errgroup.WithContext(ctx)
-	g.SetLimit(numUploadParts)
+	var limit int64 = 2
+	g, inner := NewLimitGroup(ctx, numUploadParts, limit)
+	go watchDelta(inner, g, &b.Completed, limit)
+
 	for i := range b.Parts {
 		part := &b.Parts[i]
 		select {
 		case <-inner.Done():
+			break
 		case requestURL := <-b.nextURL:
-			g.Go(func() error {
+			g.Go(inner, func() error {
 				var err error
 				for try := 0; try < maxRetries; try++ {
 					err = b.uploadPart(inner, http.MethodPatch, requestURL, part, opts)
@@ -177,16 +178,14 @@ func (b *blobUpload) Run(ctx context.Context, opts *registryOptions) {
 	requestURL := <-b.nextURL

 	// calculate md5 checksum and add it to the commit request
-	var sb strings.Builder
+	md5sum := md5.New()
 	for _, part := range b.Parts {
-		sb.Write(part.Sum(nil))
+		md5sum.Write(part.Sum(nil))
 	}

-	md5sum := md5.Sum([]byte(sb.String()))
-
 	values := requestURL.Query()
 	values.Add("digest", b.Digest)
-	values.Add("etag", fmt.Sprintf("%x-%d", md5sum, len(b.Parts)))
+	values.Add("etag", fmt.Sprintf("%x-%d", md5sum.Sum(nil), len(b.Parts)))
 	requestURL.RawQuery = values.Encode()

 	headers := make(http.Header)
Author	SHA1	Message	Date
Michael Yang	39374984ef	set limit to increment	2024-03-07 17:31:45 -08:00
Michael Yang	daf928fe1a	tune concurrency manager - higher initial concurrency - lower cooldown after ramping up - lower threshold for ramp up	2024-03-07 16:25:17 -08:00
Blake Mizerany	2ada81e068	cmd: tighten up env var usage sections (#2962 ) Also, document OLLAMA_HOST client semantics per command that honors it. This looks nicer than having a general puprose environment variable section in the root usage which was showing up after the "addition help topics" section outputed by Cobra's default template. It was decided this was easier to work with than using a custom template for Cobra right now.	2024-03-07 13:57:07 -08:00
Michael Yang	f678f5c5c3	Merge pull request #2991 from ollama/mxyng/fix-ci fix ci	2024-03-07 11:35:06 -08:00
Michael Yang	2cb74e23fb	fix ci	2024-03-07 11:33:49 -08:00
Daniel Hiltgen	3c8df3808b	Merge pull request #2885 from dhiltgen/rocm_v6_only Revamp ROCm support	2024-03-07 10:51:00 -08:00
Michael Yang	7d564835c2	Merge pull request #2985 from ollama/rm-empty-examples remove empty examples	2024-03-07 10:49:40 -08:00
Michael Yang	72431031d9	no ci test on docs, examples	2024-03-07 10:44:48 -08:00
Michael Yang	6041abb5b2	remove empty examples	2024-03-07 10:40:32 -08:00
Daniel Hiltgen	6c5ccb11f9	Revamp ROCm support This refines where we extract the LLM libraries to by adding a new OLLAMA_HOME env var, that defaults to `~/.ollama` The logic was already idempotenent, so this should speed up startups after the first time a new release is deployed. It also cleans up after itself. We now build only a single ROCm version (latest major) on both windows and linux. Given the large size of ROCms tensor files, we split the dependency out. It's bundled into the installer on windows, and a separate download on windows. The linux install script is now smart and detects the presence of AMD GPUs and looks to see if rocm v6 is already present, and if not, then downloads our dependency tar file. For Linux discovery, we now use sysfs and check each GPU against what ROCm supports so we can degrade to CPU gracefully instead of having llama.cpp+rocm assert/crash on us. For Windows, we now use go's windows dynamic library loading logic to access the amdhip64.dll APIs to query the GPU information.	2024-03-07 10:36:50 -08:00
Michael Yang	2e20110e50	Merge pull request #2221 from ollama/mxyng/up-down-ccy adjust download and upload concurrency based on available bandwidth	2024-03-07 09:27:33 -08:00
Daniel Hiltgen	82ddc3e441	Merge pull request #2964 from dhiltgen/mem_limit_var Allow setting max vram for workarounds	2024-03-07 09:25:44 -08:00
Jeffrey Morgan	d481fb3cc8	update go to 1.22 in other places (#2975 )	2024-03-07 07:39:49 -08:00
DJ Johnson	23ee633252	docs: Add LLM-X to Web Integration section (#2759 )	2024-03-07 10:11:53 -05:00
John	23ebe8fe11	fix some typos (#2973 ) Signed-off-by: hishope <csqiye@126.com>	2024-03-06 22:50:11 -08:00
Patrick Devine	2c017ca441	Convert Safetensors to an Ollama model (#2824 )	2024-03-06 21:01:51 -08:00
Daniel Hiltgen	be330174dd	Allow setting max vram for workarounds Until we get all the memory calculations correct, this can provide and escape valve for users to workaround out of memory crashes.	2024-03-06 17:15:06 -08:00
Blake Mizerany	0ded7fdc4b	cmd: document environment variables for serve command Updates #2944	2024-03-06 13:48:46 -08:00
Leo	2103a5073c	Add Odin Runes, a Feature-Rich Java UI for Ollama, to README (#2440 ) * Add Odin Runes to README Add Odin Runes to README This commit adds Odin Runes to the "Community Integrations" section of the README. Odin Runes is a Java-based GPT client designed to provide seamless interaction with GPT models, enhancing productivity in prompt engineering and text generation tasks. This addition highlights the integration between Odin Runes and Ollama, offering users the flexibility to leverage large language models locally within their development workflow. * Update README.md this commit applies the comments of the reviewer.	2024-03-06 11:57:49 -08:00
Jeffrey Morgan	ce9f7c4674	Update api.md	2024-03-05 13:13:23 -08:00
Anders Rex	e5596c1944	Add NotesOllama to Community Integrations (#2909 )	2024-03-04 01:18:10 -08:00
Timothy Graupmann	9bc3fee694	Added community link for Ollama Copilot (#2582 ) * Added community link for Ollama Copilot * Update README.md --------- Co-authored-by: Michael <mchiang0610@users.noreply.github.com>	2024-03-04 00:40:36 -08:00
Jeffrey Morgan	21347e1ed6	update llama.cpp submodule to `c29af7e` (#2868 )	2024-03-01 15:26:04 -08:00
Jeffrey Morgan	3b4bab3dc5	Fix embeddings load model behavior (#2848 )	2024-02-29 17:40:56 -08:00
Daniel Hiltgen	cbd6e3b38e	Merge pull request #2838 from dhiltgen/opensuse Add ollama user to video group	2024-02-29 15:47:56 -08:00
Daniel Hiltgen	b830afa716	Merge pull request #2837 from dhiltgen/podman_image_support Add env var so podman will map cuda GPUs	2024-02-29 15:47:37 -08:00
Daniel Hiltgen	bd1d8b0d14	Merge pull request #2836 from bmwiedemann/gzip Omit build date from gzip headers	2024-02-29 15:46:46 -08:00
fred-bf	25c2912120	Add Community Integration: NextChat (#2780 )	2024-02-29 12:12:13 -08:00
Michael Yang	0e19476b56	prepend image tags (#2789 ) instead of appending image tags, prepend them - this generally produces better results	2024-02-29 11:30:14 -08:00
tylinux	fa2f2b3563	fix: print usedMemory size right (#2827 )	2024-02-29 11:11:04 -08:00
Jeffrey Morgan	cbf4970e0f	bump submodule to `87c91c07663b707e831c59ec373b5e665ff9d64a` (#2828 )	2024-02-29 09:42:08 -08:00
Daniel Hiltgen	74468513bd	Add ollama user to video group On OpenSUSE, ollama needs to be a member of the video group to access the GPU	2024-02-29 08:50:10 -08:00
Daniel Hiltgen	794a916a72	Add env var so podman will map cuda GPUs Without this env var, podman's GPU logic doesn't map the GPU through	2024-02-29 08:43:08 -08:00
Bernhard M. Wiedemann	76e5d9ec88	Omit build date from gzip headers See https://reproducible-builds.org/ for why this is good. This patch was done while working on reproducible builds for openSUSE.	2024-02-29 16:48:19 +01:00
Daniel Hiltgen	076237b8ea	Merge pull request #2771 from dhiltgen/toggle_models Bump llama.cpp to b2276	2024-02-27 11:29:53 -08:00
Daniel Hiltgen	53d694c67f	Merge pull request #2772 from dhiltgen/container_image Refine container image build script	2024-02-27 11:29:08 -08:00
Daniel Hiltgen	5aa6bfea94	Merge pull request #2785 from dhiltgen/win_download Log unexpected server errors checking for update	2024-02-27 10:43:14 -08:00
Daniel Hiltgen	1cde63dd64	Log unexpected server errors checking for update This should unmask some failure modes that likely show up in app logs as unmarshal errors	2024-02-27 09:17:04 -08:00
Daniel Hiltgen	98e0b7e94f	Refine container image build script Allow overriding the platform, image name, and tag latest for standard and rocm images.	2024-02-26 17:26:49 -08:00
Daniel Hiltgen	061e8f6abc	Bump llama.cpp to b2276	2024-02-26 16:49:24 -08:00
peanut256	a189810df6	Determine max VRAM on macOS using `recommendedMaxWorkingSetSize` (#2354 ) * read iogpu.wired_limit_mb on macOS Fix for https://github.com/ollama/ollama/issues/1826 * improved determination of available vram on macOS read the recommended maximal vram on macOS via Metal API * Removed macOS-specific logging * Remove logging from gpu_darwin.go * release Core Foundation object fixes a possible memory leak	2024-02-25 18:16:45 -05:00
Ikko Eltociear Ashimine	e95b896790	Update types.go (#2744 ) specfied -> specified	2024-02-25 13:41:25 -05:00
elthommy	1f087c4d26	Update langchain python tutorial (#2737 ) Remove unused GPT4all Use nomic-embed-text as embedded model Fix a deprecation warning (__call__)	2024-02-25 00:31:36 -05:00
Jeffrey Morgan	5d7ea6616f	no extra disk space for windows installation (#2739 )	2024-02-25 00:20:35 -05:00
Michael Yang	2a4b128ae3	Merge pull request #2719 from ollama/mxyng/format-private-key remove format private key	2024-02-23 17:15:14 -08:00
Michael Yang	fc483274ad	clean up go.mod	2024-02-23 16:53:36 -08:00
Michael Yang	fd10a2ad4b	remove format/openssh.go this is unnecessary now that x/crypto/ssh.MarshalPrivateKey has been added	2024-02-23 16:52:23 -08:00
Benn Huang	b291f63188	Add Community Integration: Chatbox Co-authored-by: bennhuang <bennhuang@tencent.com>	2024-02-23 07:17:28 -05:00
Jeffrey Morgan	f58856bf6f	better directory cleanup in `ollama.iss`	2024-02-23 07:14:59 -05:00
Jeffrey Morgan	275ea01587	restore windows build flags and compression	2024-02-22 18:07:18 -05:00
Jeffrey Morgan	8782dd5628	fix `build_windows.ps1` script to run `go build` with the correct flags	2024-02-22 17:41:43 -05:00
Jeffrey Morgan	11bfff8ee1	update llama.cpp submodule to `96633eeca1265ed03e57230de54032041c58f9cd`	2024-02-22 16:44:26 -05:00
Logan Yang	7c0167a8f6	Add copilot for obsidian plugin to community integration (#1918 )	2024-02-22 14:17:20 -05:00
LangChain4j	74d898e37d	Added LangChain4j links (#1690 )	2024-02-22 14:09:08 -05:00
Yuan-Man	c6e8b00718	Add README.md (#2249 )	2024-02-22 14:03:44 -05:00
B-Tocs.org Community	be9980ef13	Update README.md - Ollama for SAP ABAP (#2510 )	2024-02-22 13:12:27 -05:00
Augustinas Malinauskas	646a0dedb9	Update README.md (#2504 ) - Enchanted is now supported for desktop on macOS	2024-02-22 13:09:29 -05:00
Azhar Khan	7f964d938c	update README to add Gemma 2B, 7B model in Model Library Table (#2686 )	2024-02-22 13:07:47 -05:00
Pavel Frankov	e6b8a139ff	Update README.md (#2138 )	2024-02-22 10:52:36 -05:00
Jeffrey Morgan	bdc0ea1ba5	Update import.md	2024-02-22 02:08:03 -05:00
Jeffrey Morgan	7fab7918cc	Update import.md	2024-02-22 02:06:24 -05:00
Michael Yang	74c1bdba0d	Merge pull request #2657 from joshyan1/patch-1 Update install.sh success message	2024-02-21 15:55:20 -08:00
Josh	f983ef7f5f	Update install.sh success message	2024-02-21 18:30:01 -05:00
Jeffrey Morgan	1ae1c33651	Windows build + installer adjustments (#2656 ) * remove `-w -s` linker flags on windows * use `zip` for windows installer compression	2024-02-21 18:21:26 -05:00
Michael Yang	084d846621	refactor	2024-02-21 13:42:48 -08:00
Michael Yang	6a4b994433	lint	2024-02-21 13:42:48 -08:00
Michael Yang	bea007deb7	use LimitGroup for uploads	2024-02-21 13:42:48 -08:00
Michael Yang	074934be03	adjust group limit based on download speed	2024-02-21 13:42:48 -08:00
Michael Yang	0de12368a0	add new LimitGroup for dynamic concurrency	2024-02-21 13:42:48 -08:00
Michael Yang	917bd61084	refactor download run	2024-02-21 13:42:46 -08:00
Jeffrey Morgan	efe040f8c0	reset with `init_vars` ahead of each cpu build in `gen_windows.ps1` (#2654 )	2024-02-21 16:35:34 -05:00
Jeffrey Morgan	2a7553ce09	update llama.cpp submodule to `c14f72d`	2024-02-21 09:03:14 -05:00
Sun Bo	10af6070a9	Update big-AGI config file link (#2626 ) Co-authored-by: bo.sun <bo.sun@cotticoffee.com>	2024-02-21 01:24:48 -05:00
Jeffrey Morgan	92423b0600	add `dist` directory in `build_windows.ps`	2024-02-21 00:05:05 -05:00
Jeffrey Morgan	b3eac61cac	update llama.cpp submodule to `f0d1fafc029a056cd765bdae58dcaa12312e9879`	2024-02-20 22:56:51 -05:00
Jeffrey Morgan	287ba11500	better error message when calling `/api/generate` or `/api/chat` with embedding models	2024-02-20 21:53:45 -05:00
Jeffrey Morgan	63861f58cc	Support for `bert` and `nomic-bert` embedding models	2024-02-20 21:37:29 -05:00
Jeffrey Morgan	f0425d3de9	Update faq.md	2024-02-20 20:44:45 -05:00
Michael Yang	210b65268e	replace strings buffer with hasher (#2437 ) the buffered value is going into the hasher eventually so write directly to the hasher instead	2024-02-20 19:07:50 -05:00
Michael Yang	949d7b1c48	add gguf file types (#2532 )	2024-02-20 19:06:29 -05:00
Michael Yang	897b213468	use http.DefaultClient (#2530 ) default client already handles proxy	2024-02-20 18:34:47 -05:00
Jeffrey Morgan	4613a080e7	update llama.cpp submodule to `66c1968f7` (#2618 )	2024-02-20 17:42:31 -05:00
Muhammed Nazeem	ace2cdf1c6	Add Page Assist to the community integrations (#2447 )	2024-02-20 14:03:58 -05:00
Nikesh Parajuli	eed92bc19a	docs: add Msty app in readme (#1775 ) * docs: add Msty app in readme * docs: update msty url	2024-02-20 14:03:33 -05:00
Michael Edoror	e0a2f46466	Update README.md to include Elixir LangChain Library (#2180 ) The Elixir LangChain Library now supports Ollama Chat with this [PR](https://github.com/brainlid/langchain/pull/70)	2024-02-20 14:03:02 -05:00
Taras Tsugrii	01ff2e14db	[nit] Remove unused msg local var. (#2511 )	2024-02-20 14:02:34 -05:00