Compare commits


2 Commits

fed3843be2  Matt Williams  2024-01-04 12:58:07 -08:00
    update to resolve jmorganca comments
    Signed-off-by: Matt Williams <m@technovangelist.com>

01d4047ed3  Matt Williams  2024-01-04 09:45:13 -08:00
    add faq about quant and context
    Signed-off-by: Matt Williams <m@technovangelist.com>
88 changed files with 2334 additions and 3076 deletions

.gitignore

@@ -2,7 +2,7 @@
 ollama
 app
 dist
-llm/llama.cpp
+llm/llama.cpp/gguf
 .env
 .cache
 test_data

.github/workflows/test.yaml

@@ -1,106 +0,0 @@
name: test

on:
  pull_request:

jobs:
  generate:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        arch: [amd64, arm64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
          - os: windows-latest
            arch: arm64
    runs-on: ${{ matrix.os }}
    env:
      GOARCH: ${{ matrix.arch }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21'
          cache: true
      - if: ${{ startsWith(matrix.os, 'windows-') }}
        shell: pwsh
        run: |
          $path = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
          if ($path) {
            $path = join-path $path 'Common7\Tools\vsdevcmd.bat'
            if (test-path $path) {
              cmd /s /c """$path"" $args && set" | where { $_ -match '(\w+)=(.*)' } | foreach {
                echo "$($Matches[1])=$($Matches[2])" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
              }
            }
          }
          echo "C:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
      - run: go get ./...
      - run: go generate -x ./...
      - uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
          path: |
            llm/llama.cpp/build/**/lib/*
  lint:
    needs: generate
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        arch: [amd64, arm64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
          - os: windows-latest
            arch: arm64
          - os: macos-latest
            arch: amd64
    runs-on: ${{ matrix.os }}
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: "1"
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21'
          cache: false
      - uses: actions/download-artifact@v4
        with:
          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
          path: llm/llama.cpp/build
      - uses: golangci/golangci-lint-action@v3
  test:
    needs: generate
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        arch: [amd64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
          - os: windows-latest
            arch: arm64
    runs-on: ${{ matrix.os }}
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: "1"
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21'
          cache: true
      - run: go get
      - uses: actions/download-artifact@v4
        with:
          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
          path: llm/llama.cpp/build
      - run: go build
      - run: go test -v ./...

.gitmodules

@@ -1,4 +1,5 @@
[submodule "llama.cpp"] [submodule "llm/llama.cpp/gguf"]
path = llm/llama.cpp path = llm/llama.cpp/gguf
url = https://github.com/ggerganov/llama.cpp.git url = https://github.com/ggerganov/llama.cpp.git
shallow = true ignore = dirty
shallow = true

.golangci.yaml

@@ -1,27 +0,0 @@
run:
  timeout: 5m
linters:
  enable:
    - asasalint
    - bidichk
    - bodyclose
    - containedctx
    - contextcheck
    - exportloopref
    - gocheckcompilerdirectives
    # FIXME: for some reason this errors on windows
    # - gofmt
    # - goimports
    - misspell
    - nilerr
    - unused
linters-settings:
  errcheck:
    # exclude the following functions since we don't generally
    # need to be concerned with the returned errors
    exclude-functions:
      - encoding/binary.Read
      - (*os.File).Seek
      - (*bufio.Writer).WriteString
      - (*github.com/spf13/pflag.FlagSet).Set
      - (*github.com/jmorganca/ollama/llm.readSeekOffset).Seek

Dockerfile

@@ -1,99 +1,74 @@
-ARG GOLANG_VERSION=1.21.3
-ARG CMAKE_VERSION=3.22.1
-ARG CUDA_VERSION=11.3.1
-
-# Copy the minimal context we need to run the generate scripts
-FROM scratch AS llm-code
-COPY .git .git
-COPY .gitmodules .gitmodules
-COPY llm llm
-
-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
-ARG CMAKE_VERSION
-ARG CGO_CFLAGS
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
-ARG CMAKE_VERSION
-ARG CGO_CFLAGS
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-
-FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
-ARG CMAKE_VERSION
-ARG CGO_CFLAGS
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV LIBRARY_PATH /opt/amdgpu/lib64
-COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-
-FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
-ARG CMAKE_VERSION
-ARG CGO_CFLAGS
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV LIBRARY_PATH /opt/amdgpu/lib64
-COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-
-FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-ARG OLLAMA_CUSTOM_CPU_DEFS
-ARG CGO_CFLAGS
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-RUN sh gen_linux.sh
-
-FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-ARG OLLAMA_CUSTOM_CPU_DEFS
-ARG CGO_CFLAGS
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
-WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-RUN sh gen_linux.sh
-
-FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
-ENV CGO_ENABLED 1
-ARG GOFLAGS
-ARG CGO_CFLAGS
-WORKDIR /go/src/github.com/jmorganca/ollama
-COPY . .
-COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-RUN go build .
-
-FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
-ENV CGO_ENABLED 1
-ARG GOLANG_VERSION
-ARG GOFLAGS
-ARG CGO_CFLAGS
-WORKDIR /go/src/github.com/jmorganca/ollama
-COPY . .
-COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-RUN go build .
-
-FROM build-$TARGETARCH
+# Ubuntu 20.04 amd64 dependencies
+FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
+ARG CUDA_VERSION=11.3.1-1
+ARG CMAKE_VERSION=3.22.1
+# ROCm only supports amd64
+ARG ROCM_VERSION=6.0
+ARG CLBLAST_VER=1.6.1
+
+# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
+RUN apt-get update && \
+    apt-get install -y wget gnupg && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
+    mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
+    echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
+    wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    mkdir --parents --mode=0755 /etc/apt/keyrings && \
+    wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
+    echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
+
+# CLBlast
+RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
+    cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install
+
+ENV ROCM_PATH=/opt/rocm
+
+# Ubuntu 22.04 arm64 dependencies
+FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64
+ARG CUDA_VERSION=11.3.1-1
+ARG CMAKE_VERSION=3.27.6
+RUN apt-get update && \
+    apt-get install -y wget gnupg && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \
+    mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa//3bf863cc.pub && \
+    echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \
+    wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    apt-get update && \
+    apt-cache madison cuda && \
+    DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION}
+
+FROM base-${TARGETARCH}
+ARG TARGETARCH
+ARG GOFLAGS="'-ldflags -w -s'"
+ARG CGO_CFLAGS
+ARG GOLANG_VERSION=1.21.3
+
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10
+
+# install go
+ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go${GOLANG_VERSION}.tar.gz
+
+# build the final binary
+WORKDIR /go/src/github.com/jmorganca/ollama
+COPY . .
+
+ENV GOOS=linux
+ENV GOARCH=$TARGETARCH
+ENV GOFLAGS=$GOFLAGS
+ENV CGO_CFLAGS=${CGO_CFLAGS}
+
+RUN /usr/local/go/bin/go generate ./... && \
+    /usr/local/go/bin/go build .

README.md

@@ -248,10 +248,6 @@ curl http://localhost:11434/api/chat -d '{
 See the [API documentation](./docs/api.md) for all endpoints.

-## Integrations
-
-- [ollama-python](https://github.com/jmorganca/ollama-python)
-
 ## Community Integrations

 ### Web & Desktop
@@ -296,7 +292,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
-- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
 - [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
@@ -304,9 +299,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama for Dart](https://github.com/breitburg/dart-ollama)
 - [Ollama for Laravel](https://github.com/cloudstudio/ollama-laravel)
 - [LangChainDart](https://github.com/davidmigloz/langchain_dart)
-- [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
-- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)

 ### Mobile
@@ -327,4 +319,3 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
-- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)

api/client.go

@@ -309,13 +309,6 @@ func (c *Client) Heartbeat(ctx context.Context) error {
 	}
 	return nil
 }

-func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
-	var resp EmbeddingResponse
-	if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
-		return nil, err
-	}
-	return &resp, nil
-}
-
 func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
 	if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {
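
For reference, the Embeddings helper removed above is a thin wrapper over a plain HTTP endpoint. A minimal sketch of calling it directly, assuming a server on the default port and a locally installed model ("llama2" is a placeholder name):

    import requests

    # POST /api/embeddings takes the same fields as the Go EmbeddingRequest
    # ("model", "prompt") and returns a JSON body with an "embedding" vector.
    resp = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": "llama2", "prompt": "why is the sky blue?"},
    )
    resp.raise_for_status()
    print(len(resp.json()["embedding"]))  # dimensionality of the embedding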

api/client.py (new file)

@@ -0,0 +1,284 @@
import os
import json
import hashlib
import requests
from pathlib import Path

BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')


# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so the result
# is a series of responses. The final response object will include statistics and additional data from the
# request. Use the callback function to override the default handler.
def generate(model_name, prompt, system=None, template=None, format="", context=None, options=None, callback=None):
  try:
    url = f"{BASE_URL}/api/generate"
    payload = {
      "model": model_name,
      "prompt": prompt,
      "system": system,
      "template": template,
      "context": context,
      "options": options,
      "format": format,
    }

    # Remove keys with None values
    payload = {k: v for k, v in payload.items() if v is not None}

    with requests.post(url, json=payload, stream=True) as response:
      response.raise_for_status()

      # Creating a variable to hold the context history of the final chunk
      final_context = None

      # Variable to hold concatenated response strings if no callback is provided
      full_response = ""

      # Iterating over the response line by line and displaying the details
      for line in response.iter_lines():
        if line:
          # Parsing each line (JSON chunk) and extracting the details
          chunk = json.loads(line)

          # If a callback function is provided, call it with the chunk
          if callback:
            callback(chunk)
          else:
            # If this is not the last chunk, add the "response" field value to full_response and print it
            if not chunk.get("done"):
              response_piece = chunk.get("response", "")
              full_response += response_piece
              print(response_piece, end="", flush=True)

          # Check if it's the last chunk (done is true)
          if chunk.get("done"):
            final_context = chunk.get("context")

      # Return the full response and the final context
      return full_response, final_context
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    return None, None


# Create a blob file on the server if it doesn't exist.
def create_blob(digest, file_path):
  url = f"{BASE_URL}/api/blobs/{digest}"

  # Check if the blob exists
  response = requests.head(url)
  if response.status_code != 404:
    return  # Blob already exists, no need to upload
  response.raise_for_status()

  # Upload the blob
  with open(file_path, 'rb') as file_data:
    requests.post(url, data=file_data)


# Create a model from a Modelfile. Use the callback function to override the default handler.
def create(model_name, filename, callback=None):
  try:
    file_path = Path(filename).expanduser().resolve()
    processed_lines = []

    # Read and process the modelfile
    with open(file_path, 'r') as f:
      for line in f:
        # Skip empty or whitespace-only lines
        if not line.strip():
          continue

        command, args = line.split(maxsplit=1)

        if command.upper() in ["FROM", "ADAPTER"]:
          path = Path(args.strip()).expanduser()

          # Check if path is relative and resolve it
          if not path.is_absolute():
            path = (file_path.parent / path)

          # Skip if file does not exist for "model", this is handled by the server
          if not path.exists():
            processed_lines.append(line)
            continue

          # Calculate SHA-256 hash
          with open(path, 'rb') as bin_file:
            hash = hashlib.sha256()
            hash.update(bin_file.read())
            blob = f"sha256:{hash.hexdigest()}"

          # Add the file to the remote server
          create_blob(blob, path)

          # Replace path with digest in the line
          line = f"{command} @{blob}\n"

        processed_lines.append(line)

    # Combine processed lines back into a single string
    modelfile_content = '\n'.join(processed_lines)

    url = f"{BASE_URL}/api/create"
    payload = {"name": model_name, "modelfile": modelfile_content}

    # Making a POST request with the stream parameter set to True to handle streaming responses
    with requests.post(url, json=payload, stream=True) as response:
      response.raise_for_status()

      # Iterating over the response line by line and displaying the status
      for line in response.iter_lines():
        if line:
          chunk = json.loads(line)

          if callback:
            callback(chunk)
          else:
            print(f"Status: {chunk.get('status')}")
  except Exception as e:
    print(f"An error occurred: {e}")


# Pull a model from the model registry. Cancelled pulls are resumed from where they left off, and multiple
# calls will share the same download progress. Use the callback function to override the default handler.
def pull(model_name, insecure=False, callback=None):
  try:
    url = f"{BASE_URL}/api/pull"
    payload = {
      "name": model_name,
      "insecure": insecure
    }

    # Making a POST request with the stream parameter set to True to handle streaming responses
    with requests.post(url, json=payload, stream=True) as response:
      response.raise_for_status()

      # Iterating over the response line by line and displaying the details
      for line in response.iter_lines():
        if line:
          # Parsing each line (JSON chunk) and extracting the details
          chunk = json.loads(line)

          # If a callback function is provided, call it with the chunk
          if callback:
            callback(chunk)
          else:
            # Print the status message directly to the console
            print(chunk.get('status', ''), end='', flush=True)

            # If there's layer data, you might also want to print that (adjust as necessary)
            if 'digest' in chunk:
              print(f" - Digest: {chunk['digest']}", end='', flush=True)
              print(f" - Total: {chunk['total']}", end='', flush=True)
              print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
            else:
              print()
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


# Push a model to the model registry. Use the callback function to override the default handler.
def push(model_name, insecure=False, callback=None):
  try:
    url = f"{BASE_URL}/api/push"
    payload = {
      "name": model_name,
      "insecure": insecure
    }

    # Making a POST request with the stream parameter set to True to handle streaming responses
    with requests.post(url, json=payload, stream=True) as response:
      response.raise_for_status()

      # Iterating over the response line by line and displaying the details
      for line in response.iter_lines():
        if line:
          # Parsing each line (JSON chunk) and extracting the details
          chunk = json.loads(line)

          # If a callback function is provided, call it with the chunk
          if callback:
            callback(chunk)
          else:
            # Print the status message directly to the console
            print(chunk.get('status', ''), end='', flush=True)

            # If there's layer data, you might also want to print that (adjust as necessary)
            if 'digest' in chunk:
              print(f" - Digest: {chunk['digest']}", end='', flush=True)
              print(f" - Total: {chunk['total']}", end='', flush=True)
              print(f" - Completed: {chunk['completed']}", end='\n', flush=True)
            else:
              print()
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


# List models that are available locally.
def list():
  try:
    response = requests.get(f"{BASE_URL}/api/tags")
    response.raise_for_status()
    data = response.json()
    models = data.get('models', [])
    return models
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    return None


# Copy a model. Creates a model with another name from an existing model.
def copy(source, destination):
  try:
    # Create the JSON payload
    payload = {
      "source": source,
      "destination": destination
    }

    response = requests.post(f"{BASE_URL}/api/copy", json=payload)
    response.raise_for_status()

    # If the request was successful, return a message indicating that the copy was successful
    return "Copy successful"
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    return None


# Delete a model and its data.
def delete(model_name):
  try:
    url = f"{BASE_URL}/api/delete"
    payload = {"name": model_name}
    response = requests.delete(url, json=payload)
    response.raise_for_status()
    return "Delete successful"
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    return None


# Show info about a model.
def show(model_name):
  try:
    url = f"{BASE_URL}/api/show"
    payload = {"name": model_name}
    response = requests.post(url, json=payload)
    response.raise_for_status()

    # Parse the JSON response and return it
    data = response.json()
    return data
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    return None


def heartbeat():
  try:
    url = f"{BASE_URL}/"
    response = requests.head(url)
    response.raise_for_status()
    return "Ollama is running"
  except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    return "Ollama is not running"

api/types.go

@@ -137,30 +137,17 @@ type EmbeddingResponse struct {
 }

 type CreateRequest struct {
-	Model     string `json:"model"`
+	Name      string `json:"name"`
 	Path      string `json:"path"`
 	Modelfile string `json:"modelfile"`
 	Stream    *bool  `json:"stream,omitempty"`
-
-	// Name is deprecated, see Model
-	Name string `json:"name"`
 }

 type DeleteRequest struct {
-	Model string `json:"model"`
-
-	// Name is deprecated, see Model
 	Name string `json:"name"`
 }

 type ShowRequest struct {
-	Model    string `json:"model"`
-	System   string `json:"system"`
-	Template string `json:"template"`
-
-	Options map[string]interface{} `json:"options"`
-
-	// Name is deprecated, see Model
 	Name string `json:"name"`
 }
@@ -179,14 +166,11 @@ type CopyRequest struct {
 }

 type PullRequest struct {
-	Model    string `json:"model"`
+	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
 	Username string `json:"username"`
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`
-
-	// Name is deprecated, see Model
-	Name string `json:"name"`
 }
type ProgressResponse struct { type ProgressResponse struct {
@@ -197,14 +181,11 @@ type ProgressResponse struct {
 }

 type PushRequest struct {
-	Model    string `json:"model"`
+	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
 	Username string `json:"username"`
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`
-
-	// Name is deprecated, see Model
-	Name string `json:"name"`
 }

 type ListResponse struct {
@@ -213,7 +194,6 @@ type ListResponse struct {
 type ModelResponse struct {
 	Name       string    `json:"name"`
-	Model      string    `json:"model"`
 	ModifiedAt time.Time `json:"modified_at"`
 	Size       int64     `json:"size"`
 	Digest     string    `json:"digest"`
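
The hunks above move the primary request field between "name" and "model" (one side keeps Name only as a deprecated alias). A hedged compatibility sketch that sends both spellings so the payload parses under either schema ("llama2" is a placeholder):

    import requests

    # Older servers read "name"; newer ones read "model" and tolerate "name".
    payload = {"model": "llama2", "name": "llama2", "stream": False}
    r = requests.post("http://localhost:11434/api/pull", json=payload)
    r.raise_for_status()
    print(r.json().get("status"))  # "success" once the pull completes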

cmd/cmd.go

@@ -17,6 +17,7 @@ import (
"os/exec" "os/exec"
"os/signal" "os/signal"
"path/filepath" "path/filepath"
"regexp"
"runtime" "runtime"
"strings" "strings"
"syscall" "syscall"
@@ -25,16 +26,20 @@ import (
"github.com/olekukonko/tablewriter" "github.com/olekukonko/tablewriter"
"github.com/spf13/cobra" "github.com/spf13/cobra"
"golang.org/x/crypto/ssh" "golang.org/x/crypto/ssh"
"golang.org/x/exp/slices"
"golang.org/x/term" "golang.org/x/term"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format" "github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/parser" "github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/progress" "github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/readline"
"github.com/jmorganca/ollama/server" "github.com/jmorganca/ollama/server"
"github.com/jmorganca/ollama/version" "github.com/jmorganca/ollama/version"
) )
type ImageData []byte
func CreateHandler(cmd *cobra.Command, args []string) error { func CreateHandler(cmd *cobra.Command, args []string) error {
filename, _ := cmd.Flags().GetString("file") filename, _ := cmd.Flags().GetString("file")
filename, err := filepath.Abs(filename) filename, err := filepath.Abs(filename)
@@ -151,7 +156,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	var statusError api.StatusError
 	switch {
 	case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
-		if err := PullHandler(cmd, []string{name}); err != nil {
+		if err := PullHandler(cmd, args); err != nil {
 			return err
 		}
 	case err != nil:
@@ -413,10 +418,11 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 func RunGenerate(cmd *cobra.Command, args []string) error {
 	interactive := true

-	opts := runOptions{
+	opts := generateOptions{
 		Model:    args[0],
 		WordWrap: os.Getenv("TERM") == "xterm-256color",
 		Options:  map[string]interface{}{},
+		Images:   []ImageData{},
 	}

 	format, err := cmd.Flags().GetString("format")
@@ -457,135 +463,18 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 type generateContextKey string

-type runOptions struct {
+type generateOptions struct {
 	Model    string
 	Prompt   string
-	Messages []api.Message
 	WordWrap bool
 	Format   string
 	System   string
 	Template string
-	Images   []api.ImageData
+	Images   []ImageData
 	Options  map[string]interface{}
 }

-type displayResponseState struct {
-	lineLength int
-	wordBuffer string
-}
-
-func displayResponse(content string, wordWrap bool, state *displayResponseState) {
-	termWidth, _, _ := term.GetSize(int(os.Stdout.Fd()))
-	if wordWrap && termWidth >= 10 {
-		for _, ch := range content {
-			if state.lineLength+1 > termWidth-5 {
-				if len(state.wordBuffer) > termWidth-10 {
-					fmt.Printf("%s%c", state.wordBuffer, ch)
-					state.wordBuffer = ""
-					state.lineLength = 0
-					continue
-				}
-
-				// backtrack the length of the last word and clear to the end of the line
-				fmt.Printf("\x1b[%dD\x1b[K\n", len(state.wordBuffer))
-				fmt.Printf("%s%c", state.wordBuffer, ch)
-				state.lineLength = len(state.wordBuffer) + 1
-			} else {
-				fmt.Print(string(ch))
-				state.lineLength += 1
-
-				switch ch {
-				case ' ':
-					state.wordBuffer = ""
-				case '\n':
-					state.lineLength = 0
-				default:
-					state.wordBuffer += string(ch)
-				}
-			}
-		}
-	} else {
-		fmt.Printf("%s%s", state.wordBuffer, content)
-		if len(state.wordBuffer) > 0 {
-			state.wordBuffer = ""
-		}
-	}
-}
-
-func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return nil, err
-	}
-
-	p := progress.NewProgress(os.Stderr)
-	defer p.StopAndClear()
-
-	spinner := progress.NewSpinner("")
-	p.Add("", spinner)
-
-	cancelCtx, cancel := context.WithCancel(cmd.Context())
-	defer cancel()
-
-	sigChan := make(chan os.Signal, 1)
-	signal.Notify(sigChan, syscall.SIGINT)
-
-	go func() {
-		<-sigChan
-		cancel()
-	}()
-
-	var state *displayResponseState = &displayResponseState{}
-	var latest api.ChatResponse
-	var fullResponse strings.Builder
-	var role string
-
-	fn := func(response api.ChatResponse) error {
-		p.StopAndClear()
-
-		latest = response
-
-		role = response.Message.Role
-		content := response.Message.Content
-		fullResponse.WriteString(content)
-
-		displayResponse(content, opts.WordWrap, state)
-
-		return nil
-	}
-
-	req := &api.ChatRequest{
-		Model:    opts.Model,
-		Messages: opts.Messages,
-		Format:   opts.Format,
-		Options:  opts.Options,
-	}
-
-	if err := client.Chat(cancelCtx, req, fn); err != nil {
-		if errors.Is(err, context.Canceled) {
-			return nil, nil
-		}
-		return nil, err
-	}
-
-	if len(opts.Messages) > 0 {
-		fmt.Println()
-		fmt.Println()
-	}
-
-	verbose, err := cmd.Flags().GetBool("verbose")
-	if err != nil {
-		return nil, err
-	}
-
-	if verbose {
-		latest.Summary()
-	}
-
-	return &api.Message{Role: role, Content: fullResponse.String()}, nil
-}
-
-func generate(cmd *cobra.Command, opts runOptions) error {
+func generate(cmd *cobra.Command, opts generateOptions) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
@@ -604,6 +493,11 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		generateContext = []int{}
 	}

+	termWidth, _, err := term.GetSize(int(os.Stdout.Fd()))
+	if err != nil {
+		opts.WordWrap = false
+	}
+
 	ctx, cancel := context.WithCancel(cmd.Context())
 	defer cancel()
@@ -615,19 +509,57 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		cancel()
 	}()

-	var state *displayResponseState = &displayResponseState{}
+	var currentLineLength int
+	var wordBuffer string

 	fn := func(response api.GenerateResponse) error {
 		p.StopAndClear()

 		latest = response
-		content := response.Response

-		displayResponse(content, opts.WordWrap, state)
+		termWidth, _, _ = term.GetSize(int(os.Stdout.Fd()))
+		if opts.WordWrap && termWidth >= 10 {
+			for _, ch := range response.Response {
+				if currentLineLength+1 > termWidth-5 {
+					if len(wordBuffer) > termWidth-10 {
+						fmt.Printf("%s%c", wordBuffer, ch)
+						wordBuffer = ""
+						currentLineLength = 0
+						continue
+					}
+
+					// backtrack the length of the last word and clear to the end of the line
+					fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
+					fmt.Printf("%s%c", wordBuffer, ch)
+					currentLineLength = len(wordBuffer) + 1
+				} else {
+					fmt.Print(string(ch))
+					currentLineLength += 1
+
+					switch ch {
+					case ' ':
+						wordBuffer = ""
+					case '\n':
+						currentLineLength = 0
+					default:
+						wordBuffer += string(ch)
+					}
+				}
+			}
+		} else {
+			fmt.Printf("%s%s", wordBuffer, response.Response)
+			if len(wordBuffer) > 0 {
+				wordBuffer = ""
+			}
+		}

 		return nil
 	}

+	images := make([]api.ImageData, 0)
+	for _, i := range opts.Images {
+		images = append(images, api.ImageData(i))
+	}
+
 	request := api.GenerateRequest{
 		Model:  opts.Model,
 		Prompt: opts.Prompt,
@@ -636,15 +568,35 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		System:   opts.System,
 		Template: opts.Template,
 		Options:  opts.Options,
+		Images:   images,
 	}

 	if err := client.Generate(ctx, &request, fn); err != nil {
-		if errors.Is(err, context.Canceled) {
+		switch {
+		case errors.Is(err, context.Canceled):
 			return nil
-		}
-		return err
+		case strings.Contains(err.Error(), "unsupported model format"):
+			// pull and retry to see if the model has been updated
+			parts := strings.Split(opts.Model, string(os.PathSeparator))
+			if len(parts) == 1 {
+				// this is a library model, log some info
+				fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
+			}
+
+			if err := PullHandler(cmd, []string{opts.Model}); err != nil {
+				fmt.Printf("Error: %s\n", err)
+				return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
+			}
+
+			// retry
+			if err := client.Generate(ctx, &request, fn); err != nil {
+				if errors.Is(err, context.Canceled) {
+					return nil
+				}
+				return err
+			}
+		default:
+			return err
+		}
 	}

 	if opts.Prompt != "" {
 		fmt.Println()
 		fmt.Println()
@@ -669,6 +621,459 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	return nil
 }
type MultilineState int
const (
MultilineNone MultilineState = iota
MultilinePrompt
MultilineSystem
MultilineTemplate
)
func modelIsMultiModal(cmd *cobra.Command, name string) bool {
// get model details
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return false
}
req := api.ShowRequest{Name: name}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return false
}
return slices.Contains(resp.Details.Families, "clip")
}
func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
multiModal := modelIsMultiModal(cmd, opts.Model)
// load the model
loadOpts := generateOptions{
Model: opts.Model,
Prompt: "",
Images: []ImageData{},
}
if err := generate(cmd, loadOpts); err != nil {
return err
}
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
fmt.Fprintln(os.Stderr, "")
}
usageSet := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
fmt.Fprintln(os.Stderr, " /set history Enable history")
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
fmt.Fprintln(os.Stderr, "")
}
usageShortcuts := func() {
fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
fmt.Fprintln(os.Stderr, " Ctrl + a Move to the beginning of the line (Home)")
fmt.Fprintln(os.Stderr, " Ctrl + e Move to the end of the line (End)")
fmt.Fprintln(os.Stderr, " Alt + b Move back (left) one word")
fmt.Fprintln(os.Stderr, " Alt + f Move forward (right) one word")
fmt.Fprintln(os.Stderr, " Ctrl + k Delete the sentence after the cursor")
fmt.Fprintln(os.Stderr, " Ctrl + u Delete the sentence before the cursor")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, " Ctrl + l Clear the screen")
fmt.Fprintln(os.Stderr, " Ctrl + c Stop the model from responding")
fmt.Fprintln(os.Stderr, " Ctrl + d Exit ollama (/bye)")
fmt.Fprintln(os.Stderr, "")
}
usageShow := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /show license Show model license")
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
fmt.Fprintln(os.Stderr, " /show system Show system message")
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
fmt.Fprintln(os.Stderr, "")
}
// only list out the most common parameters
usageParameters := func() {
fmt.Fprintln(os.Stderr, "Available Parameters:")
fmt.Fprintln(os.Stderr, " /set parameter seed <int> Random number seed")
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
fmt.Fprintln(os.Stderr, "")
}
scanner, err := readline.New(readline.Prompt{
Prompt: ">>> ",
AltPrompt: "... ",
Placeholder: "Send a message (/? for help)",
AltPlaceholder: `Use """ to end multi-line input`,
})
if err != nil {
return err
}
fmt.Print(readline.StartBracketedPaste)
defer fmt.Printf(readline.EndBracketedPaste)
var multiline MultilineState
var prompt string
for {
line, err := scanner.Readline()
switch {
case errors.Is(err, io.EOF):
fmt.Println()
return nil
case errors.Is(err, readline.ErrInterrupt):
if line == "" {
fmt.Println("\nUse Ctrl + d or /bye to exit.")
}
scanner.Prompt.UseAlt = false
prompt = ""
continue
case err != nil:
return err
}
switch {
case strings.HasPrefix(prompt, `"""`):
// if the prompt so far starts with """ then we're in multiline mode
// and we need to keep reading until we find a line that ends with """
cut, found := strings.CutSuffix(line, `"""`)
prompt += cut
if !found {
prompt += "\n"
continue
}
prompt = strings.TrimPrefix(prompt, `"""`)
scanner.Prompt.UseAlt = false
switch multiline {
case MultilineSystem:
opts.System = prompt
prompt = ""
fmt.Println("Set system message.")
case MultilineTemplate:
opts.Template = prompt
prompt = ""
fmt.Println("Set prompt template.")
}
multiline = MultilineNone
case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
scanner.Prompt.UseAlt = true
multiline = MultilinePrompt
prompt += line + "\n"
continue
case scanner.Pasting:
prompt += line + "\n"
continue
case strings.HasPrefix(line, "/list"):
args := strings.Fields(line)
if err := ListHandler(cmd, args[1:]); err != nil {
return err
}
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "history":
scanner.HistoryEnable()
case "nohistory":
scanner.HistoryDisable()
case "wordwrap":
opts.WordWrap = true
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
opts.WordWrap = false
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
cmd.Flags().Set("verbose", "true")
fmt.Println("Set 'verbose' mode.")
case "quiet":
cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
} else {
opts.Format = args[2]
fmt.Printf("Set format to '%s' mode.\n", args[2])
}
case "noformat":
opts.Format = ""
fmt.Println("Disabled format.")
case "parameter":
if len(args) < 4 {
usageParameters()
continue
}
var params []string
for _, p := range args[3:] {
params = append(params, p)
}
fp, err := api.FormatParams(map[string][]string{args[2]: params})
if err != nil {
fmt.Printf("Couldn't set parameter: %q\n\n", err)
continue
}
fmt.Printf("Set parameter '%s' to '%s'\n\n", args[2], strings.Join(params, ", "))
opts.Options[args[2]] = fp[args[2]]
case "system", "template":
if len(args) < 3 {
usageSet()
continue
}
line := strings.Join(args[2:], " ")
line = strings.TrimPrefix(line, `"""`)
if strings.HasPrefix(args[2], `"""`) {
cut, found := strings.CutSuffix(line, `"""`)
prompt += cut
if found {
if args[1] == "system" {
opts.System = prompt
fmt.Println("Set system message.")
} else {
opts.Template = prompt
fmt.Println("Set prompt template.")
}
prompt = ""
} else {
prompt = `"""` + prompt + "\n"
if args[1] == "system" {
multiline = MultilineSystem
} else {
multiline = MultilineTemplate
}
scanner.Prompt.UseAlt = true
}
} else {
opts.System = line
fmt.Println("Set system message.")
}
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
}
} else {
usageSet()
}
case strings.HasPrefix(line, "/show"):
args := strings.Fields(line)
if len(args) > 1 {
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
resp, err := client.Show(cmd.Context(), &api.ShowRequest{Name: opts.Model})
if err != nil {
fmt.Println("error: couldn't get model")
return err
}
switch args[1] {
case "license":
if resp.License == "" {
fmt.Print("No license was specified for this model.\n\n")
} else {
fmt.Println(resp.License)
}
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
if resp.Parameters == "" {
fmt.Print("No parameters were specified for this model.\n\n")
} else {
if len(opts.Options) > 0 {
fmt.Println("User defined parameters:")
for k, v := range opts.Options {
fmt.Printf("%-*s %v\n", 30, k, v)
}
fmt.Println()
}
fmt.Println("Model defined parameters:")
fmt.Println(resp.Parameters)
}
case "system":
switch {
case opts.System != "":
fmt.Println(opts.System + "\n")
case resp.System != "":
fmt.Println(resp.System + "\n")
default:
fmt.Print("No system message was specified for this model.\n\n")
}
case "template":
switch {
case opts.Template != "":
fmt.Println(opts.Template + "\n")
case resp.Template != "":
fmt.Println(resp.Template)
default:
fmt.Print("No prompt template was specified for this model.\n\n")
}
default:
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
}
} else {
usageShow()
}
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "set", "/set":
usageSet()
case "show", "/show":
usageShow()
case "shortcut", "shortcuts":
usageShortcuts()
}
} else {
usage()
}
case line == "/exit", line == "/bye":
return nil
case strings.HasPrefix(line, "/"):
args := strings.Fields(line)
isFile := false
if multiModal {
for _, f := range extractFileNames(line) {
if strings.HasPrefix(f, args[0]) {
isFile = true
break
}
}
}
if isFile {
prompt += line
} else {
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
continue
}
default:
prompt += line
}
if len(prompt) > 0 && multiline == MultilineNone {
opts.Prompt = prompt
if multiModal {
newPrompt, images, err := extractFileData(prompt)
if err != nil {
return err
}
opts.Prompt = newPrompt
// reset the context if we find another image
if len(images) > 0 {
opts.Images = images
ctx := cmd.Context()
ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
cmd.SetContext(ctx)
}
if len(opts.Images) == 0 {
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
fmt.Println()
prompt = ""
continue
}
}
if err := generate(cmd, opts); err != nil {
return err
}
prompt = ""
}
}
}
func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements
replacements := map[string]string{
"\\ ": " ", // Escaped space
"\\(": "(", // Escaped left parenthesis
"\\)": ")", // Escaped right parenthesis
"\\[": "[", // Escaped left square bracket
"\\]": "]", // Escaped right square bracket
"\\{": "{", // Escaped left curly brace
"\\}": "}", // Escaped right curly brace
"\\$": "$", // Escaped dollar sign
"\\&": "&", // Escaped ampersand
"\\;": ";", // Escaped semicolon
"\\'": "'", // Escaped single quote
"\\\\": "\\", // Escaped backslash
"\\*": "*", // Escaped asterisk
"\\?": "?", // Escaped question mark
}
for escaped, actual := range replacements {
fp = strings.ReplaceAll(fp, escaped, actual)
}
return fp
}
func extractFileNames(input string) []string {
// Regex to match file paths starting with / or ./ and include escaped spaces (\ or %20)
// and followed by more characters and a file extension
regexPattern := `(?:\./|/)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
re := regexp.MustCompile(regexPattern)
return re.FindAllString(input, -1)
}
func extractFileData(input string) (string, []ImageData, error) {
filePaths := extractFileNames(input)
var imgs []ImageData
for _, fp := range filePaths {
nfp := normalizeFilePath(fp)
data, err := getImageData(nfp)
if err != nil {
if os.IsNotExist(err) {
continue
}
fmt.Printf("Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Printf("Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}
return input, imgs, nil
}
 func RunServer(cmd *cobra.Command, _ []string) error {
 	host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
 	if err != nil {
@@ -690,6 +1095,50 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 	return server.Serve(ln)
 }
func getImageData(filePath string) ([]byte, error) {
file, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer file.Close()
buf := make([]byte, 512)
_, err = file.Read(buf)
if err != nil {
return nil, err
}
contentType := http.DetectContentType(buf)
allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
if !slices.Contains(allowedTypes, contentType) {
return nil, fmt.Errorf("invalid image type: %s", contentType)
}
info, err := file.Stat()
if err != nil {
return nil, err
}
// Check if the file size exceeds 100MB
var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
if info.Size() > maxSize {
return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
}
buf = make([]byte, info.Size())
_, err = file.Seek(0, 0)
if err != nil {
return nil, err
}
_, err = io.ReadFull(file, buf)
if err != nil {
return nil, err
}
return buf, nil
}
 func initializeKeypair() error {
 	home, err := os.UserHomeDir()
 	if err != nil {

cmd/interactive.go

@@ -1,555 +0,0 @@
package cmd
import (
"errors"
"fmt"
"io"
"net/http"
"os"
"regexp"
"strings"
"github.com/spf13/cobra"
"golang.org/x/exp/slices"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/readline"
)
type MultilineState int
const (
MultilineNone MultilineState = iota
MultilinePrompt
MultilineSystem
MultilineTemplate
)
func modelIsMultiModal(cmd *cobra.Command, name string) bool {
// get model details
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return false
}
req := api.ShowRequest{Name: name}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return false
}
return slices.Contains(resp.Details.Families, "clip")
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
multiModal := modelIsMultiModal(cmd, opts.Model)
// load the model
loadOpts := runOptions{
Model: opts.Model,
Prompt: "",
Messages: []api.Message{},
}
if _, err := chat(cmd, loadOpts); err != nil {
return err
}
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
fmt.Fprintln(os.Stderr, "")
}
usageSet := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set parameter ... Set a parameter")
fmt.Fprintln(os.Stderr, " /set system <string> Set system message")
fmt.Fprintln(os.Stderr, " /set template <string> Set prompt template")
fmt.Fprintln(os.Stderr, " /set history Enable history")
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
fmt.Fprintln(os.Stderr, "")
}
usageShortcuts := func() {
fmt.Fprintln(os.Stderr, "Available keyboard shortcuts:")
fmt.Fprintln(os.Stderr, " Ctrl + a Move to the beginning of the line (Home)")
fmt.Fprintln(os.Stderr, " Ctrl + e Move to the end of the line (End)")
fmt.Fprintln(os.Stderr, " Alt + b Move back (left) one word")
fmt.Fprintln(os.Stderr, " Alt + f Move forward (right) one word")
fmt.Fprintln(os.Stderr, " Ctrl + k Delete the sentence after the cursor")
fmt.Fprintln(os.Stderr, " Ctrl + u Delete the sentence before the cursor")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, " Ctrl + l Clear the screen")
fmt.Fprintln(os.Stderr, " Ctrl + c Stop the model from responding")
fmt.Fprintln(os.Stderr, " Ctrl + d Exit ollama (/bye)")
fmt.Fprintln(os.Stderr, "")
}
usageShow := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /show info Show details for this model")
fmt.Fprintln(os.Stderr, " /show license Show model license")
fmt.Fprintln(os.Stderr, " /show modelfile Show Modelfile for this model")
fmt.Fprintln(os.Stderr, " /show parameters Show parameters for this model")
fmt.Fprintln(os.Stderr, " /show system Show system message")
fmt.Fprintln(os.Stderr, " /show template Show prompt template")
fmt.Fprintln(os.Stderr, "")
}
// only list out the most common parameters
usageParameters := func() {
fmt.Fprintln(os.Stderr, "Available Parameters:")
fmt.Fprintln(os.Stderr, " /set parameter seed <int> Random number seed")
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
fmt.Fprintln(os.Stderr, "")
}
scanner, err := readline.New(readline.Prompt{
Prompt: ">>> ",
AltPrompt: "... ",
Placeholder: "Send a message (/? for help)",
AltPlaceholder: `Use """ to end multi-line input`,
})
if err != nil {
return err
}
fmt.Print(readline.StartBracketedPaste)
defer fmt.Printf(readline.EndBracketedPaste)
var sb strings.Builder
var multiline MultilineState
opts.Messages = make([]api.Message, 0)
for {
line, err := scanner.Readline()
switch {
case errors.Is(err, io.EOF):
fmt.Println()
return nil
case errors.Is(err, readline.ErrInterrupt):
if line == "" {
fmt.Println("\nUse Ctrl + d or /bye to exit.")
}
scanner.Prompt.UseAlt = false
sb.Reset()
continue
case err != nil:
return err
}
switch {
case multiline != MultilineNone:
// check if there's a multiline terminating string
before, ok := strings.CutSuffix(line, `"""`)
sb.WriteString(before)
if !ok {
fmt.Fprintln(&sb)
continue
}
switch multiline {
case MultilineSystem:
opts.System = sb.String()
fmt.Println("Set system message.")
sb.Reset()
case MultilineTemplate:
opts.Template = sb.String()
fmt.Println("Set prompt template.")
sb.Reset()
}
multiline = MultilineNone
scanner.Prompt.UseAlt = false
case strings.HasPrefix(line, `"""`):
line := strings.TrimPrefix(line, `"""`)
line, ok := strings.CutSuffix(line, `"""`)
sb.WriteString(line)
if !ok {
// no multiline terminating string; need more input
fmt.Fprintln(&sb)
multiline = MultilinePrompt
scanner.Prompt.UseAlt = true
break
}
case scanner.Pasting:
fmt.Fprintln(&sb, line)
continue
case strings.HasPrefix(line, "/list"):
args := strings.Fields(line)
if err := ListHandler(cmd, args[1:]); err != nil {
return err
}
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "history":
scanner.HistoryEnable()
case "nohistory":
scanner.HistoryDisable()
case "wordwrap":
opts.WordWrap = true
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
opts.WordWrap = false
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
cmd.Flags().Set("verbose", "true")
fmt.Println("Set 'verbose' mode.")
case "quiet":
cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
} else {
opts.Format = args[2]
fmt.Printf("Set format to '%s' mode.\n", args[2])
}
case "noformat":
opts.Format = ""
fmt.Println("Disabled format.")
case "parameter":
if len(args) < 4 {
usageParameters()
continue
}
params := args[3:]
fp, err := api.FormatParams(map[string][]string{args[2]: params})
if err != nil {
fmt.Printf("Couldn't set parameter: %q\n", err)
continue
}
fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", "))
opts.Options[args[2]] = fp[args[2]]
case "system", "template":
if len(args) < 3 {
usageSet()
continue
}
if args[1] == "system" {
multiline = MultilineSystem
} else if args[1] == "template" {
multiline = MultilineTemplate
}
line := strings.Join(args[2:], " ")
line, ok := strings.CutPrefix(line, `"""`)
if !ok {
multiline = MultilineNone
} else {
// only cut suffix if the line is multiline
line, ok = strings.CutSuffix(line, `"""`)
if ok {
multiline = MultilineNone
}
}
sb.WriteString(line)
if multiline != MultilineNone {
scanner.Prompt.UseAlt = true
continue
}
if args[1] == "system" {
opts.System = sb.String()
fmt.Println("Set system message.")
} else if args[1] == "template" {
opts.Template = sb.String()
fmt.Println("Set prompt template.")
}
sb.Reset()
continue
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
}
} else {
usageSet()
}
case strings.HasPrefix(line, "/show"):
args := strings.Fields(line)
if len(args) > 1 {
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
req := &api.ShowRequest{
Name: opts.Model,
System: opts.System,
Template: opts.Template,
Options: opts.Options,
}
resp, err := client.Show(cmd.Context(), req)
if err != nil {
fmt.Println("error: couldn't get model")
return err
}
switch args[1] {
case "info":
fmt.Println("Model details:")
if len(resp.Details.Families) > 0 {
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
} else if resp.Details.Family != "" {
fmt.Printf("Family %s\n", resp.Details.Family)
}
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
fmt.Println("")
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.")
} else {
fmt.Println(resp.License)
}
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
if resp.Parameters == "" {
fmt.Println("No parameters were specified for this model.")
} else {
if len(opts.Options) > 0 {
fmt.Println("User defined parameters:")
for k, v := range opts.Options {
fmt.Printf("%-*s %v\n", 30, k, v)
}
fmt.Println()
}
fmt.Println("Model defined parameters:")
fmt.Println(resp.Parameters)
}
case "system":
switch {
case opts.System != "":
fmt.Println(opts.System + "\n")
case resp.System != "":
fmt.Println(resp.System + "\n")
default:
fmt.Println("No system message was specified for this model.")
}
case "template":
switch {
case opts.Template != "":
fmt.Println(opts.Template + "\n")
case resp.Template != "":
fmt.Println(resp.Template)
default:
fmt.Println("No prompt template was specified for this model.")
}
default:
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
}
} else {
usageShow()
}
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
args := strings.Fields(line)
if len(args) > 1 {
switch args[1] {
case "set", "/set":
usageSet()
case "show", "/show":
usageShow()
case "shortcut", "shortcuts":
usageShortcuts()
}
} else {
usage()
}
case line == "/exit", line == "/bye":
return nil
case strings.HasPrefix(line, "/"):
args := strings.Fields(line)
isFile := false
if multiModal {
for _, f := range extractFileNames(line) {
if strings.HasPrefix(f, args[0]) {
isFile = true
break
}
}
}
if !isFile {
fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
continue
}
sb.WriteString(line)
default:
sb.WriteString(line)
}
if sb.Len() > 0 && multiline == MultilineNone {
newMessage := api.Message{Role: "user", Content: sb.String()}
if multiModal {
msg, images, err := extractFileData(sb.String())
if err != nil {
return err
}
newMessage.Content = msg
// if the user attached a new image, start a fresh conversation
// context so the model focuses on that image
if len(images) > 0 {
newMessage.Images = append(newMessage.Images, images...)
opts.Messages = []api.Message{}
} else {
if len(opts.Messages) > 1 {
newMessage.Images = append(newMessage.Images, opts.Messages[len(opts.Messages)-2].Images...)
}
}
if len(newMessage.Images) == 0 {
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
fmt.Println()
sb.Reset()
continue
}
}
if opts.System != "" {
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
}
opts.Messages = append(opts.Messages, newMessage)
assistant, err := chat(cmd, opts)
if err != nil {
return err
}
if assistant != nil {
opts.Messages = append(opts.Messages, *assistant)
}
sb.Reset()
}
}
}
func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements
replacements := map[string]string{
"\\ ": " ", // Escaped space
"\\(": "(", // Escaped left parenthesis
"\\)": ")", // Escaped right parenthesis
"\\[": "[", // Escaped left square bracket
"\\]": "]", // Escaped right square bracket
"\\{": "{", // Escaped left curly brace
"\\}": "}", // Escaped right curly brace
"\\$": "$", // Escaped dollar sign
"\\&": "&", // Escaped ampersand
"\\;": ";", // Escaped semicolon
"\\'": "'", // Escaped single quote
"\\\\": "\\", // Escaped backslash
"\\*": "*", // Escaped asterisk
"\\?": "?", // Escaped question mark
}
for escaped, actual := range replacements {
fp = strings.ReplaceAll(fp, escaped, actual)
}
return fp
}
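// For example (illustrative, not part of the source):
//
//	normalizeFilePath(`/home/me/my\ pics/cat\ \(1\).png`)
//	// => "/home/me/my pics/cat (1).png"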
func extractFileNames(input string) []string {
// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
// and followed by more characters and a file extension
// This will capture non filename strings, but we'll check for file existence to remove mismatches
regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
re := regexp.MustCompile(regexPattern)
return re.FindAllString(input, -1)
}
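// For example (illustrative): given the input
//
//	look at ./my\ dir/photo.png and c:/tmp/scan.jpg please
//
// extractFileNames returns both ./my\ dir/photo.png and c:/tmp/scan.jpg;
// callers then run each match through normalizeFilePath before opening it.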
func extractFileData(input string) (string, []api.ImageData, error) {
filePaths := extractFileNames(input)
var imgs []api.ImageData
for _, fp := range filePaths {
nfp := normalizeFilePath(fp)
data, err := getImageData(nfp)
if err != nil {
if os.IsNotExist(err) {
continue
}
fmt.Printf("Couldn't process image: %q\n", err)
return "", imgs, err
}
fmt.Printf("Added image '%s'\n", nfp)
input = strings.ReplaceAll(input, fp, "")
imgs = append(imgs, data)
}
return input, imgs, nil
}
func getImageData(filePath string) ([]byte, error) {
file, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer file.Close()
buf := make([]byte, 512)
_, err = file.Read(buf)
if err != nil {
return nil, err
}
contentType := http.DetectContentType(buf)
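// http.DetectContentType implements the standard MIME sniffing algorithm and
// considers at most the first 512 bytes of data, which is why only 512 bytes
// are read above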
allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
if !slices.Contains(allowedTypes, contentType) {
return nil, fmt.Errorf("invalid image type: %s", contentType)
}
info, err := file.Stat()
if err != nil {
return nil, err
}
// Check if the file size exceeds 100MB
var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
if info.Size() > maxSize {
return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
}
buf = make([]byte, info.Size())
_, err = file.Seek(0, io.SeekStart)
if err != nil {
return nil, err
}
_, err = io.ReadFull(file, buf)
if err != nil {
return nil, err
}
return buf, nil
}

View File

@@ -1,51 +0,0 @@
package cmd
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestExtractFilenames(t *testing.T) {
// Unix style paths
input := ` some preamble
./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2
/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg`
res := extractFileNames(input)
assert.Len(t, res, 5)
assert.Contains(t, res[0], "one.png")
assert.Contains(t, res[1], "two.jpg")
assert.Contains(t, res[2], "three.jpeg")
assert.Contains(t, res[3], "four.png")
assert.Contains(t, res[4], "five.svg")
assert.NotContains(t, res[4], "\"")
assert.NotContains(t, res, "inbetween")
// Windows style paths
input = ` some preamble
c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2
/absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6
d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending
`
res = extractFileNames(input)
assert.Len(t, res, 10)
assert.NotContains(t, res, "inbetween")
assert.Contains(t, res[0], "one.png")
assert.Contains(t, res[0], "c:")
assert.Contains(t, res[1], "two.jpg")
assert.Contains(t, res[1], "c:")
assert.Contains(t, res[2], "three.jpeg")
assert.Contains(t, res[3], "four.png")
assert.Contains(t, res[4], "five.svg")
assert.Contains(t, res[5], "six.png")
assert.Contains(t, res[6], "seven.svg")
assert.Contains(t, res[6], "d:")
assert.Contains(t, res[7], "eight.png")
assert.Contains(t, res[7], "c:")
assert.Contains(t, res[8], "nine.png")
assert.Contains(t, res[8], "d:")
assert.Contains(t, res[9], "ten.svg")
assert.Contains(t, res[9], "E:")
}

View File

@@ -12,7 +12,7 @@ Import models using source model weights found on Hugging Face and similar sites
Installing on Linux in most cases is easy using the script on Ollama.ai. To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**. Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](./docker.md)**.
It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.

View File

@@ -409,7 +409,7 @@ A stream of JSON objects is returned:
"model": "llama2", "model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00", "created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": { "message": {
"role": "assistant", "role": "assisant",
"content": "The", "content": "The",
"images": null "images": null
}, },
@@ -505,7 +505,7 @@ A stream of JSON objects is returned:
"model": "llama2", "model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00", "created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": { "message": {
"role": "assistant", "role": "assisant",
"content": "The" "content": "The"
}, },
"done": false "done": false

View File

@@ -1,9 +1,13 @@
# Development # Development
- Install cmake or (optionally, required tools for GPUs)
- run `go generate ./...`
- run `go build .`
Install required tools: Install required tools:
- cmake version 3.24 or higher - cmake version 3.24 or higher
- go version 1.21 or higher - go version 1.20 or higher
- gcc version 11.4.0 or higher - gcc version 11.4.0 or higher
```bash ```bash
@@ -13,11 +17,7 @@ brew install go cmake gcc
Optionally enable debugging and more verbose logging: Optionally enable debugging and more verbose logging:
```bash ```bash
# At build time
export CGO_CFLAGS="-g" export CGO_CFLAGS="-g"
# At runtime
export OLLAMA_DEBUG=1
``` ```
Get the required libraries and build the native LLM code: Get the required libraries and build the native LLM code:
@@ -38,99 +38,37 @@ Now you can run `ollama`:
./ollama ./ollama
``` ```
### Linux ## Building on Linux with GPU support
#### Linux CUDA (NVIDIA)
### Linux/Windows CUDA (NVIDIA)
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!* *Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
development and runtime packages.
Typically the build scripts will auto-detect CUDA; however, if your Linux distro
or installation approach uses unusual paths, you can specify the locations
explicitly by setting the environment variable `CUDA_LIB_DIR` to the directory
containing the shared libraries, and `CUDACXX` to the location of the nvcc compiler.
Then generate dependencies: Then generate dependencies:
``` ```
go generate ./... go generate ./...
``` ```
Then build the binary: Then build the binary:
``` ```
go build . go build .
``` ```
#### Linux ROCm (AMD) ### Linux ROCm (AMD)
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!* *Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`. Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:
Typically the build scripts will auto-detect ROCm; however, if your Linux distro
or installation approach uses unusual paths, you can specify the locations
explicitly by setting the environment variable `ROCM_PATH` to the ROCm
install location (typically `/opt/rocm`) and `CLBlast_DIR` to the
CLBlast install location (typically `/usr/lib/cmake/CLBlast`).
``` ```
go generate ./... CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
``` ```
Then build the binary: Then build the binary:
``` ```
go build . go build .
``` ```
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root. ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
#### Advanced CPU Settings ## Containerized Build
By default, running `go generate ./...` will compile a few different variations If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.
of the LLM library based on common CPU families and vector math capabilities,
including a lowest-common-denominator which should run on almost any 64 bit CPU
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
load. If you would like to build a CPU-based build customized for your
processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
go build .
```
#### Containerized Linux Build
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
### Windows
Note: The Windows build for Ollama is still under development.
Install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- go version 1.21 or higher
- MinGW (pick one variant) with GCC.
- <https://www.mingw-w64.org/>
- <https://www.msys2.org/>
```powershell
$env:CGO_ENABLED="1"
go generate ./...
go build .
```
#### Windows CUDA (NVIDIA)
In addition to the common Windows development tools described above, install:
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)

View File

@@ -112,3 +112,26 @@ This can impact both installing Ollama, as well as downloading models.
Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernel (WSL)` adapter, right click and select `Properties`. Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernel (WSL)` adapter, right click and select `Properties`.
Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these
properties. properties.
## What does the q in the model tag mean? What is quantization?
Whenever you pull a model without a tag, Ollama will actually pull the q4_0 quantization of the model. You can verify this on the tags page. On https://ollama.ai/library/llama2/tags you can see that the hash for the latest tag matches the hash for the 7b model. ![quant hashes](https://github.com/jmorganca/ollama/assets/633681/814b1b78-8205-4845-89f9-e671b3b96085)
Looking at that page for any model, you can see several quantization options available. Quantization is a method of compression that allows the model to fit in less space and thus use less RAM and VRAM on your machine.
At a high level, a model is made up of an enormous collection of nodes that determine how to generate text. These nodes are connected at different levels with weights. The training process adjusts these weights so that the model outputs the right text.
Most of the source models we use start with weights that are 32bit floating-point numbers. Together, the weights and another set of values called biases make up the model's parameters. So a source model with 7 billion parameters has 7 billion 32bit floating-point numbers, plus a description of all the nodes and more. That adds up to at least 28 Gigabytes of memory needed if you choose to load one of those source models.
Quantization turns those 32bit floating-point weights into much smaller integers. The number next to the q indicates the bit size of the weights. So a q4 model converted those 32bit floats into 4bit integers. A 4bit quantization takes up the space of 7 billion 4bit integers, plus a little overhead, which comes out to almost 4 Gigabytes. Obviously, there is some loss of information going from 28GB down to 4GB, but it turns out that in most cases it isn't really noticeable. In fact, even the 2bit quantization, which fits in less than 3GB, can be very useful.
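To make that arithmetic concrete, here is a small back-of-the-envelope sketch (not part of Ollama; the figures are weights-only estimates that ignore model overhead):

```go
package main

import "fmt"

func main() {
	const params = 7e9 // a 7 billion parameter model
	quants := []struct {
		name string
		bits float64
	}{
		{"fp32 source", 32},
		{"fp16", 16},
		{"q4", 4},
		{"q2", 2},
	}
	for _, q := range quants {
		gb := params * q.bits / 8 / 1e9 // bits -> bytes -> decimal gigabytes
		fmt.Printf("%-12s ~%.1f GB of weights\n", q.name, gb)
	}
}
```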
There are three major sets of quantizations you will see in the Ollama Library of models: **fp16**, models with just a q and a number, like **q4_0**, and models with a **K** in the tag. The **fp16** model has been converted and quantized from the 32bit source down to 16bit. It will be about half the size of the 32bit source model and is the largest quantization we deliver in the library. The **q4_0**, **q4_1**, **q5_0**, etc. models use the two original quantization methods.
The models with a **K** are often referred to as K quants. This method produces models of similar quality at a smaller size than the original methods. Essentially, it finds clusters of weights and quantizes them together, allowing for higher precision while using the same bit sizes as the regular quantization options. The tradeoff is that the model needs a set of maps to recover the original values, which has a computational cost, so you may see some impact on the speed of models with K quants compared to the regular quantizations.
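For example, you can pull a specific quantization by naming its tag explicitly instead of relying on the default (the tag below is illustrative; check a model's tags page for the exact names available):

```
ollama pull llama2:7b-q4_K_M
```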
## What is context, can I increase it, and why doesn't every model support a huge context?
Context refers to the size of the input you can send to a model and still get sensible output back. Many models have a context size of 2048 tokens. It's sometimes possible to give a model more using the **num_ctx** parameter, but the answers start to degrade: when the context fills up, half of it is "freed" to make room for more, so the model loses track of earlier input. Newer models have been able to increase that context size using different methods. This increase in context size results in a corresponding increase in the memory required, sometimes by orders of magnitude.
> [!WARNING]
> Currently, over-allocating context size may result in model quality or stability issues.
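For example, you can experiment with a larger context window from the interactive REPL using the `/set parameter` command shown earlier (the value is illustrative; whether the model copes with it depends on how it was trained):

```
/set parameter num_ctx 4096
```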

View File

@@ -109,9 +109,8 @@ Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr
sudo rm $(which ollama) sudo rm $(which ollama)
``` ```
Remove the downloaded models and Ollama service user and group: Remove the downloaded models and Ollama service user:
```bash ```bash
sudo rm -r /usr/share/ollama sudo rm -r /usr/share/ollama
sudo userdel ollama sudo userdel ollama
sudo groupdel ollama
``` ```

View File

@@ -156,12 +156,11 @@ PARAMETER <parameter> <parametervalue>
#### Template Variables #### Template Variables
| Variable | Description | | Variable | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------- | | --------------- | ------------------------------------------------------------------------------------------------------------- |
| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. | | `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. | | `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
| `{{ .Response }}` | The response from the LLM, if not specified response is appended to the end of the template. | | `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
```modelfile ```modelfile
TEMPLATE """ TEMPLATE """

View File

@@ -16,38 +16,7 @@ If manually running `ollama serve` in a terminal, the logs will be on that termi
Join the [Discord](https://discord.gg/ollama) for help interpreting the logs. Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
## LLM libraries
Ollama includes multiple LLM libraries compiled for different GPUs and CPU
vector features. Ollama tries to pick the best one based on the capabilities of
your system. If this autodetection has problems, or you run into other problems
(e.g. crashes in your GPU), you can work around this by forcing a specific LLM
library. `cpu_avx2` will perform the best, followed by `cpu_avx`; the slowest
but most compatible is `cpu`. Rosetta emulation under macOS will work with the
`cpu` library.
In the server log, you will see a message that looks something like this (varies
from release to release):
```
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
```
**Experimental LLM Library Override**
You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to bypass
autodetection, so for example, if you have a CUDA card, but want to force the
CPU LLM library with AVX2 vector support, use:
```
OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
```
You can see what features your CPU has with the following command:
```
cat /proc/cpuinfo | grep flags | head -1
```
## Known issues ## Known issues
* N/A
* `signal: illegal instruction (core dumped)`: Ollama requires AVX support from the CPU. This was introduced in 2011 and CPUs started offering it in 2012. CPUs from before that, and some lower-end CPUs after that, may not have AVX support and thus are not supported by Ollama. Some users have had luck building Ollama on their own machines in a way that disables the need for AVX.

4
go.mod
View File

@@ -1,6 +1,6 @@
module github.com/jmorganca/ollama module github.com/jmorganca/ollama
go 1.21 go 1.20
require ( require (
github.com/emirpasic/gods v1.18.1 github.com/emirpasic/gods v1.18.1
@@ -45,7 +45,7 @@ require (
golang.org/x/crypto v0.14.0 golang.org/x/crypto v0.14.0
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
golang.org/x/net v0.17.0 // indirect golang.org/x/net v0.17.0 // indirect
golang.org/x/sys v0.13.0 golang.org/x/sys v0.13.0 // indirect
golang.org/x/term v0.13.0 golang.org/x/term v0.13.0
golang.org/x/text v0.13.0 // indirect golang.org/x/text v0.13.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect google.golang.org/protobuf v1.30.0 // indirect

View File

@@ -1,21 +0,0 @@
package gpu
import (
"log/slog"
"golang.org/x/sys/cpu"
)
func GetCPUVariant() string {
if cpu.X86.HasAVX2 {
slog.Info("CPU has AVX2")
return "avx2"
}
if cpu.X86.HasAVX {
slog.Info("CPU has AVX")
return "avx"
}
slog.Info("CPU does not have vector extensions")
// else LCD
return ""
}

View File

@@ -12,13 +12,12 @@ package gpu
import "C" import "C"
import ( import (
"fmt" "fmt"
"log/slog" "log"
"os"
"path/filepath"
"runtime" "runtime"
"strings"
"sync" "sync"
"unsafe" "unsafe"
"github.com/jmorganca/ollama/api"
) )
type handles struct { type handles struct {
@@ -29,83 +28,31 @@ type handles struct {
var gpuMutex sync.Mutex var gpuMutex sync.Mutex
var gpuHandles *handles = nil var gpuHandles *handles = nil
// With our current CUDA compile flags, 5.2 and older will not work properly
const CudaComputeMajorMin = 6
// Possible locations for the nvidia-ml library
var CudaLinuxGlobs = []string{
"/usr/local/cuda/lib64/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
"/usr/lib/wsl/lib/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*",
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
}
var CudaWindowsGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var RocmLinuxGlobs = []string{
"/opt/rocm*/lib*/librocm_smi64.so*",
}
var RocmWindowsGlobs = []string{
"c:\\Windows\\System32\\rocm_smi64.dll",
}
// Note: gpuMutex must already be held // Note: gpuMutex must already be held
func initGPUHandles() { func initGPUHandles() {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
log.Printf("Detecting GPU type")
gpuHandles = &handles{nil, nil} gpuHandles = &handles{nil, nil}
var cudaMgmtName string var resp C.cuda_init_resp_t
var cudaMgmtPatterns []string C.cuda_init(&resp)
var rocmMgmtName string if resp.err != nil {
var rocmMgmtPatterns []string log.Printf("CUDA not detected: %s", C.GoString(resp.err))
switch runtime.GOOS { C.free(unsafe.Pointer(resp.err))
case "windows":
cudaMgmtName = "nvml.dll"
cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
copy(cudaMgmtPatterns, CudaWindowsGlobs)
rocmMgmtName = "rocm_smi64.dll"
rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
copy(rocmMgmtPatterns, RocmWindowsGlobs)
case "linux":
cudaMgmtName = "libnvidia-ml.so"
cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
copy(cudaMgmtPatterns, CudaLinuxGlobs)
rocmMgmtName = "librocm_smi64.so"
rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
copy(rocmMgmtPatterns, RocmLinuxGlobs)
default:
return
}
slog.Info("Detecting GPU type") var resp C.rocm_init_resp_t
cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns) C.rocm_init(&resp)
if len(cudaLibPaths) > 0 { if resp.err != nil {
cuda := LoadCUDAMgmt(cudaLibPaths) log.Printf("ROCm not detected: %s", C.GoString(resp.err))
if cuda != nil { C.free(unsafe.Pointer(resp.err))
slog.Info("Nvidia GPU detected") } else {
gpuHandles.cuda = cuda log.Printf("Radeon GPU detected")
return rocm := resp.rh
} gpuHandles.rocm = &rocm
}
rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
if len(rocmLibPaths) > 0 {
rocm := LoadROCMMgmt(rocmLibPaths)
if rocm != nil {
slog.Info("Radeon GPU detected")
gpuHandles.rocm = rocm
return
} }
} else {
log.Printf("Nvidia GPU detected")
cuda := resp.ch
gpuHandles.cuda = &cuda
} }
} }
@@ -123,52 +70,34 @@ func GetGPUInfo() GpuInfo {
if gpuHandles.cuda != nil { if gpuHandles.cuda != nil {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo) C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil { if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))) log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err)) C.free(unsafe.Pointer(memInfo.err))
} else { } else {
// Verify minimum compute capability resp.Library = "cuda"
var cc C.cuda_compute_capability_t
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major >= CudaComputeMajorMin {
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
} }
} else if gpuHandles.rocm != nil { } else if gpuHandles.rocm != nil {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo) C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil { if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err)) C.free(unsafe.Pointer(memInfo.err))
} else { } else {
resp.Library = "rocm" resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
verString := C.GoString(version.str)
if version.status == 0 {
resp.Variant = "v" + verString
} else {
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
}
C.free(unsafe.Pointer(version.str))
} }
} }
if resp.Library == "" { if resp.Library == "" {
C.cpu_check_ram(&memInfo) C.cpu_check_ram(&memInfo)
resp.Library = "cpu" // In the future we may offer multiple CPU variants to tune CPU features
resp.Variant = GetCPUVariant() if runtime.GOOS == "windows" {
resp.Library = "cpu"
} else {
resp.Library = "default"
}
} }
if memInfo.err != nil { if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err))) log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err)) C.free(unsafe.Pointer(memInfo.err))
return resp return resp
} }
resp.DeviceCount = uint32(memInfo.count)
resp.FreeMemory = uint64(memInfo.free) resp.FreeMemory = uint64(memInfo.free)
resp.TotalMemory = uint64(memInfo.total) resp.TotalMemory = uint64(memInfo.total)
return resp return resp
@@ -192,92 +121,29 @@ func CheckVRAM() (int64, error) {
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") { if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
return int64(gpuInfo.FreeMemory), nil return int64(gpuInfo.FreeMemory), nil
} }
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
} }
func FindGPULibs(baseLibName string, patterns []string) []string { func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them if opts.NumGPU != -1 {
var ldPaths []string return opts.NumGPU
gpuLibPaths := []string{} }
slog.Info(fmt.Sprintf("Searching for GPU management library %s", baseLibName)) info := GetGPUInfo()
if info.Library == "cpu" || info.Library == "default" {
return 0
}
switch runtime.GOOS { /*
case "windows": Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
ldPaths = strings.Split(os.Getenv("PATH"), ";") We can store the model weights and the kv cache in vram,
case "linux": to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file.
ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") */
default: bytesPerLayer := uint64(fileSizeBytes / numLayer)
return gpuLibPaths
}
// Start with whatever we find in the PATH/LD_LIBRARY_PATH
for _, ldPath := range ldPaths {
d, err := filepath.Abs(ldPath)
if err != nil {
continue
}
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
}
slog.Debug(fmt.Sprintf("gpu management search paths: %v", patterns))
for _, pattern := range patterns {
// Ignore glob discovery errors
matches, _ := filepath.Glob(pattern)
for _, match := range matches {
// Resolve any links so we don't try the same lib multiple times
// and weed out any dups across globs
libPath := match
tmp := match
var err error
for ; err == nil; tmp, err = os.Readlink(libPath) {
if !filepath.IsAbs(tmp) {
tmp = filepath.Join(filepath.Dir(libPath), tmp)
}
libPath = tmp
}
new := true
for _, cmp := range gpuLibPaths {
if cmp == libPath {
new = false
break
}
}
if new {
gpuLibPaths = append(gpuLibPaths, libPath)
}
}
}
slog.Info(fmt.Sprintf("Discovered GPU libraries: %v", gpuLibPaths))
return gpuLibPaths
}
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t { // 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
var resp C.cuda_init_resp_t layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
for _, libPath := range cudaLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.cuda_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.ch
}
}
return nil
}
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t { log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Library, numLayer)
var resp C.rocm_init_resp_t
for _, libPath := range rocmLibPaths { return layers
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.rocm_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.rh
}
}
return nil
} }
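The layer-offload heuristic in the hunk above condenses to a few lines. Here is an illustrative sketch under the simplified signature used on the branch side (not the exact source; the clamp is an added guard):

```go
// numGPULayers estimates how many model layers to offload to the GPU,
// assuming layers are roughly uniform in size (model file size / layer count).
// Only ~75% of the theoretical maximum is used, because offloading too many
// layers is a common cause of out-of-memory errors.
func numGPULayers(numLayer, fileSizeBytes, freeVRAM int64) int {
	if numLayer <= 0 || fileSizeBytes <= 0 {
		return 0
	}
	bytesPerLayer := fileSizeBytes / numLayer
	layers := int(freeVRAM/bytesPerLayer) * 3 / 4
	if layers > int(numLayer) { // extra guard, not in the original
		layers = int(numLayer)
	}
	return layers
}
```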

View File

@@ -6,41 +6,21 @@ import "C"
import ( import (
"runtime" "runtime"
"github.com/pbnjay/memory" "github.com/jmorganca/ollama/api"
) )
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) { func CheckVRAM() (int64, error) {
if runtime.GOARCH == "amd64" { // TODO - assume metal, and return free memory?
// gpu not supported, this may not be metal return 0, nil
return 0, nil
}
// on macOS, there's already buffer for available vram (see below) so just return the total
systemMemory := int64(memory.TotalMemory())
// macOS limits how much memory is available to the GPU based on the amount of system memory
// TODO: handle case where iogpu.wired_limit_mb is set to a higher value
if systemMemory <= 36*1024*1024*1024 {
systemMemory = systemMemory * 2 / 3
} else {
systemMemory = systemMemory * 3 / 4
}
return systemMemory, nil
} }
func GetGPUInfo() GpuInfo { func GetGPUInfo() GpuInfo {
// TODO - Metal vs. x86 macs...
mem, _ := getCPUMem() mem, _ := getCPUMem()
if runtime.GOARCH == "amd64" {
return GpuInfo{
Library: "cpu",
Variant: GetCPUVariant(),
memInfo: mem,
}
}
return GpuInfo{ return GpuInfo{
Library: "metal", Library: "default",
memInfo: mem, memInfo: mem,
} }
} }
@@ -49,6 +29,22 @@ func getCPUMem() (memInfo, error) {
return memInfo{ return memInfo{
TotalMemory: 0, TotalMemory: 0,
FreeMemory: 0, FreeMemory: 0,
DeviceCount: 0,
}, nil }, nil
} }
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
// metal only supported on arm64
if runtime.GOARCH == "arm64" {
return 1
}
return 0
}
func nativeInit() error {
return nil
}

View File

@@ -34,7 +34,6 @@ extern "C" {
typedef struct mem_info { typedef struct mem_info {
uint64_t total; uint64_t total;
uint64_t free; uint64_t free;
unsigned int count;
char *err; // If non-null, caller responsible for freeing char *err; // If non-null, caller responsible for freeing
} mem_info_t; } mem_info_t;

View File

@@ -8,7 +8,6 @@ void cpu_check_ram(mem_info_t *resp) {
MEMORYSTATUSEX info; MEMORYSTATUSEX info;
info.dwLength = sizeof(info); info.dwLength = sizeof(info);
if (GlobalMemoryStatusEx(&info) != 0) { if (GlobalMemoryStatusEx(&info) != 0) {
resp->count = 1;
resp->total = info.ullTotalPhys; resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys; resp->free = info.ullAvailPhys;
} else { } else {
@@ -27,7 +26,6 @@ void cpu_check_ram(mem_info_t *resp) {
if (sysinfo(&info) != 0) { if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno)); resp->err = strdup(strerror(errno));
} else { } else {
resp->count = 1;
resp->total = info.totalram * info.mem_unit; resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit; resp->free = info.freeram * info.mem_unit;
} }

View File

@@ -4,9 +4,23 @@
#include <string.h> #include <string.h>
#define CUDA_LOOKUP_SIZE 6 #ifndef _WIN32
const char *cuda_lib_paths[] = {
"libnvidia-ml.so",
"/usr/local/cuda/lib64/libnvidia-ml.so",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
"/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob?
NULL,
};
#else
const char *cuda_lib_paths[] = {
"nvml.dll",
"",
NULL,
};
#endif
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { void cuda_init(cuda_init_resp_t *resp) {
nvmlReturn_t ret; nvmlReturn_t ret;
resp->err = NULL; resp->err = NULL;
const int buflen = 256; const int buflen = 256;
@@ -16,27 +30,29 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
} l[CUDA_LOOKUP_SIZE] = { } l[4] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn}, {"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn}, {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle}, {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo}, {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
}; };
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY); for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
}
if (!resp->ch.handle) { if (!resp->ch.handle) {
// TODO improve error message, as the LOAD_ERR will have typically have the
// final path that was checked which might be confusing.
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
snprintf(buf, buflen, snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s", "Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_path, msg); cuda_lib_paths[0], msg);
free(msg); free(msg);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle); UNLOAD_LIBRARY(resp->ch.handle);
@@ -52,8 +68,6 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
ret = (*resp->ch.initFn)(); ret = (*resp->ch.initFn)();
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret); snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
} }
@@ -75,81 +89,22 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return; return;
} }
ret = (*h.getCount)(&resp->count); // TODO - handle multiple GPUs
ret = (*h.getHandle)(0, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device handle: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
resp->total = 0; ret = (*h.getMemInfo)(device, &memInfo);
resp->free = 0;
for (i = 0; i < resp->count; i++) {
ret = (*h.getHandle)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
resp->total += memInfo.total;
resp->free += memInfo.free;
}
}
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
resp->err = NULL;
resp->major = 0;
resp->minor = 0;
nvmlDevice_t device;
int major = 0;
int minor = 0;
nvmlReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle not initialized");
return;
}
unsigned int devices;
ret = (*h.getCount)(&devices);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
resp->total = memInfo.total;
for (i = 0; i < devices; i++) { resp->free = memInfo.free;
ret = (*h.getHandle)(i, &device); return;
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getComputeCapability)(device, &major, &minor);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
// Report the lowest major.minor we detect as that limits our compatibility
if (resp->major == 0 || resp->major > major ) {
resp->major = major;
resp->minor = minor;
} else if ( resp->major == major && resp->minor > minor ) {
resp->minor = minor;
}
}
} }
#endif // __APPLE__ #endif // __APPLE__

View File

@@ -21,8 +21,6 @@ typedef struct cuda_handle {
nvmlReturn_t (*shutdownFn)(void); nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *); nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *); nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
nvmlReturn_t (*getCount)(unsigned int *);
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
} cuda_handle_t; } cuda_handle_t;
typedef struct cuda_init_resp { typedef struct cuda_init_resp {
@@ -30,15 +28,8 @@ typedef struct cuda_init_resp {
cuda_handle_t ch; cuda_handle_t ch;
} cuda_init_resp_t; } cuda_init_resp_t;
typedef struct cuda_compute_capability { void cuda_init(cuda_init_resp_t *resp);
char *err;
int major;
int minor;
} cuda_compute_capability_t;
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp); void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
#endif // __GPU_INFO_CUDA_H__ #endif // __GPU_INFO_CUDA_H__
#endif // __APPLE__ #endif // __APPLE__

View File

@@ -4,9 +4,22 @@
#include <string.h> #include <string.h>
#define ROCM_LOOKUP_SIZE 5 #ifndef _WIN32
const char *rocm_lib_paths[] = {
"librocm_smi64.so",
"/opt/rocm/lib/librocm_smi64.so",
NULL,
};
#else
// TODO untested
const char *rocm_lib_paths[] = {
"rocm_smi64.dll",
"/opt/rocm/lib/rocm_smi64.dll",
NULL,
};
#endif
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { void rocm_init(rocm_init_resp_t *resp) {
rsmi_status_t ret; rsmi_status_t ret;
resp->err = NULL; resp->err = NULL;
const int buflen = 256; const int buflen = 256;
@@ -15,31 +28,31 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
} l[ROCM_LOOKUP_SIZE] = { } l[4] = {
{"rsmi_init", (void *)&resp->rh.initFn}, {"rsmi_init", (void *)&resp->rh.initFn},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
{"rsmi_version_get", (void *)&resp->rh.versionGetFn},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle }, // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
}; };
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY); for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
}
if (!resp->rh.handle) { if (!resp->rh.handle) {
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
snprintf(buf, buflen, snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n", "Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_path, msg); rocm_lib_paths[0], msg);
free(msg); free(msg);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
for (i = 0; i < ROCM_LOOKUP_SIZE; i++) { for (i = 0; i < 4; i++) {
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle); UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg); msg);
@@ -51,8 +64,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
ret = (*resp->rh.initFn)(0); ret = (*resp->rh.initFn)(0);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL;
snprintf(buf, buflen, "rocm vram init failure: %d", ret); snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
} }
@@ -72,7 +83,7 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
int i; int i;
if (h.handle == NULL) { if (h.handle == NULL) {
resp->err = strdup("rocm handle not initialized"); resp->err = strdup("nvml handle sn't initialized");
return; return;
} }
@@ -99,32 +110,9 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
return; return;
} }
// TODO: set this to the actual number of devices
resp->count = 1;
resp->total = totalMem; resp->total = totalMem;
resp->free = totalMem - usedMem; resp->free = totalMem - usedMem;
return; return;
} }
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("nvml handle not initialized");
resp->status = 1;
return;
}
rsmi_version_t ver;
rsmi_status_t ret;
ret = h.versionGetFn(&ver);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1;
} else {
snprintf(buf, buflen, "%d", ver.major);
resp->status = 0;
}
resp->str = strdup(buf);
}
#endif // __APPLE__ #endif // __APPLE__

View File

@@ -15,20 +15,12 @@ typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_GTT, RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t; } rsmi_memory_type_t;
typedef struct {
uint32_t major;
uint32_t minor;
uint32_t patch;
const char *build;
} rsmi_version_t;
typedef struct rocm_handle { typedef struct rocm_handle {
void *handle; void *handle;
rsmi_status_t (*initFn)(uint64_t); rsmi_status_t (*initFn)(uint64_t);
rsmi_status_t (*shutdownFn)(void); rsmi_status_t (*shutdownFn)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *); // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
} rocm_handle_t; } rocm_handle_t;
@@ -37,14 +29,8 @@ typedef struct rocm_init_resp {
rocm_handle_t rh; rocm_handle_t rh;
} rocm_init_resp_t; } rocm_init_resp_t;
typedef struct rocm_version_resp { void rocm_init(rocm_init_resp_t *resp);
rsmi_status_t status;
char *str; // Contains version or error string if status != 0
} rocm_version_resp_t;
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp); void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
#endif // __GPU_INFO_ROCM_H__ #endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__ #endif // __APPLE__

View File

@@ -9,7 +9,7 @@ import (
func TestBasicGetGPUInfo(t *testing.T) { func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo() info := GetGPUInfo()
assert.Contains(t, "cuda rocm cpu metal", info.Library) assert.Contains(t, "cuda rocm cpu default", info.Library)
switch runtime.GOOS { switch runtime.GOOS {
case "darwin": case "darwin":
@@ -18,7 +18,6 @@ func TestBasicGetGPUInfo(t *testing.T) {
case "linux", "windows": case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0)) assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0)) assert.Greater(t, info.FreeMemory, uint64(0))
assert.Greater(t, info.DeviceCount, uint32(0))
default: default:
return return
} }
@@ -36,6 +35,7 @@ func TestCPUMemInfo(t *testing.T) {
default: default:
return return
} }
} }
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected

View File

@@ -3,7 +3,6 @@ package gpu
type memInfo struct { type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"` TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"` FreeMemory uint64 `json:"free_memory,omitempty"`
DeviceCount uint32 `json:"device_count,omitempty"`
} }
// Beginning of an `ollama info` command // Beginning of an `ollama info` command
@@ -11,8 +10,5 @@ type GpuInfo struct {
memInfo memInfo
Library string `json:"library,omitempty"` Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`
// TODO add other useful attributes about the card here for discovery information // TODO add other useful attributes about the card here for discovery information
} }

View File

@@ -1,11 +1,11 @@
#include "dyn_ext_server.h" #include "dynamic_shim.h"
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#ifdef __linux__ #ifdef __linux__
#include <dlfcn.h> #include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) #define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) #define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror()) #define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle) #define UNLOAD_LIBRARY(handle) dlclose(handle)
@@ -33,7 +33,7 @@ inline char *LOAD_ERR() {
#define UNLOAD_LIBRARY(handle) dlclose(handle) #define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif #endif
void dyn_init(const char *libPath, struct dynamic_llama_server *s, void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) { ext_server_resp_t *err) {
int i = 0; int i = 0;
struct lookup { struct lookup {
@@ -58,8 +58,8 @@ void dyn_init(const char *libPath, struct dynamic_llama_server *s,
{"", NULL}, {"", NULL},
}; };
printf("loading library %s\n", libPath); printf("Lazy loading %s library\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_GLOBAL|RTLD_NOW); s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
if (!s->handle) { if (!s->handle) {
err->id = -1; err->id = -1;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
@@ -83,63 +83,63 @@ void dyn_init(const char *libPath, struct dynamic_llama_server *s,
} }
} }
inline void dyn_llama_server_init(struct dynamic_llama_server s, inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams, ext_server_params_t *sparams,
ext_server_resp_t *err) { ext_server_resp_t *err) {
s.llama_server_init(sparams, err); s.llama_server_init(sparams, err);
} }
inline void dyn_llama_server_start(struct dynamic_llama_server s) { inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start(); s.llama_server_start();
} }
inline void dyn_llama_server_stop(struct dynamic_llama_server s) { inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop(); s.llama_server_stop();
} }
inline void dyn_llama_server_completion(struct dynamic_llama_server s, inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
const char *json_req, const char *json_req,
ext_server_resp_t *resp) { ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp); s.llama_server_completion(json_req, resp);
} }
inline void dyn_llama_server_completion_next_result( inline void dynamic_shim_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id, struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) { ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result); s.llama_server_completion_next_result(task_id, result);
} }
inline void dyn_llama_server_completion_cancel( inline void dynamic_shim_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) { struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err); s.llama_server_completion_cancel(task_id, err);
} }
inline void dyn_llama_server_release_task_result( inline void dynamic_shim_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) { struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result); s.llama_server_release_task_result(result);
} }
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s, inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, const char *json_req,
char **json_resp, char **json_resp,
ext_server_resp_t *err) { ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err); s.llama_server_tokenize(json_req, json_resp, err);
} }
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s, inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req, const char *json_req,
char **json_resp, char **json_resp,
ext_server_resp_t *err) { ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err); s.llama_server_detokenize(json_req, json_resp, err);
} }
inline void dyn_llama_server_embedding(struct dynamic_llama_server s, inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, const char *json_req,
char **json_resp, char **json_resp,
ext_server_resp_t *err) { ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err); s.llama_server_embedding(json_req, json_resp, err);
} }
inline void dyn_llama_server_release_json_resp( inline void dynamic_shim_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) { struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp); s.llama_server_release_json_resp(json_resp);
} }

View File

@@ -27,46 +27,46 @@ struct dynamic_llama_server {
void (*llama_server_release_json_resp)(char **json_resp); void (*llama_server_release_json_resp)(char **json_resp);
}; };
void dyn_init(const char *libPath, struct dynamic_llama_server *s, void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err); ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection // No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s, void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams, ext_server_params_t *sparams,
ext_server_resp_t *err); ext_server_resp_t *err);
void dyn_llama_server_start(struct dynamic_llama_server s); void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s); void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_completion(struct dynamic_llama_server s, void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
const char *json_req, const char *json_req,
ext_server_resp_t *resp); ext_server_resp_t *resp);
void dyn_llama_server_completion_next_result( void dynamic_shim_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id, struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result); ext_server_task_result_t *result);
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s, void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id, const int task_id,
ext_server_resp_t *err); ext_server_resp_t *err);
void dyn_llama_server_release_task_result( void dynamic_shim_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result); struct dynamic_llama_server s, ext_server_task_result_t *result);
void dyn_llama_server_tokenize(struct dynamic_llama_server s, void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp, const char *json_req, char **json_resp,
ext_server_resp_t *err); ext_server_resp_t *err);
void dyn_llama_server_detokenize(struct dynamic_llama_server s, void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req, const char *json_req,
char **json_resp, char **json_resp,
ext_server_resp_t *err); ext_server_resp_t *err);
void dyn_llama_server_embedding(struct dynamic_llama_server s, void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp, const char *json_req, char **json_resp,
ext_server_resp_t *err); ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s, void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp); char **json_resp);
#ifdef __cplusplus #ifdef __cplusplus
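The `dyn_llama_server_*` wrappers above exist because cgo cannot invoke a C function pointer directly: each pointer resolved from the dynamic library gets a plain C function that Go can call, with the indirection inlined on the C side. A minimal, self-contained sketch of that pattern (the names here are illustrative, not the repository's own):

```go
package main

/*
// cgo cannot call a C function pointer directly, so each pointer held in
// the struct gets a plain C wrapper function that Go calls instead.
typedef int (*add_fn)(int, int);

struct fn_table {
	add_fn add;
};

static int impl_add(int a, int b) { return a + b; }

static void load_table(struct fn_table *t) {
	// in the real facade this would come from dlsym()/GetProcAddress
	// on the dynamically loaded library
	t->add = impl_add;
}

static int call_add(struct fn_table t, int a, int b) {
	return t.add(a, b); // the inlined indirection
}
*/
import "C"
import "fmt"

func main() {
	var t C.struct_fn_table
	C.load_table(&t)
	fmt.Println(C.call_add(t, 2, 3)) // prints 5
}
```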


@@ -1,25 +0,0 @@
# Ollama-specific CMakefile to include in llama.cpp/examples/server
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
if (WIN32)
add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
else()
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
endif()
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(${TARGET} PRIVATE nvml)
endif()
endif()


@@ -1,18 +0,0 @@
# Extern C Server
This directory contains a thin facade layered on top of the llama.cpp server that
exposes `extern C` interfaces, so the functionality can be reached through direct
in-process API calls. The llama.cpp code uses compile-time macros to configure the
GPU type along with other settings. During `go generate ./...`, the build generates
one or more copies of the llama.cpp `extern C` server based on which GPU libraries
are detected, covering multiple GPU types as well as CPU-only operation. The Ollama
go build then embeds these different servers so the right one can be used for the
GPU and settings present at runtime (see the embed sketch after the example below).
If you are making changes to the code in this directory, make sure to disable
caching during your go build so that you pick up your changes. A typical iteration
cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```
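The embedding step described above can be pictured with Go's standard `embed` package. This is only a sketch under assumed paths — the glob pattern, variable, and helper below are hypothetical, not the repository's actual code:

```go
package llm

import (
	"embed"
	"os"
	"path/filepath"
)

// Hypothetical embed of the generated server libraries; the real glob
// patterns and layout in the repository may differ.
//
//go:embed llama.cpp/build/*/*/lib/*
var libEmbed embed.FS

// extractLib copies one embedded library into dir so the dynamic loader
// can open it at runtime.
func extractLib(name, dir string) (string, error) {
	data, err := libEmbed.ReadFile(name)
	if err != nil {
		return "", err
	}
	dst := filepath.Join(dir, filepath.Base(name))
	if err := os.WriteFile(dst, data, 0o755); err != nil {
		return "", err
	}
	return dst, nil
}
```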


@@ -1,45 +1,63 @@
package llm package llm
/* /*
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server #cgo CFLAGS: -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE #cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE #cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate #cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libcommon.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libext_server.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libllama.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE #cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libggml_static.a
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread #cgo linux windows LDFLAGS: -lpthread
#include <stdlib.h> #include <stdlib.h>
#include "dyn_ext_server.h" #include "ext_server.h"
*/ */
import "C" import "C"
import ( import (
"bytes" "bytes"
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"log/slog" "log"
"os" "os"
"path/filepath"
"runtime"
"strings" "strings"
"sync" "sync"
"time" "time"
"unsafe" "unsafe"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
) )
type dynExtServer struct { type extServer interface {
s C.struct_dynamic_llama_server LLM
options api.Options llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
llama_server_start()
llama_server_stop()
llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
llama_server_release_task_result(result *C.ext_server_task_result_t)
llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_release_json_resp(json_resp **C.char)
} }
// Note: current implementation does not support concurrent instantiations // Note: current implementation does not support concurrent instantiations
@@ -64,39 +82,25 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg)) return fmt.Errorf(C.GoString(resp.msg))
} }
// Note: current implementation does not support concurrent instantiations func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
var llm *dynExtServer
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if !mutex.TryLock() { if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete") log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock() mutex.Lock()
} }
updatePath(filepath.Dir(library)) fileInfo, err := os.Stat(model)
libPath := C.CString(library) if err != nil {
defer C.free(unsafe.Pointer(libPath)) return nil, err
resp := newExtServerResp(512)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dyn_init(libPath, &srv, &resp)
if resp.id < 0 {
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
} }
llm = &dynExtServer{
s: srv,
options: opts,
}
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
var sparams C.ext_server_params_t var sparams C.ext_server_params_t
sparams.model = C.CString(model) sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model)) defer C.free(unsafe.Pointer(sparams.model))
numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)
sparams.embedding = true sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx) sparams.n_ctx = C.uint(opts.NumCtx)
sparams.n_batch = C.uint(opts.NumBatch) sparams.n_batch = C.uint(opts.NumBatch)
sparams.n_gpu_layers = C.int(opts.NumGPU) sparams.n_gpu_layers = C.int(numGPU)
sparams.main_gpu = C.int(opts.MainGPU) sparams.main_gpu = C.int(opts.MainGPU)
sparams.n_parallel = 1 // TODO - wire up concurrency sparams.n_parallel = 1 // TODO - wire up concurrency
@@ -136,20 +140,20 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
sparams.n_threads = C.uint(opts.NumThread) sparams.n_threads = C.uint(opts.NumThread)
slog.Info("Initializing llama server") log.Printf("Initializing internal llama server")
initResp := newExtServerResp(128) resp := newExtServerResp(128)
defer freeExtServerResp(initResp) defer freeExtServerResp(resp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp) server.llama_server_init(&sparams, &resp)
if initResp.id < 0 { if resp.id < 0 {
return nil, extServerResponseToErr(initResp) return nil, extServerResponseToErr(resp)
} }
slog.Info("Starting llama main loop") log.Printf("Starting internal llama main loop")
C.dyn_llama_server_start(llm.s) server.llama_server_start()
return llm, nil return server, nil
} }
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128) resp := newExtServerResp(128)
defer freeExtServerResp(resp) defer freeExtServerResp(resp)
var imageData []ImageData var imageData []ImageData
@@ -158,7 +162,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
imageData = append(imageData, ImageData{Data: i, ID: cnt}) imageData = append(imageData, ImageData{Data: i, ID: cnt})
} }
} }
slog.Info(fmt.Sprintf("loaded %d images", len(imageData))) log.Printf("loaded %d images", len(imageData))
request := map[string]any{ request := map[string]any{
"prompt": predict.Prompt, "prompt": predict.Prompt,
@@ -181,6 +185,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
"seed": predict.Options.Seed, "seed": predict.Options.Seed,
"stop": predict.Options.Stop, "stop": predict.Options.Stop,
"image_data": imageData, "image_data": imageData,
"cache_prompt": true,
} }
if predict.Format == "json" { if predict.Format == "json" {
@@ -206,7 +211,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
req := C.CString(buffer.String()) req := C.CString(buffer.String())
defer C.free(unsafe.Pointer(req)) defer C.free(unsafe.Pointer(req))
C.dyn_llama_server_completion(llm.s, req, &resp) llm.llama_server_completion(req, &resp)
if resp.id < 0 { if resp.id < 0 {
return extServerResponseToErr(resp) return extServerResponseToErr(resp)
} }
@@ -217,7 +222,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
select { select {
case <-ctx.Done(): case <-ctx.Done():
// This handles the request cancellation // This handles the request cancellation
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp) llm.llama_server_completion_cancel(resp.id, &resp)
if resp.id < 0 { if resp.id < 0 {
return extServerResponseToErr(resp) return extServerResponseToErr(resp)
} else { } else {
@@ -225,13 +230,13 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
} }
default: default:
var result C.ext_server_task_result_t var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result) llm.llama_server_completion_next_result(resp.id, &result)
json_resp := C.GoString(result.json_resp) json_resp := C.GoString(result.json_resp)
C.dyn_llama_server_release_task_result(llm.s, &result) llm.llama_server_release_task_result(&result)
var p prediction var p prediction
if err := json.Unmarshal([]byte(json_resp), &p); err != nil { if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp) llm.llama_server_completion_cancel(resp.id, &resp)
if resp.id < 0 { if resp.id < 0 {
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg)) return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
} else { } else {
@@ -272,7 +277,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
return fmt.Errorf("max retries exceeded") return fmt.Errorf("max retries exceeded")
} }
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt}) data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil { if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err) return nil, fmt.Errorf("marshaling encode data: %w", err)
@@ -282,11 +287,11 @@ func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, erro
var json_resp *C.char var json_resp *C.char
resp := newExtServerResp(128) resp := newExtServerResp(128)
defer freeExtServerResp(resp) defer freeExtServerResp(resp)
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp) llm.llama_server_tokenize(req, &json_resp, &resp)
if resp.id < 0 { if resp.id < 0 {
return nil, extServerResponseToErr(resp) return nil, extServerResponseToErr(resp)
} }
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) defer llm.llama_server_release_json_resp(&json_resp)
var encoded TokenizeResponse var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil { if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
@@ -296,7 +301,7 @@ func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, erro
return encoded.Tokens, err return encoded.Tokens, err
} }
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) { func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 { if len(tokens) == 0 {
return "", nil return "", nil
} }
@@ -310,11 +315,11 @@ func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, erro
var json_resp *C.char var json_resp *C.char
resp := newExtServerResp(128) resp := newExtServerResp(128)
defer freeExtServerResp(resp) defer freeExtServerResp(resp)
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp) llm.llama_server_detokenize(req, &json_resp, &resp)
if resp.id < 0 { if resp.id < 0 {
return "", extServerResponseToErr(resp) return "", extServerResponseToErr(resp)
} }
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) defer llm.llama_server_release_json_resp(&json_resp)
var decoded DetokenizeResponse var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil { if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
@@ -324,7 +329,7 @@ func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, erro
return decoded.Content, err return decoded.Content, err
} }
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input}) data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil { if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err) return nil, fmt.Errorf("error marshaling embed data: %w", err)
@@ -335,11 +340,11 @@ func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64
var json_resp *C.char var json_resp *C.char
resp := newExtServerResp(128) resp := newExtServerResp(128)
defer freeExtServerResp(resp) defer freeExtServerResp(resp)
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp) llm.llama_server_embedding(req, &json_resp, &resp)
if resp.id < 0 { if resp.id < 0 {
return nil, extServerResponseToErr(resp) return nil, extServerResponseToErr(resp)
} }
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) defer llm.llama_server_release_json_resp(&json_resp)
var embedding EmbeddingResponse var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil { if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
@@ -349,29 +354,7 @@ func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64
return embedding.Embedding, nil return embedding.Embedding, nil
} }
func (llm *dynExtServer) Close() { func close(llm extServer) {
C.dyn_llama_server_stop(llm.s) llm.llama_server_stop()
mutex.Unlock() mutex.Unlock()
} }
func updatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

llm/ext_server_default.go Normal file

@@ -0,0 +1,80 @@
//go:build !windows
package llm
/*
#include <stdlib.h>
#include "ext_server.h"
*/
import "C"
import (
"context"
"github.com/jmorganca/ollama/api"
)
type llamaExtServer struct {
api.Options
}
func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.llama_server_init(sparams, err)
}
func (llm *llamaExtServer) llama_server_start() {
C.llama_server_start()
}
func (llm *llamaExtServer) llama_server_stop() {
C.llama_server_stop()
}
func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.llama_server_completion(json_req, resp)
}
func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.llama_server_completion_next_result(task_id, resp)
}
func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.llama_server_completion_cancel(task_id, err)
}
func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.llama_server_release_task_result(result)
}
func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_tokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_detokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_embedding(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.llama_server_release_json_resp(json_resp)
}
func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
server := &llamaExtServer{opts}
return newExtServer(server, model, adapters, projectors, numLayers, opts)
}
func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(ctx, llm, pred, fn)
}
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func (llm *llamaExtServer) Close() {
close(llm)
}

llm/ext_server_windows.go Normal file

@@ -0,0 +1,12 @@
package llm
import (
"github.com/jmorganca/ollama/api"
)
func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
// On Windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies.	// On Windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies.
// This ensures we can update the PATH at runtime so everything gets loaded.	// This ensures we can update the PATH at runtime so everything gets loaded.
return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, numLayers, opts)
}
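Resolving the library lazily is what makes this safe: the process can start, adjust PATH, and only then touch the DLL. A rough sketch of lazy loading with Go's standard library — the DLL and symbol names here are hypothetical:

```go
//go:build windows

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// Nothing is resolved at process start, so startup cannot fail on a
	// missing DLL, and PATH can be adjusted before the first use.
	dll := syscall.NewLazyDLL("ext_server.dll") // hypothetical DLL name
	start := dll.NewProc("llama_server_start")  // hypothetical symbol

	if err := dll.Load(); err != nil {
		fmt.Println("library not available:", err)
		return
	}
	start.Call()
}
```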


@@ -1,100 +0,0 @@
# common logic across linux and darwin
init_vars() {
case "${GOARCH}" in
"amd64")
ARCH="x86_64"
;;
"arm64")
ARCH="arm64"
;;
*)
ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
esac
LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS=""
CMAKE_TARGETS="--target ext_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
fi
case $(uname -s) in
"Darwin")
LIB_EXT="dylib"
WHOLE_ARCHIVE="-Wl,-force_load"
NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}"
;;
"Linux")
LIB_EXT="so"
WHOLE_ARCHIVE="-Wl,--whole-archive"
NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
# Cross-compiling not supported on linux - use docker
GCC_ARCH=""
;;
*)
;;
esac
}
git_module_setup() {
if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
echo "Skipping submodule initialization"
return
fi
# Make sure the tree is clean after the directory moves
if [ -d "${LLAMACPP_DIR}/gguf" ]; then
echo "Cleaning up old submodule"
rm -rf ${LLAMACPP_DIR}
fi
git submodule init
git submodule update --force ${LLAMACPP_DIR}
}
apply_patches() {
# Wire up our CMakefile
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
fi
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
}
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
-lpthread -ldl -lm \
${EXTRA_LIBS}
}
compress_libs() {
echo "Compressing payloads to reduce overall binary size..."
pids=""
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
gzip --best ${lib} &
pids+=" $!"
done
echo
for pid in ${pids}; do
wait $pid
done
echo "Finished compression"
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
}
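Since `compress_libs` gzips every payload to shrink the final binary, the runtime side must decompress a library before it can be loaded. A sketch of that counterpart, with a hypothetical function name and paths:

```go
package main

import (
	"compress/gzip"
	"io"
	"os"
)

// gunzipLib decompresses one gzip'd library payload so the dynamic
// loader can open it; the runtime counterpart to compress_libs above.
func gunzipLib(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	gz, err := gzip.NewReader(in)
	if err != nil {
		return err
	}
	defer gz.Close()

	out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
	if err != nil {
		return err
	}
	defer out.Close()

	_, err = io.Copy(out, gz)
	return err
}
```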


@@ -1,66 +0,0 @@
#!/bin/bash
# This script is intended to run inside go generate;
# the working directory must be ./llm/generate/
# TODO - add hardening to detect missing tools (cmake, etc.)
set -ex
set -o pipefail
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=off"
case "${GOARCH}" in
"amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
compress_libs
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
;;
"arm64")
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
compress_libs
;;
*)
echo "GOARCH must be set"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
cleanup


@@ -1,180 +0,0 @@
#!/bin/bash
# This script is intended to run inside go generate;
# the working directory must be llm/generate/
# First we build one or more CPU-based LLM libraries
#
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCm
# libraries are quite large, and they also dynamically load data files at runtime
# which in turn are large, so we don't attempt to carry them as payload
set -ex
set -o pipefail
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
GPU_LIST=(
"gfx803"
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
"gfx1102"
)
(
IFS=$';'
echo "'${GPU_LIST[*]}'"
)
}
echo "Starting linux generate script"
if [ -z "${CUDACXX}" ]; then
if [ -x /usr/local/cuda/bin/nvcc ]; then
export CUDACXX=/usr/local/cuda/bin/nvcc
else
# Try the default location in case it exists
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
compress_libs
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# Note: the following seem to yield slower results than AVX2 - ymmv
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
compress_libs
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
fi
else
echo "Skipping CPU generation step as requested"
fi
# If needed, look for the default CUDA toolkit location
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
CUDA_LIB_DIR=/usr/local/cuda/lib64
fi
# If needed, look for CUDA on Arch Linux
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi
if [ -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
# Carry the CUDA libs as payloads to help reduce dependency burden on users
#
# TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
fi
done
compress_libs
fi
if [ -z "${ROCM_PATH}" ]; then
# Try the default location in case it exists
ROCM_PATH=/opt/rocm
fi
if [ -z "${CLBlast_DIR}" ]; then
# Try the default location in case it exists
if [ -d /usr/lib/cmake/CLBlast ]; then
export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
fi
if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Note: the ROCm libs and runtime library files are too large to embed, so we depend on
# them being present at runtime on the host
compress_libs
fi
cleanup


@@ -1,147 +0,0 @@
#!powershell
$ErrorActionPreference = "Stop"
function init_vars {
$script:llamacppDir = "../llama.cpp"
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A","x64")
$script:cmakeTargets = @("ext_server")
$script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
$script:config = "RelWithDebInfo"
} else {
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
$script:config = "Release"
}
# Try to find the CUDA dir
if ($env:CUDA_LIB_DIR -eq $null) {
$d=(get-command -ea 'silentlycontinue' nvcc).path
if ($d -ne $null) {
$script:CUDA_LIB_DIR=($d| split-path -parent)
}
} else {
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
}
function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& git submodule update --force "${script:llamacppDir}"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
}
# Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main('
Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
}
function build {
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function install {
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
if ($script:DUMPBIN -ne $null) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
}
function compress_libs {
if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files"
return
}
write-host "Compressing dlls..."
$libs = dir "${script:buildDir}/lib/*.dll"
foreach ($file in $libs) {
& "$script:GZIP" --best $file
}
}
function cleanup {
Set-Location "${script:llamacppDir}/examples/server"
git checkout CMakeLists.txt server.cpp
}
init_vars
git_module_setup
apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
compress_libs
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
compress_libs
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
compress_libs
if ($null -ne $script:CUDA_LIB_DIR) {
# Then build cuda as a dynamically loaded library
$nvcc = (get-command -ea 'silentlycontinue' nvcc)
if ($null -ne $nvcc) {
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
}
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
build
install
cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
cp "${script:CUDA_LIB_DIR}/cublas64_*.dll" "${script:buildDir}/lib"
cp "${script:CUDA_LIB_DIR}/cublasLt64_*.dll" "${script:buildDir}/lib"
compress_libs
}
# TODO - actually implement ROCm support on windows
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
echo $null >> "${script:buildDir}/lib/.generated"
cleanup
write-host "`ngo generate completed"


@@ -78,12 +78,7 @@ type model interface {
ModelFamily() string ModelFamily() string
ModelType() string ModelType() string
FileType() string FileType() string
NumLayers() uint32 NumLayers() int64
NumGQA() uint32
NumEmbed() uint32
NumHead() uint32
NumHeadKv() uint32
NumCtx() uint32
} }
type container interface { type container interface {
@@ -99,9 +94,9 @@ func (c *containerLORA) Name() string {
return "ggla" return "ggla"
} }
func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) { func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
var version uint32 var version uint32
binary.Read(rso, binary.LittleEndian, &version) binary.Read(ro, binary.LittleEndian, &version)
switch version { switch version {
case 1: case 1:
@@ -112,7 +107,7 @@ func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
c.version = version c.version = version
// remaining file contents aren't decoded // remaining file contents aren't decoded
rso.Seek(0, io.SeekEnd) ro.Seek(0, io.SeekEnd)
return nil, nil return nil, nil
} }


@@ -272,58 +272,14 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
return nil return nil
} }
func (llm *ggufModel) NumLayers() uint32 { func (llm *ggufModel) NumLayers() int64 {
value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())] value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
if !exists { if !exists {
return 0 return 0
} }
return value.(uint32) v := value.(uint32)
} return int64(v)
func (llm *ggufModel) NumHead() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumEmbed() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumHeadKv() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumCtx() uint32 {
value, exists := llm.kv[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
if !exists {
return 0
}
return value.(uint32)
}
func (llm *ggufModel) NumGQA() uint32 {
numHeadKv := llm.NumHeadKv()
if numHeadKv == 0 {
return 0
}
return llm.NumHead() / numHeadKv
} }
func (llm ggufModel) readU8(r io.Reader) uint8 { func (llm ggufModel) readU8(r io.Reader) uint8 {

Submodule llm/llama.cpp deleted from 584d674be6


@@ -0,0 +1,29 @@
# Ollama-specific CMakefile to include in llama.cpp/examples/server
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
add_library(${TARGET} STATIC ../../../ext_server.cpp)
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
if (BUILD_SHARED_LIBS)
set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(ext_server_shared SHARED $<TARGET_OBJECTS:ext_server>)
target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS ext_server_shared LIBRARY)
endif()
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(ext_server_shared PRIVATE nvml)
endif()
endif()


@@ -12,7 +12,6 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
#if SERVER_VERBOSE != 1 #if SERVER_VERBOSE != 1
log_disable(); log_disable();
#endif #endif
LOG_TEE("system info: %s", llama_print_system_info());
assert(err != NULL && sparams != NULL); assert(err != NULL && sparams != NULL);
err->id = 0; err->id = 0;
err->msg[0] = '\0'; err->msg[0] = '\0';
@@ -47,13 +46,9 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
params.model = sparams->model; params.model = sparams->model;
} }
if (sparams->lora_adapters != NULL) { for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) {
la = la->next) { params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
} }
if (sparams->mmproj != NULL) { if (sparams->mmproj != NULL) {
@@ -115,10 +110,6 @@ void llama_server_stop() {
// TODO - too verbose, remove once things are solid // TODO - too verbose, remove once things are solid
LOG_TEE("requesting llama server shutdown\n"); LOG_TEE("requesting llama server shutdown\n");
ext_server_running = false; ext_server_running = false;
// unblocks the update_slots() loop so it can clean up and exit
llama->request_cancel(0);
ext_server_thread.join(); ext_server_thread.join();
delete llama; delete llama;
llama = NULL; llama = NULL;


@@ -0,0 +1,53 @@
# common logic across linux and darwin
init_vars() {
LLAMACPP_DIR=gguf
PATCHES="0001-Expose-callable-API-for-server.patch"
CMAKE_DEFS=""
CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on"
else
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off"
fi
}
git_module_setup() {
if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
echo "Skipping submodule initialization"
return
fi
git submodule init
git submodule update --force gguf
}
apply_patches() {
# Wire up our CMakefile
if ! grep ollama gguf/examples/server/CMakeLists.txt; then
echo 'include (../../../CMakeLists.txt) # ollama' >>gguf/examples/server/CMakeLists.txt
fi
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <./gguf/examples/server/server.cpp >./gguf/examples/server/server.cpp.tmp &&
mv ./gguf/examples/server/server.cpp.tmp ./gguf/examples/server/server.cpp
}
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
}
install() {
rm -rf ${BUILD_DIR}/lib
mkdir -p ${BUILD_DIR}/lib
cp ${BUILD_DIR}/examples/server/libext_server.a ${BUILD_DIR}/lib
cp ${BUILD_DIR}/common/libcommon.a ${BUILD_DIR}/lib
cp ${BUILD_DIR}/libllama.a ${BUILD_DIR}/lib
cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd gguf/examples/server/ && git checkout CMakeLists.txt server.cpp)
}

llm/llama.cpp/gen_darwin.sh Executable file

@@ -0,0 +1,32 @@
#!/bin/bash
# This script is intended to run inside go generate;
# the working directory must be ../llm/llama.cpp
# TODO - add hardening to detect missing tools (cmake, etc.)
set -ex
set -o pipefail
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
BUILD_DIR="gguf/build/darwin/metal"
case "${GOARCH}" in
"amd64")
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
;;
"arm64")
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}"
;;
*)
echo "GOARCH must be set"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
git_module_setup
apply_patches
build
install
cleanup

llm/llama.cpp/gen_linux.sh Executable file

@@ -0,0 +1,116 @@
#!/bin/bash
# This script is intended to run inside go generate;
# the working directory must be llm/llama.cpp
# First we build our default built-in library which will be linked into the CGO
# binary as a normal dependency. This default build is CPU-based.
#
# Then we build a CUDA dynamic library (although statically linked with the CUDA
# library dependencies for maximum portability)
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. It is particularly
# important for ROCm to be a dynamic lib, even if it's the only GPU library detected,
# because we can't redistribute the object files and must rely on dynamic libraries
# at runtime, which could prevent the server from starting if they aren't present.
set -ex
set -o pipefail
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
GPU_LIST=(
"gfx803"
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
"gfx1102"
)
(
IFS=$';'
echo "'${GPU_LIST[*]}'"
)
}
echo "Starting linux generate script"
if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
export CUDACXX=/usr/local/cuda/bin/nvcc
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
#
# CPU first for the default library
#
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="gguf/build/linux/cpu"
build
install
# Placeholder to keep go embed happy until we start building dynamic CPU lib variants
touch ${BUILD_DIR}/lib/dummy.so
if [ -d /usr/local/cuda/lib64/ ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="gguf/build/linux/cuda"
CUDA_LIB_DIR=/usr/local/cuda/lib64
build
install
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/lib/libext_server.a \
${BUILD_DIR}/lib/libcommon.a \
${BUILD_DIR}/lib/libllama.a \
-Wl,--no-whole-archive \
${CUDA_LIB_DIR}/libcudart_static.a \
${CUDA_LIB_DIR}/libcublas_static.a \
${CUDA_LIB_DIR}/libcublasLt_static.a \
${CUDA_LIB_DIR}/libcudadevrt.a \
${CUDA_LIB_DIR}/libculibos.a \
-lrt -lpthread -ldl -lstdc++ -lm
fi
if [ -z "${ROCM_PATH}" ]; then
# Try the default location in case it exists
ROCM_PATH=/opt/rocm
fi
if [ -z "${CLBlast_DIR}" ]; then
# Try the default location in case it exists
if [ -d /usr/lib/cmake/CLBlast ]; then
export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
fi
if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="gguf/build/linux/rocm"
build
install
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/lib/libext_server.a \
${BUILD_DIR}/lib/libcommon.a \
${BUILD_DIR}/lib/libllama.a \
-Wl,--no-whole-archive \
-lrt -lpthread -ldl -lstdc++ -lm \
-L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
-Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
-lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
fi
cleanup


@@ -0,0 +1,87 @@
#!powershell
$ErrorActionPreference = "Stop"
function init_vars {
$script:patches = @("0001-Expose-callable-API-for-server.patch")
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
$script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
$script:config = "RelWithDebInfo"
} else {
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
$script:config = "Release"
}
}
function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& git submodule update --force gguf
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "gguf/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "gguf/examples/server/CMakeLists.txt" -Value 'include (../../../CMakeLists.txt) # ollama'
}
# Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "./gguf/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main('
Set-Content -Path "./gguf/examples/server/server.cpp" -Value $content
}
function build {
write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S gguf -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function install {
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
dumpbin /dependents "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" | select-string ".dll"
}
function cleanup {
Set-Location "gguf/examples/server"
git checkout CMakeLists.txt server.cpp
}
init_vars
git_module_setup
apply_patches
# first build CPU based
$script:buildDir="gguf/build/windows/cpu"
build
install
# Then build cuda as a dynamically loaded library
init_vars
$script:buildDir="gguf/build/windows/cuda"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
build
install
# TODO - actually implement ROCm support on windows
$script:buildDir="gguf/build/windows/rocm"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
echo $null >> "${script:buildDir}/lib/.generated"
cleanup
write-host "`ngo generate completed"


@@ -1,3 +1,3 @@
package generate package llm
//go:generate sh ./gen_darwin.sh //go:generate sh ./gen_darwin.sh


@@ -1,3 +1,3 @@
package generate package llm
//go:generate bash ./gen_linux.sh //go:generate bash ./gen_linux.sh


@@ -1,3 +1,3 @@
package generate package llm
//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1 //go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1

llm/llama.cpp/gguf Submodule

Submodule llm/llama.cpp/gguf added at 328b83de23


@@ -1,11 +1,18 @@
package llm package llm
import ( import (
"bytes"
"context"
_ "embed" _ "embed"
"errors"
"fmt" "fmt"
"os"
"os/exec"
"sync"
"time" "time"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
) )
const jsonGrammar = ` const jsonGrammar = `
@@ -36,12 +43,109 @@ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
ws ::= ([ \t\n] ws)? ws ::= ([ \t\n] ws)?
` `
type llamaModel struct {
hyperparameters llamaHyperparameters
}
func (llm *llamaModel) ModelFamily() string {
return "llama"
}
func llamaModelType(numLayer uint32) string {
switch numLayer {
case 26:
return "3B"
case 32:
return "7B"
case 40:
return "13B"
case 48:
return "34B"
case 60:
return "30B"
case 80:
return "65B"
default:
return "unknown"
}
}
func (llm *llamaModel) ModelType() string {
return llamaModelType(llm.hyperparameters.NumLayer)
}
func (llm *llamaModel) FileType() string {
return fileType(llm.hyperparameters.FileType)
}
func (llm *llamaModel) NumLayers() int64 {
return int64(llm.hyperparameters.NumLayer)
}
type llamaHyperparameters struct {
// NumVocab is the size of the model's vocabulary.
NumVocab uint32
// NumEmbd is the size of the model's embedding layer.
NumEmbd uint32
NumMult uint32
NumHead uint32
// NumLayer is the number of layers in the model.
NumLayer uint32
NumRot uint32
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
FileType uint32
}
type Running struct {
Port int
Cmd *exec.Cmd
Cancel context.CancelFunc
exitOnce sync.Once
exitCh chan error // channel to receive the exit status of the subprocess
*StatusWriter // captures error messages from the llama runner process
}
type ImageData struct { type ImageData struct {
Data []byte `json:"data"` Data []byte `json:"data"`
ID int `json:"id"` ID int `json:"id"`
} }
var payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama") var (
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
)
// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
ErrCh chan error
LastErrMsg string
}
func NewStatusWriter() *StatusWriter {
return &StatusWriter{
ErrCh: make(chan error, 1),
}
}
func (w *StatusWriter) Write(b []byte) (int, error) {
var errMsg string
if _, after, ok := bytes.Cut(b, []byte("error:")); ok {
errMsg = string(bytes.TrimSpace(after))
} else if _, after, ok := bytes.Cut(b, []byte("CUDA error")); ok {
errMsg = string(bytes.TrimSpace(after))
}
if errMsg != "" {
w.LastErrMsg = errMsg
w.ErrCh <- fmt.Errorf("llama runner: %s", errMsg)
}
return os.Stderr.Write(b)
}
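One plausible way `StatusWriter` gets used, sketched here with a placeholder command: attached to the subprocess's stderr, it forwards all output while surfacing recognizable error lines on `ErrCh`:

```go
// startRunnerWithStatus is a hypothetical illustration; the actual runner
// invocation in the repository differs.
func startRunnerWithStatus() error {
	w := NewStatusWriter()
	cmd := exec.Command("ollama-runner", "--model", "model.gguf") // placeholder
	cmd.Stderr = w // output still reaches os.Stderr via the passthrough in Write

	if err := cmd.Start(); err != nil {
		return err
	}
	select {
	case err := <-w.ErrCh:
		return fmt.Errorf("runner failed: %w", err)
	case <-time.After(time.Second):
		return nil // no early errors observed; assume startup succeeded
	}
}
```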
type prediction struct { type prediction struct {
Content string `json:"content"` Content string `json:"content"`
@@ -57,7 +161,9 @@ type prediction struct {
} }
} }
const maxBufferSize = 512 * format.KiloByte
const maxRetries = 3 const maxRetries = 3
const retryDelay = 1 * time.Second
type PredictOpts struct { type PredictOpts struct {
Prompt string Prompt string

View File

@@ -3,11 +3,14 @@ package llm
import ( import (
"context" "context"
"fmt" "fmt"
"log/slog" "log"
"os" "os"
"runtime" "runtime"
"github.com/pbnjay/memory"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/gpu" "github.com/jmorganca/ollama/gpu"
) )
@@ -19,6 +22,8 @@ type LLM interface {
Close() Close()
} }
var AvailableShims = map[string]string{}
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil { if _, err := os.Stat(model); err != nil {
return nil, err return nil, err
@@ -35,91 +40,40 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
return nil, err return nil, err
} }
if opts.NumCtx > int(ggml.NumCtx()) { if runtime.GOOS == "darwin" {
slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx())) var requiredMemory int64
opts.NumCtx = int(ggml.NumCtx()) var f16Multiplier int64 = 2
}
switch ggml.ModelType() {
if opts.NumCtx < 4 { case "3B", "7B":
opts.NumCtx = 4 requiredMemory = 8 * format.GigaByte
} case "13B":
requiredMemory = 16 * format.GigaByte
vram, _ := gpu.CheckVRAM() case "30B", "34B", "40B":
size := ggml.Size requiredMemory = 32 * format.GigaByte
case "47B":
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value requiredMemory = 48 * format.GigaByte
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead()) case "65B", "70B":
requiredMemory = 64 * format.GigaByte
// rough estimation for scratch space based on context size, batch size and number of layers in the model case "180B":
// TODO: instead call llama.cpp's alloc functions to measure required memory requiredMemory = 128 * format.GigaByte
// TODO: account for quantization levels f16Multiplier = 4
scratch := 8*int64(opts.NumCtx)*int64(opts.NumBatch)*int64(ggml.NumLayers()) + 1536*1024*1024 // 1536MiB overhead }
info := gpu.GetGPUInfo() systemMemory := int64(memory.TotalMemory())
switch runtime.GOOS {
case "darwin": if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
if opts.NumGPU == 0 { return nil, fmt.Errorf("F16 model requires at least %s of memory", format.HumanBytes(requiredMemory))
break } else if requiredMemory > systemMemory {
} return nil, fmt.Errorf("model requires at least %s of memory", format.HumanBytes(requiredMemory))
}
if size+kv+scratch > vram {
slog.Info("not enough vram available, falling back to CPU only")
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
opts.NumGPU = 0
break
}
opts.NumGPU = 1
default:
if info.Library == "cpu" {
slog.Info("GPU not available, falling back to CPU")
opts.NumGPU = 0
break
}
// don't use GPU at all if no layers are loaded
if opts.NumGPU == 0 {
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
break
}
// user-defined GPU count
if opts.NumGPU != -1 {
break
}
// the "main" GPU needs the most memory and determines the limit
// of how many layers can be loaded. It needs to fit:
// 1. the full compute graph allocation for all devices (graph)
// 2. the proportional kv cache for all devices (kv * % layers)
// 3. the proportional model (size * % layers / # devices)
// This estimates the number of layers
maxlayers := int64(ggml.NumLayers()) + 1
devices := int64(info.DeviceCount)
avg := vram / devices
layers := maxlayers * (avg - scratch) / (kv + size/devices)
if layers > maxlayers {
layers = maxlayers
}
// 1 + 2 must fit on the main gpu
min := scratch + kv*layers/maxlayers
if layers <= 0 || min > avg {
slog.Info("not enough vram available, falling back to CPU only")
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
opts.NumGPU = 0
break
}
opts.NumGPU = int(layers)
} }
opts.NumGQA = 0
opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0 opts.RopeFrequencyScale = 0.0
return newLlmServer(info, model, adapters, projectors, opts) gpuInfo := gpu.GetGPUInfo()
return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)
} }
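As a worked example of the fp16 k,v formula above, take the standard Llama 7B shape (n_layer=32, n_embd=4096, n_head=n_head_kv=32 — the usual 7B hyperparameters, not values read from this diff) at a 2048-token context:

// kv = 2 bytes (fp16) * 2 (key and value) * n_ctx * n_layer * n_embd * n_head_kv / n_head
kv := int64(2) * 2 * 2048 * 32 * 4096 * 32 / 32 // = 1_073_741_824 bytes = 1 GiB

So each 2048 tokens of context cost roughly another gigabyte on top of the model size and scratch estimate.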
// Give any native cgo implementations an opportunity to initialize // Give any native cgo implementations an opportunity to initialize
@@ -127,30 +81,16 @@ func Init(workdir string) error {
return nativeInit(workdir) return nativeInit(workdir)
} }
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) { func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
dynLibs := getDynLibs(gpuInfo) if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableDynLibs[demandLib]
if libPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
dynLibs = []string{libPath}
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil { if err == nil {
return srv, nil return srv, nil
} }
slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err)) log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)
err2 = err // TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
} }
return nil, err2 return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
} }
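The variant that reads OLLAMA_LLM_LIBRARY lets a user pin one runner library and skip auto-detection; the value must name an extracted variant (a key of availableDynLibs, e.g. "cpu", "cpu_avx2", or "rocm_v5"):

// e.g. launch with: OLLAMA_LLM_LIBRARY=cpu_avx2 ollama serve
os.Setenv("OLLAMA_LLM_LIBRARY", "cpu_avx2") // must match an available variant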

View File

@@ -1,283 +0,0 @@
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/jmorganca/ollama/gpu"
)
// Library names may contain an optional variant separated by '_'
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
// Any library without a variant is the lowest common denominator
var availableDynLibs = map[string]string{}
const pathComponentCount = 7
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
// Short circuit if we know we're using the default built-in (darwin only)
if gpuInfo.Library == "default" {
return []string{"default"}
}
// TODO - temporary until we have multiple CPU variations for Darwin
// Short circuit on darwin with metal only
if len(availableDynLibs) == 1 {
if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
return []string{availableDynLibs["metal"]}
}
}
exactMatch := ""
dynLibs := []string{}
altDynLibs := []string{}
requested := gpuInfo.Library
if gpuInfo.Variant != "" {
requested += "_" + gpuInfo.Variant
}
// Try to find an exact match
for cmp := range availableDynLibs {
if requested == cmp {
exactMatch = cmp
dynLibs = []string{availableDynLibs[cmp]}
break
}
}
// Then for GPUs load alternates and sort the list for consistent load ordering
if gpuInfo.Library != "cpu" {
for cmp := range availableDynLibs {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altDynLibs = append(altDynLibs, cmp)
}
}
slices.Sort(altDynLibs)
for _, altDynLib := range altDynLibs {
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
}
}
// Load up the best CPU variant if not primary requested
if gpuInfo.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableDynLibs {
if cmp == "cpu_"+variant {
dynLibs = append(dynLibs, availableDynLibs[cmp])
break
}
}
} else {
dynLibs = append(dynLibs, availableDynLibs["cpu"])
}
}
// Finally, if we didn't find any matches, LCD CPU FTW
if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]}
}
return dynLibs
}
func rocmDynLibPresent() bool {
for dynLibName := range availableDynLibs {
if strings.HasPrefix(dynLibName, "rocm") {
return true
}
}
return false
}
func nativeInit(workdir string) error {
slog.Info("Extracting dynamic libraries...")
if runtime.GOOS == "darwin" {
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
if err != nil {
if err == payloadMissing {
// TODO perhaps consider this a hard failure on arm macs?
slog.Info("ggml-meta.metal payload missing")
return nil
}
return err
}
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
}
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if err == payloadMissing {
slog.Info(fmt.Sprintf("%s", payloadMissing))
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
availableDynLibs[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(availableDynLibs))
i := 0
for variant := range availableDynLibs {
variants[i] = variant
i++
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
func extractDynamicLibs(workDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
libs := []string{}
// TODO consider making this idempotent with some sort of persistent directory (where we store models probably)
// and tracking by version so we don't reexpand the files every time
// Also maybe consider lazy loading only what is needed
g := new(errgroup.Group)
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
continue
}
file := file
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(targetDir, filepath.Base(filename))
if strings.Contains(destFile, "server") {
libs = append(libs, destFile)
}
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
}
return nil
})
}
return libs, g.Wait()
}
func extractPayloadFiles(workDir, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return payloadMissing
}
for _, file := range files {
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(workDir, filepath.Base(filename))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
}
}
return nil
}
func verifyDriverAccess() error {
if runtime.GOOS != "linux" {
return nil
}
// Only check ROCm access if we have the dynamic lib loaded
if rocmDynLibPresent() {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}

View File

@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS

View File

@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS

View File

@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/linux/*/*/lib/*.so*
var libEmbed embed.FS

View File

@@ -1,58 +0,0 @@
package llm
import (
"testing"
"github.com/jmorganca/ollama/gpu"
"github.com/stretchr/testify/assert"
)
func TestGetDynLibs(t *testing.T) {
availableDynLibs = map[string]string{
"cpu": "X_cpu",
}
assert.Equal(t, false, rocmDynLibPresent())
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
variant := gpu.GetCPUVariant()
if variant != "" {
variant = "_" + variant
}
availableDynLibs = map[string]string{
"rocm_v5": "X_rocm_v5",
"rocm_v6": "X_rocm_v6",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
res = getDynLibs(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableDynLibs = map[string]string{
"rocm": "X_rocm_v5",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 2)
assert.Equal(t, availableDynLibs["rocm"], res[0])
assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}

View File

@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
var libEmbed embed.FS

71
llm/shim_darwin.go Normal file
View File

@@ -0,0 +1,71 @@
package llm
import (
"embed"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"github.com/jmorganca/ollama/api"
)
//go:embed llama.cpp/gguf/ggml-metal.metal
var libEmbed embed.FS
func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
// should never happen...
return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
}
func nativeInit(workdir string) error {
err := extractPayloadFiles(workdir, "llama.cpp/gguf/ggml-metal.metal")
if err != nil {
if err == payloadMissing {
// TODO perhaps consider this a hard failure on arm macs?
log.Printf("ggml-meta.metal payload missing")
return nil
}
return err
}
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
return nil
}
func extractPayloadFiles(workDir, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return payloadMissing
}
for _, file := range files {
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(workDir, filepath.Base(file))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
}
}
return nil
}

191
llm/shim_ext_server.go Normal file
View File

@@ -0,0 +1,191 @@
//go:build !darwin
package llm
/*
#include <stdlib.h>
#include "dynamic_shim.h"
*/
import "C"
import (
"context"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"strings"
"sync"
"unsafe"
"github.com/jmorganca/ollama/api"
)
type shimExtServer struct {
s C.struct_dynamic_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
var shimMutex sync.Mutex
var llm *shimExtServer
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_init(llm.s, sparams, err)
}
func (llm *shimExtServer) llama_server_start() {
C.dynamic_shim_llama_server_start(llm.s)
}
func (llm *shimExtServer) llama_server_stop() {
C.dynamic_shim_llama_server_stop(llm.s)
}
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
}
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
}
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
}
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.dynamic_shim_llama_server_release_task_result(llm.s, result)
}
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
}
func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
shimMutex.Lock()
defer shimMutex.Unlock()
updatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dynamic_shim_init(libPath, &srv, &resp)
if resp.id < 0 {
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm = &shimExtServer{
s: srv,
options: opts,
}
log.Printf("Loading Dynamic Shim llm server: %s", library)
return newExtServer(llm, model, adapters, projectors, numLayers, opts)
}
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(ctx, llm, pred, fn)
}
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func (llm *shimExtServer) Close() {
close(llm)
}
func nativeInit(workdir string) error {
libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/*/*/lib/*")
if err != nil {
if err == payloadMissing {
log.Printf("%s", payloadMissing)
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
AvailableShims[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(AvailableShims))
i := 0
for variant := range AvailableShims {
variants[i] = variant
i++
}
log.Printf("Dynamic LLM variants %v", variants)
return nil
}
func extractDynamicLibs(workDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
libs := []string{}
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != 7 {
log.Printf("unexpected payload components: %v", pathComps)
continue
}
// llama.cpp/gguf/build/$OS/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(workDir, pathComps[4])
srcFile, err := libEmbed.Open(file)
if err != nil {
return nil, fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(targetDir, filepath.Base(file))
if strings.Contains(destFile, "server") {
libs = append(libs, destFile)
}
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return nil, fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return nil, fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return nil, fmt.Errorf("stat payload %s: %v", file, err)
}
}
return libs, nil
}
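The pathComps indexing above depends on the embedded layout named in the comment; for a concrete (illustrative) path:

file := "llama.cpp/gguf/build/linux/cpu_avx2/lib/libext_server.so"
pathComps := strings.Split(file, "/") // 7 components
// pathComps[4] == "cpu_avx2" — the $VARIANT segment — so each server
// library is extracted into its own <workDir>/cpu_avx2/ directory.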

View File

@@ -0,0 +1,46 @@
package llm
import (
"embed"
"errors"
"fmt"
"io/fs"
"log"
"os"
"strings"
)
//go:embed llama.cpp/gguf/build/*/*/lib/*.so
var libEmbed embed.FS
func updatePath(dir string) {
pathComponents := strings.Split(os.Getenv("PATH"), ":")
for _, comp := range pathComponents {
if comp == dir {
return
}
}
newPath := strings.Join(append(pathComponents, dir), ":")
log.Printf("Updating PATH to %s", newPath)
os.Setenv("PATH", newPath)
}
func verifyDriverAccess() error {
// Only check ROCm access if we have the dynamic lib loaded
if _, rocmPresent := AvailableShims["rocm"]; rocmPresent {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}

View File

@@ -0,0 +1,36 @@
package llm
import (
"embed"
"log"
"os"
"path/filepath"
"strings"
)
//go:embed llama.cpp/gguf/build/windows/*/lib/*.dll
var libEmbed embed.FS
func updatePath(dir string) {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
log.Printf("Updating PATH to %s", newPath)
os.Setenv("PATH", newPath)
}
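A pure-function sketch of the same prune-and-prepend behavior (hypothetical helper, separated from os.Setenv for illustration):

func prependPath(path, dir string) string {
	tmpDir := filepath.Dir(dir)
	parts := strings.Split(path, ";")
	kept := parts[:0]
	for _, comp := range parts {
		if strings.EqualFold(comp, dir) {
			return path // already present; leave PATH untouched
		}
		// drop stale entries under the same temp root (older variant dirs)
		if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
			kept = append(kept, comp)
		}
	}
	return strings.Join(append([]string{dir}, kept...), ";")
}

// prependPath(`C:\Windows;C:\Temp\ollama\cuda`, `C:\Temp\ollama\rocm`)
//   returns `C:\Temp\ollama\rocm;C:\Windows`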
func verifyDriverAccess() error {
// TODO if applicable
return nil
}

View File

@@ -6,7 +6,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"log/slog" "log"
) )
type Command struct { type Command struct {
@@ -59,7 +59,7 @@ func Parse(reader io.Reader) ([]Command, error) {
default: default:
if !bytes.HasPrefix(fields[0], []byte("#")) { if !bytes.HasPrefix(fields[0], []byte("#")) {
// log a warning for unknown commands // log a warning for unknown commands
slog.Warn(fmt.Sprintf("Unknown command: %s", fields[0])) log.Printf("WARNING: Unknown command: %s", fields[0])
} }
continue continue
} }

View File

@@ -1,63 +0,0 @@
package parser
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
func Test_Parser(t *testing.T) {
input := `
FROM model1
ADAPTER adapter1
LICENSE MIT
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE template1
`
reader := strings.NewReader(input)
commands, err := Parse(reader)
assert.Nil(t, err)
expectedCommands := []Command{
{Name: "model", Args: "model1"},
{Name: "adapter", Args: "adapter1"},
{Name: "license", Args: "MIT"},
{Name: "param1", Args: "value1"},
{Name: "param2", Args: "value2"},
{Name: "template", Args: "template1"},
}
assert.Equal(t, expectedCommands, commands)
}
func Test_Parser_NoFromLine(t *testing.T) {
input := `
PARAMETER param1 value1
PARAMETER param2 value2
`
reader := strings.NewReader(input)
_, err := Parse(reader)
assert.ErrorContains(t, err, "no FROM line")
}
func Test_Parser_MissingValue(t *testing.T) {
input := `
FROM foo
PARAMETER param1
`
reader := strings.NewReader(input)
_, err := Parse(reader)
assert.ErrorContains(t, err, "missing value for [param1]")
}

View File

@@ -77,7 +77,7 @@ func (p *Progress) Add(key string, state State) {
p.states = append(p.states, state) p.states = append(p.states, state)
} }
func (p *Progress) render() { func (p *Progress) render() error {
p.mu.Lock() p.mu.Lock()
defer p.mu.Unlock() defer p.mu.Unlock()
@@ -101,6 +101,8 @@ func (p *Progress) render() {
} }
p.pos = len(p.states) p.pos = len(p.states)
return nil
} }
func (p *Progress) start() { func (p *Progress) start() {

View File

@@ -25,7 +25,10 @@ func NewBuffer(prompt *Prompt) (*Buffer, error) {
return nil, err return nil, err
} }
lwidth := width - len(prompt.prompt()) lwidth := width - len(prompt.Prompt)
if prompt.UseAlt {
lwidth = width - len(prompt.AltPrompt)
}
b := &Buffer{ b := &Buffer{
Pos: 0, Pos: 0,
@@ -75,7 +78,7 @@ func (b *Buffer) MoveRight() {
if b.Pos < b.Size() { if b.Pos < b.Size() {
b.Pos += 1 b.Pos += 1
if b.Pos%b.LineWidth == 0 { if b.Pos%b.LineWidth == 0 {
fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt()))) fmt.Printf(CursorDown + CursorBOL + cursorRightN(b.PromptSize()))
} else { } else {
fmt.Print(CursorRight) fmt.Print(CursorRight)
} }
@@ -106,7 +109,7 @@ func (b *Buffer) MoveToStart() {
fmt.Print(CursorUp) fmt.Print(CursorUp)
} }
} }
fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt()))) fmt.Printf(CursorBOL + cursorRightN(b.PromptSize()))
b.Pos = 0 b.Pos = 0
} }
} }
@@ -120,7 +123,7 @@ func (b *Buffer) MoveToEnd() {
fmt.Print(CursorDown) fmt.Print(CursorDown)
} }
remainder := b.Size() % b.LineWidth remainder := b.Size() % b.LineWidth
fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())+remainder)) fmt.Printf(CursorBOL + cursorRightN(b.PromptSize()+remainder))
} else { } else {
fmt.Print(cursorRightN(b.Size() - b.Pos)) fmt.Print(cursorRightN(b.Size() - b.Pos))
} }
@@ -140,6 +143,13 @@ func min(n, m int) int {
return n return n
} }
func (b *Buffer) PromptSize() int {
if b.Prompt.UseAlt {
return len(b.Prompt.AltPrompt)
}
return len(b.Prompt.Prompt)
}
func (b *Buffer) Add(r rune) { func (b *Buffer) Add(r rune) {
if b.Pos == b.Buf.Size() { if b.Pos == b.Buf.Size() {
fmt.Printf("%c", r) fmt.Printf("%c", r)
@@ -222,7 +232,7 @@ func (b *Buffer) Remove() {
remainingLines := (b.Size() - b.Pos) / b.LineWidth remainingLines := (b.Size() - b.Pos) / b.LineWidth
fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL) fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL)
place := b.Pos % b.LineWidth place := b.Pos % b.LineWidth
fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.prompt()))) fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.Prompt)))
} }
} }
} }
@@ -237,7 +247,7 @@ func (b *Buffer) Delete() {
remainingLines := (b.Size() - b.Pos) / b.LineWidth remainingLines := (b.Size() - b.Pos) / b.LineWidth
fmt.Printf(cursorDownN(remainingLines) + CursorBOL + ClearToEOL) fmt.Printf(cursorDownN(remainingLines) + CursorBOL + ClearToEOL)
place := b.Pos % b.LineWidth place := b.Pos % b.LineWidth
fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.prompt()))) fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.Prompt)))
} }
} }
} }
@@ -284,15 +294,15 @@ func (b *Buffer) DeleteWord() {
} }
func (b *Buffer) ClearScreen() { func (b *Buffer) ClearScreen() {
fmt.Printf(ClearScreen + CursorReset + b.Prompt.prompt()) fmt.Printf(ClearScreen + CursorReset + b.Prompt.Prompt)
if b.IsEmpty() { if b.IsEmpty() {
ph := b.Prompt.placeholder() ph := b.Prompt.Placeholder
fmt.Printf(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault) fmt.Printf(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault)
} else { } else {
currPos := b.Pos currPos := b.Pos
b.Pos = 0 b.Pos = 0
b.drawRemaining() b.drawRemaining()
fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.prompt()))) fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.Prompt)))
if currPos > 0 { if currPos > 0 {
targetLine := currPos / b.LineWidth targetLine := currPos / b.LineWidth
if targetLine > 0 { if targetLine > 0 {
@@ -319,7 +329,7 @@ func (b *Buffer) IsEmpty() bool {
func (b *Buffer) Replace(r []rune) { func (b *Buffer) Replace(r []rune) {
b.Pos = 0 b.Pos = 0
b.Buf.Clear() b.Buf.Clear()
fmt.Printf(ClearLine + CursorBOL + b.Prompt.prompt()) fmt.Printf(ClearLine + CursorBOL + b.Prompt.Prompt)
for _, c := range r { for _, c := range r {
b.Add(c) b.Add(c)
} }

View File

@@ -23,7 +23,7 @@ type History struct {
func NewHistory() (*History, error) { func NewHistory() (*History, error) {
h := &History{ h := &History{
Buf: arraylist.New(), Buf: arraylist.New(),
Limit: 100, // resizeme Limit: 100, //resizeme
Autosave: true, Autosave: true,
Enabled: true, Enabled: true,
} }
@@ -49,7 +49,7 @@ func (h *History) Init() error {
h.Filename = path h.Filename = path
f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0o600) f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0600)
if err != nil { if err != nil {
if errors.Is(err, os.ErrNotExist) { if errors.Is(err, os.ErrNotExist) {
return nil return nil
@@ -84,7 +84,7 @@ func (h *History) Add(l []rune) {
h.Compact() h.Compact()
h.Pos = h.Size() h.Pos = h.Size()
if h.Autosave { if h.Autosave {
_ = h.Save() h.Save()
} }
} }
@@ -132,7 +132,7 @@ func (h *History) Save() error {
tmpFile := h.Filename + ".tmp" tmpFile := h.Filename + ".tmp"
f, err := os.OpenFile(tmpFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC|os.O_APPEND, 0o600) f, err := os.OpenFile(tmpFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC|os.O_APPEND, 0666)
if err != nil { if err != nil {
return err return err
} }

View File

@@ -16,20 +16,6 @@ type Prompt struct {
UseAlt bool UseAlt bool
} }
func (p *Prompt) prompt() string {
if p.UseAlt {
return p.AltPrompt
}
return p.Prompt
}
func (p *Prompt) placeholder() string {
if p.UseAlt {
return p.AltPlaceholder
}
return p.Placeholder
}
type Terminal struct { type Terminal struct {
outchan chan rune outchan chan rune
} }
@@ -60,9 +46,8 @@ func New(prompt Prompt) (*Instance, error) {
} }
func (i *Instance) Readline() (string, error) { func (i *Instance) Readline() (string, error) {
prompt := i.Prompt.prompt() prompt := i.Prompt.Prompt
if i.Pasting { if i.Prompt.UseAlt || i.Pasting {
// force alt prompt when pasting
prompt = i.Prompt.AltPrompt prompt = i.Prompt.AltPrompt
} }
fmt.Print(prompt) fmt.Print(prompt)
@@ -72,7 +57,6 @@ func (i *Instance) Readline() (string, error) {
if err != nil { if err != nil {
return "", err return "", err
} }
// nolint: errcheck
defer UnsetRawMode(fd, termios) defer UnsetRawMode(fd, termios)
buf, _ := NewBuffer(i.Prompt) buf, _ := NewBuffer(i.Prompt)
@@ -87,7 +71,10 @@ func (i *Instance) Readline() (string, error) {
// don't show placeholder when pasting unless we're in multiline mode // don't show placeholder when pasting unless we're in multiline mode
showPlaceholder := !i.Pasting || i.Prompt.UseAlt showPlaceholder := !i.Pasting || i.Prompt.UseAlt
if buf.IsEmpty() && showPlaceholder { if buf.IsEmpty() && showPlaceholder {
ph := i.Prompt.placeholder() ph := i.Prompt.Placeholder
if i.Prompt.UseAlt {
ph = i.Prompt.AltPlaceholder
}
fmt.Printf(ColorGrey + ph + fmt.Sprintf(CursorLeftN, len(ph)) + ColorDefault) fmt.Printf(ColorGrey + ph + fmt.Sprintf(CursorLeftN, len(ph)) + ColorDefault)
} }

View File

@@ -11,7 +11,7 @@ func handleCharCtrlZ(fd int, termios *Termios) (string, error) {
return "", err return "", err
} }
_ = syscall.Kill(0, syscall.SIGSTOP) syscall.Kill(0, syscall.SIGSTOP)
// on resume... // on resume...
return "", nil return "", nil

View File

@@ -1,6 +1,6 @@
#!/bin/sh #!/bin/sh
set -e set -eu
export VERSION=${VERSION:-0.0.0} export VERSION=${VERSION:-0.0.0}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'" export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
@@ -8,39 +8,24 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
mkdir -p dist mkdir -p dist
for TARGETARCH in arm64 amd64; do for TARGETARCH in arm64 amd64; do
rm -rf llm/llama.cpp/build
GOOS=darwin GOARCH=$TARGETARCH go generate ./... GOOS=darwin GOARCH=$TARGETARCH go generate ./...
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov rm -rf llm/llama.cpp/*/build
done done
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64 lipo -create -output dist/ollama dist/ollama-darwin-*
rm -f dist/ollama-darwin-arm64 dist/ollama-darwin-amd64 rm -f dist/ollama-darwin-*
if [ -n "$APPLE_IDENTITY" ]; then codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
else
echo "Skipping code signing - set APPLE_IDENTITY"
fi
chmod +x dist/ollama chmod +x dist/ollama
# build and optionally sign the mac app # build and sign the mac app
npm install --prefix app npm install --prefix app
if [ -n "$APPLE_IDENTITY" ]; then npm run --prefix app make:sign
npm run --prefix app make:sign
else
npm run --prefix app make
fi
cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
# sign the binary and rename it # sign the binary and rename it
if [ -n "$APPLE_IDENTITY" ]; then codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
else
echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
fi
ditto -c -k --keepParent dist/ollama dist/temp.zip ditto -c -k --keepParent dist/ollama dist/temp.zip
if [ -n "$APPLE_IDENTITY" ]; then xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
fi
mv dist/ollama dist/ollama-darwin mv dist/ollama dist/ollama-darwin
rm -f dist/temp.zip rm -f dist/temp.zip

View File

@@ -5,11 +5,10 @@ set -eu
export VERSION=${VERSION:-0.0.0} export VERSION=${VERSION:-0.0.0}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'" export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
mkdir -p dist mkdir -p dist
for TARGETARCH in ${BUILD_ARCH}; do for TARGETARCH in amd64 arm64; do
docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS -f Dockerfile.build -t builder:$TARGETARCH . docker build --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
docker rm builder-$TARGETARCH docker rm builder-$TARGETARCH

View File

@@ -66,7 +66,3 @@ subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...
print("Building") print("Building")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.']) subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])
print("Copying built result")
subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe", './dist/'])

View File

@@ -231,8 +231,8 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
case $OS_NAME in case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;; centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;; rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';; fedora) install_cuda_driver_yum $OS_NAME $OS_VERSION ;;
amzn) install_cuda_driver_yum 'fedora' '37' ;; amzn) install_cuda_driver_yum 'fedora' '35' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;; debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;; ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
*) exit ;; *) exit ;;

View File

@@ -1,43 +0,0 @@
#!/bin/sh
# Script for common Dockerfile dependency installation in redhat linux based images
set -ex
MACHINE=$(uname -m)
if grep -i "centos" /etc/system-release >/dev/null; then
# Centos 7 derivatives have too old of a git version to run our generate script
# uninstall and ignore failures
yum remove -y git
yum -y install epel-release centos-release-scl
yum -y install dnf
if [ "${MACHINE}" = "x86_64" ]; then
yum -y install https://repo.ius.io/ius-release-el7.rpm
dnf install -y git236
else
dnf install -y rh-git227-git
ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
fi
dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
elif grep -i "rocky" /etc/system-release >/dev/null; then
dnf install -y git gcc-toolset-10-gcc gcc-toolset-10-gcc-c++
else
echo "ERROR Unexpected distro"
exit 1
fi
if [ -n "${CMAKE_VERSION}" ]; then
curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
fi
if [ -n "${GOLANG_VERSION}" ]; then
if [ "${MACHINE}" = "x86_64" ]; then
GO_ARCH="amd64"
else
GO_ARCH="arm64"
fi
mkdir -p /usr/local
curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-${GO_ARCH}.tar.gz | tar xz -C /usr/local
ln -s /usr/local/go/bin/go /usr/local/bin/go
ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt
fi

View File

@@ -10,7 +10,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"log/slog" "log"
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
@@ -86,7 +86,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
rawKey, err := os.ReadFile(keyPath) rawKey, err := os.ReadFile(keyPath)
if err != nil { if err != nil {
slog.Info(fmt.Sprintf("Failed to load private key: %v", err)) log.Printf("Failed to load private key: %v", err)
return "", err return "", err
} }
@@ -105,7 +105,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
headers.Set("Authorization", sig) headers.Set("Authorization", sig)
resp, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil) resp, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil)
if err != nil { if err != nil {
slog.Info(fmt.Sprintf("couldn't get token: %q", err)) log.Printf("couldn't get token: %q", err)
return "", err return "", err
} }
defer resp.Body.Close() defer resp.Body.Close()

View File

@@ -6,7 +6,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"log/slog" "log"
"math" "math"
"net/http" "net/http"
"net/url" "net/url"
@@ -98,7 +98,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *R
b.Total, _ = strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64) b.Total, _ = strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)
size := b.Total / numDownloadParts var size = b.Total / numDownloadParts
switch { switch {
case size < minDownloadPartSize: case size < minDownloadPartSize:
size = minDownloadPartSize size = minDownloadPartSize
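A worked sketch of the part-size clamp above; the constant values here are assumptions for illustration (they are defined elsewhere in this file, not shown in this diff):

// Assumed for illustration: numDownloadParts = 64, minDownloadPartSize ≈ 100 MB.
// A 2 GB blob: 2e9 / 64 ≈ 31 MB per part, below the minimum, so size is
// clamped to ~100 MB and the download proceeds in ~20 parts instead of 64.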
@@ -120,7 +120,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *R
} }
} }
slog.Info(fmt.Sprintf("downloading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size))) log.Printf("downloading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size))
return nil return nil
} }
@@ -132,13 +132,13 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
defer blobDownloadManager.Delete(b.Digest) defer blobDownloadManager.Delete(b.Digest)
ctx, b.CancelFunc = context.WithCancel(ctx) ctx, b.CancelFunc = context.WithCancel(ctx)
file, err := os.OpenFile(b.Name+"-partial", os.O_CREATE|os.O_RDWR, 0o644) file, err := os.OpenFile(b.Name+"-partial", os.O_CREATE|os.O_RDWR, 0644)
if err != nil { if err != nil {
return err return err
} }
defer file.Close() defer file.Close()
_ = file.Truncate(b.Total) file.Truncate(b.Total)
g, inner := errgroup.WithContext(ctx) g, inner := errgroup.WithContext(ctx)
g.SetLimit(numDownloadParts) g.SetLimit(numDownloadParts)
@@ -159,7 +159,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
return err return err
case err != nil: case err != nil:
sleep := time.Second * time.Duration(math.Pow(2, float64(try))) sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
slog.Info(fmt.Sprintf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)) log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)
time.Sleep(sleep) time.Sleep(sleep)
continue continue
default: default:
@@ -246,7 +246,7 @@ func (b *blobDownload) readPart(partName string) (*blobDownloadPart, error) {
} }
func (b *blobDownload) writePart(partName string, part *blobDownloadPart) error { func (b *blobDownload) writePart(partName string, part *blobDownloadPart) error {
partFile, err := os.OpenFile(partName, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0o644) partFile, err := os.OpenFile(partName, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0644)
if err != nil { if err != nil {
return err return err
} }
@@ -340,7 +340,6 @@ func downloadBlob(ctx context.Context, opts downloadOpts) error {
return err return err
} }
// nolint: contextcheck
go download.Run(context.Background(), requestURL, opts.regOpts) go download.Run(context.Background(), requestURL, opts.regOpts)
} }

View File

@@ -10,7 +10,6 @@ import (
"fmt" "fmt"
"io" "io"
"log" "log"
"log/slog"
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
@@ -337,7 +336,7 @@ func GetModel(name string) (*Model, error) {
case "application/vnd.ollama.image.embed": case "application/vnd.ollama.image.embed":
// Deprecated in versions > 0.1.2 // Deprecated in versions > 0.1.2
// TODO: remove this warning in a future version // TODO: remove this warning in a future version
slog.Info("WARNING: model contains embeddings, but embeddings in modelfiles have been deprecated and will be ignored.") log.Print("WARNING: model contains embeddings, but embeddings in modelfiles have been deprecated and will be ignored.")
case "application/vnd.ollama.image.adapter": case "application/vnd.ollama.image.adapter":
model.AdapterPaths = append(model.AdapterPaths, filename) model.AdapterPaths = append(model.AdapterPaths, filename)
case "application/vnd.ollama.image.projector": case "application/vnd.ollama.image.projector":
@@ -428,7 +427,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
fromParams := make(map[string]any) fromParams := make(map[string]any)
for _, c := range commands { for _, c := range commands {
slog.Info(fmt.Sprintf("[%s] - %s", c.Name, c.Args)) log.Printf("[%s] - %s", c.Name, c.Args)
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
switch c.Name { switch c.Name {
@@ -479,6 +478,32 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
return err return err
} }
// if the model is not in gguf format, pull the base model to try and get it in gguf format
if fromConfig.ModelFormat != "gguf" {
fn(api.ProgressResponse{Status: "updating base model"})
parent, err := GetModel(c.Args)
if err != nil {
return err
}
originalModel := parent.OriginalModel
if originalModel == "" {
originalModel = parent.ShortName
}
if err := PullModel(ctx, originalModel, &RegistryOptions{}, fn); err != nil {
log.Printf("error pulling parent model: %v", err)
}
// Reset the file pointer to the beginning of the file
_, err = fromConfigFile.Seek(0, 0)
if err != nil {
return fmt.Errorf("update from config after pull: %w", err)
}
if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil {
return err
}
}
// if the model is still not in gguf format, error out // if the model is still not in gguf format, error out
if fromConfig.ModelFormat != "gguf" { if fromConfig.ModelFormat != "gguf" {
return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args) return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args)
@@ -748,7 +773,6 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{},
// save (i.e. delete from the deleteMap) any files used in other manifests // save (i.e. delete from the deleteMap) any files used in other manifests
manifest, _, err := GetManifest(fmp) manifest, _, err := GetManifest(fmp)
if err != nil { if err != nil {
// nolint: nilerr
return nil return nil
} }
@@ -768,16 +792,16 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{},
for k := range deleteMap { for k := range deleteMap {
fp, err := GetBlobsPath(k) fp, err := GetBlobsPath(k)
if err != nil { if err != nil {
slog.Info(fmt.Sprintf("couldn't get file path for '%s': %v", k, err)) log.Printf("couldn't get file path for '%s': %v", k, err)
continue continue
} }
if !dryRun { if !dryRun {
if err := os.Remove(fp); err != nil { if err := os.Remove(fp); err != nil {
slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err)) log.Printf("couldn't remove file '%s': %v", fp, err)
continue continue
} }
} else { } else {
slog.Info(fmt.Sprintf("wanted to remove: %s", fp)) log.Printf("wanted to remove: %s", fp)
} }
} }
@@ -793,7 +817,7 @@ func PruneLayers() error {
blobs, err := os.ReadDir(p) blobs, err := os.ReadDir(p)
if err != nil { if err != nil {
slog.Info(fmt.Sprintf("couldn't read dir '%s': %v", p, err)) log.Printf("couldn't read dir '%s': %v", p, err)
return err return err
} }
@@ -807,14 +831,14 @@ func PruneLayers() error {
} }
} }
slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap))) log.Printf("total blobs: %d", len(deleteMap))
err = deleteUnusedLayers(nil, deleteMap, false) err = deleteUnusedLayers(nil, deleteMap, false)
if err != nil { if err != nil {
return err return err
} }
slog.Info(fmt.Sprintf("total unused blobs removed: %d", len(deleteMap))) log.Printf("total unused blobs removed: %d", len(deleteMap))
return nil return nil
} }
@@ -876,7 +900,7 @@ func DeleteModel(name string) error {
} }
err = os.Remove(fp) err = os.Remove(fp)
if err != nil { if err != nil {
slog.Info(fmt.Sprintf("couldn't remove manifest file '%s': %v", fp, err)) log.Printf("couldn't remove manifest file '%s': %v", fp, err)
return err return err
} }
@@ -930,14 +954,14 @@ PARAMETER {{ $k }} {{ printf "%#v" $parameter }}
tmpl, err := template.New("").Parse(modelFile) tmpl, err := template.New("").Parse(modelFile)
if err != nil { if err != nil {
slog.Info(fmt.Sprintf("error parsing template: %q", err)) log.Printf("error parsing template: %q", err)
return "", err return "", err
} }
var buf bytes.Buffer var buf bytes.Buffer
if err = tmpl.Execute(&buf, mt); err != nil { if err = tmpl.Execute(&buf, mt); err != nil {
slog.Info(fmt.Sprintf("error executing template: %q", err)) log.Printf("error executing template: %q", err)
return "", err return "", err
} }
@@ -964,7 +988,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
for _, layer := range layers { for _, layer := range layers {
if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil { if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
slog.Info(fmt.Sprintf("error uploading blob: %v", err)) log.Printf("error uploading blob: %v", err)
if errors.Is(err, errUnauthorized) { if errors.Is(err, errUnauthorized) {
return fmt.Errorf("unable to push %s, make sure this namespace exists and you are authorized to push to it", ParseModelPath(name).GetNamespaceRepository()) return fmt.Errorf("unable to push %s, make sure this namespace exists and you are authorized to push to it", ParseModelPath(name).GetNamespaceRepository())
} }
@@ -1059,7 +1083,7 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
} }
if err := os.Remove(fp); err != nil { if err := os.Remove(fp); err != nil {
// log this, but return the original error // log this, but return the original error
slog.Info(fmt.Sprintf("couldn't remove file with digest mismatch '%s': %v", fp, err)) log.Printf("couldn't remove file with digest mismatch '%s': %v", fp, err)
} }
} }
return err return err
@@ -1083,7 +1107,7 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
err = os.WriteFile(fp, manifestJSON, 0o644) err = os.WriteFile(fp, manifestJSON, 0o644)
if err != nil { if err != nil {
slog.Info(fmt.Sprintf("couldn't write to %s", fp)) log.Printf("couldn't write to %s", fp)
return err return err
} }
@@ -1133,46 +1157,49 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
var errUnauthorized = fmt.Errorf("unauthorized") var errUnauthorized = fmt.Errorf("unauthorized")
func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *RegistryOptions) (*http.Response, error) { func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *RegistryOptions) (*http.Response, error) {
for i := 0; i < 2; i++ { resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts) if err != nil {
if err != nil { if !errors.Is(err, context.Canceled) {
if !errors.Is(err, context.Canceled) { log.Printf("request failed: %v", err)
slog.Info(fmt.Sprintf("request failed: %v", err))
}
return nil, err
} }
switch { return nil, err
case resp.StatusCode == http.StatusUnauthorized: }
// Handle authentication error with one retry
auth := resp.Header.Get("www-authenticate") switch {
authRedir := ParseAuthRedirectString(auth) case resp.StatusCode == http.StatusUnauthorized:
token, err := getAuthToken(ctx, authRedir) // Handle authentication error with one retry
auth := resp.Header.Get("www-authenticate")
authRedir := ParseAuthRedirectString(auth)
token, err := getAuthToken(ctx, authRedir)
if err != nil {
return nil, err
}
regOpts.Token = token
if body != nil {
_, err = body.Seek(0, io.SeekStart)
if err != nil { if err != nil {
return nil, err return nil, err
} }
regOpts.Token = token
if body != nil {
_, err = body.Seek(0, io.SeekStart)
if err != nil {
return nil, err
}
}
case resp.StatusCode == http.StatusNotFound:
return nil, os.ErrNotExist
case resp.StatusCode >= http.StatusBadRequest:
responseBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
}
return nil, fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
default:
return resp, nil
} }
resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
if resp.StatusCode == http.StatusUnauthorized {
return nil, errUnauthorized
}
return resp, err
case resp.StatusCode == http.StatusNotFound:
return nil, os.ErrNotExist
case resp.StatusCode >= http.StatusBadRequest:
responseBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
}
return nil, fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
} }
return nil, errUnauthorized return resp, nil
} }
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) { func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
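Both versions retry a 401 exactly once after refreshing the registry token and rewinding the request body; the loop form distills to this shape (names here are illustrative, not this package's API, apart from errUnauthorized defined above):

func doWithAuthRetry(do func() (*http.Response, error), refresh, rewind func() error) (*http.Response, error) {
	for i := 0; i < 2; i++ {
		resp, err := do()
		if err != nil {
			return nil, err
		}
		if resp.StatusCode != http.StatusUnauthorized {
			return resp, nil
		}
		resp.Body.Close()
		if err := refresh(); err != nil { // fetch a fresh token via www-authenticate
			return nil, err
		}
		if err := rewind(); err != nil { // body must re-seek to io.SeekStart
			return nil, err
		}
	}
	return nil, errUnauthorized
}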

View File

@@ -26,9 +26,9 @@ func WriteManifest(name string, config *Layer, layers []*Layer) error {
return err return err
} }
if err := os.MkdirAll(filepath.Dir(manifestPath), 0o755); err != nil { if err := os.MkdirAll(filepath.Dir(manifestPath), 0755); err != nil {
return err return err
} }
return os.WriteFile(manifestPath, b.Bytes(), 0o644) return os.WriteFile(manifestPath, b.Bytes(), 0644)
} }

View File

@@ -46,8 +46,7 @@ func ParseModelPath(name string) ModelPath {
name = after name = after
} }
name = strings.ReplaceAll(name, string(os.PathSeparator), "/") parts := strings.Split(name, string(os.PathSeparator))
parts := strings.Split(name, "/")
switch len(parts) { switch len(parts) {
case 3: case 3:
mp.Registry = parts[0] mp.Registry = parts[0]

View File

@@ -7,7 +7,7 @@ import (
"fmt" "fmt"
"io" "io"
"io/fs" "io/fs"
"log/slog" "log"
"net" "net"
"net/http" "net/http"
"os" "os"
@@ -15,6 +15,7 @@ import (
"path/filepath" "path/filepath"
"reflect" "reflect"
"runtime" "runtime"
"strconv"
"strings" "strings"
"sync" "sync"
"syscall" "syscall"
@@ -73,7 +74,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
if needLoad { if needLoad {
if loaded.runner != nil { if loaded.runner != nil {
slog.Info("changing loaded model") log.Println("changing loaded model")
loaded.runner.Close() loaded.runner.Close()
loaded.runner = nil loaded.runner = nil
loaded.Model = nil loaded.Model = nil
@@ -197,8 +198,7 @@ func GenerateHandler(c *gin.Context) {
c.JSON(http.StatusOK, api.GenerateResponse{ c.JSON(http.StatusOK, api.GenerateResponse{
CreatedAt: time.Now().UTC(), CreatedAt: time.Now().UTC(),
Model: req.Model, Model: req.Model,
Done: true, Done: true})
})
return return
} }
@@ -391,7 +391,7 @@ func EmbeddingHandler(c *gin.Context) {
	embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
	if err != nil {
-		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
+		log.Printf("embedding generation failed: %v", err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
		return
	}
@@ -414,13 +414,8 @@ func PullModelHandler(c *gin.Context) {
		return
	}

-	var model string
-	if req.Model != "" {
-		model = req.Model
-	} else if req.Name != "" {
-		model = req.Name
-	} else {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
+	if req.Name == "" {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
		return
	}
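The removed handler logic accepts the newer Model field and falls back to the legacy Name field; the added version requires Name alone. A compact sketch of that fallback, with the request struct trimmed down to the two fields the diff touches:

package main

import (
	"errors"
	"fmt"
)

type pullRequest struct {
	Model string // preferred field
	Name  string // legacy field, kept for older clients
}

// resolveModel prefers Model and falls back to Name, mirroring the handlers above.
func resolveModel(req pullRequest) (string, error) {
	switch {
	case req.Model != "":
		return req.Model, nil
	case req.Name != "":
		return req.Name, nil
	default:
		return "", errors.New("model is required")
	}
}

func main() {
	m, _ := resolveModel(pullRequest{Name: "llama2"})
	fmt.Println(m) // llama2, resolved via the legacy field
}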
@@ -438,7 +433,7 @@ func PullModelHandler(c *gin.Context) {
		ctx, cancel := context.WithCancel(c.Request.Context())
		defer cancel()

-		if err := PullModel(ctx, model, regOpts, fn); err != nil {
+		if err := PullModel(ctx, req.Name, regOpts, fn); err != nil {
			ch <- gin.H{"error": err.Error()}
		}
	}()
@@ -463,13 +458,8 @@ func PushModelHandler(c *gin.Context) {
		return
	}

-	var model string
-	if req.Model != "" {
-		model = req.Model
-	} else if req.Name != "" {
-		model = req.Name
-	} else {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
+	if req.Name == "" {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
		return
	}
@@ -487,7 +477,7 @@ func PushModelHandler(c *gin.Context) {
		ctx, cancel := context.WithCancel(c.Request.Context())
		defer cancel()

-		if err := PushModel(ctx, model, regOpts, fn); err != nil {
+		if err := PushModel(ctx, req.Name, regOpts, fn); err != nil {
			ch <- gin.H{"error": err.Error()}
		}
	}()
@@ -512,17 +502,12 @@ func CreateModelHandler(c *gin.Context) {
		return
	}

-	var model string
-	if req.Model != "" {
-		model = req.Model
-	} else if req.Name != "" {
-		model = req.Name
-	} else {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
+	if req.Name == "" {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
		return
	}

-	if err := ParseModelPath(model).Validate(); err != nil {
+	if err := ParseModelPath(req.Name).Validate(); err != nil {
		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		return
	}
@@ -560,7 +545,7 @@ func CreateModelHandler(c *gin.Context) {
		ctx, cancel := context.WithCancel(c.Request.Context())
		defer cancel()

-		if err := CreateModel(ctx, model, filepath.Dir(req.Path), commands, fn); err != nil {
+		if err := CreateModel(ctx, req.Name, filepath.Dir(req.Path), commands, fn); err != nil {
			ch <- gin.H{"error": err.Error()}
		}
	}()
@@ -585,19 +570,14 @@ func DeleteModelHandler(c *gin.Context) {
		return
	}

-	var model string
-	if req.Model != "" {
-		model = req.Model
-	} else if req.Name != "" {
-		model = req.Name
-	} else {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
+	if req.Name == "" {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
		return
	}

-	if err := DeleteModel(model); err != nil {
+	if err := DeleteModel(req.Name); err != nil {
		if os.IsNotExist(err) {
-			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", model)})
+			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
		} else {
			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		}
@@ -630,19 +610,15 @@ func ShowModelHandler(c *gin.Context) {
		return
	}

-	if req.Model != "" {
-		// noop
-	} else if req.Name != "" {
-		req.Model = req.Name
-	} else {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
+	if req.Name == "" {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
		return
	}

-	resp, err := GetModelInfo(req)
+	resp, err := GetModelInfo(req.Name)
	if err != nil {
		if os.IsNotExist(err) {
-			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
+			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
		} else {
			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		}
@@ -652,8 +628,8 @@ func ShowModelHandler(c *gin.Context) {
	c.JSON(http.StatusOK, resp)
}

-func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
-	model, err := GetModel(req.Model)
+func GetModelInfo(name string) (*api.ShowResponse, error) {
+	model, err := GetModel(name)
	if err != nil {
		return nil, err
	}
@@ -666,14 +642,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
		QuantizationLevel: model.Config.FileType,
	}

-	if req.System != "" {
-		model.System = req.System
-	}
-
-	if req.Template != "" {
-		model.Template = req.Template
-	}
-
	resp := &api.ShowResponse{
		License: strings.Join(model.License, "\n"),
		System:  model.System,
@@ -681,26 +649,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
		Details: modelDetails,
	}

-	var params []string
-	cs := 30
-	for k, v := range model.Options {
-		switch val := v.(type) {
-		case []interface{}:
-			for _, nv := range val {
-				params = append(params, fmt.Sprintf("%-*s %#v", cs, k, nv))
-			}
-		default:
-			params = append(params, fmt.Sprintf("%-*s %#v", cs, k, v))
-		}
-	}
-	resp.Parameters = strings.Join(params, "\n")
-
-	for k, v := range req.Options {
-		if _, ok := req.Options[k]; ok {
-			model.Options[k] = v
-		}
-	}
-
	mf, err := ShowModelfile(model)
	if err != nil {
		return nil, err
@@ -708,12 +656,41 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
	resp.Modelfile = mf

+	var params []string
+	cs := 30
+	for k, v := range model.Options {
+		switch val := v.(type) {
+		case string:
+			params = append(params, fmt.Sprintf("%-*s %s", cs, k, val))
+		case int:
+			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(val)))
+		case float64:
+			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(val, 'f', 0, 64)))
+		case bool:
+			params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(val)))
+		case []interface{}:
+			for _, nv := range val {
+				switch nval := nv.(type) {
+				case string:
+					params = append(params, fmt.Sprintf("%-*s %s", cs, k, nval))
+				case int:
+					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(nval)))
+				case float64:
+					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(nval, 'f', 0, 64)))
+				case bool:
+					params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(nval)))
+				}
+			}
+		}
+	}
+	resp.Parameters = strings.Join(params, "\n")
+
	return resp, nil
}

func ListModelsHandler(c *gin.Context) {
	models := make([]api.ModelResponse, 0)

-	manifestsPath, err := GetManifestPath()
+	fp, err := GetManifestPath()
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
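The two sides of GetModelInfo format parameters differently: the removed version prints every option with %#v, which quotes strings (stop "foo") and preserves float precision, while the added version goes through strconv, where FormatFloat with precision 0 rounds away the decimals. The difference is easy to demonstrate:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	fmt.Printf("%#v\n", "foo") // "foo" — strings come out quoted
	fmt.Printf("%#v\n", 0.9)   // 0.9  — floats keep their precision

	fmt.Println(strconv.FormatFloat(0.9, 'f', 0, 64))  // 1   — precision 0 rounds to an integer
	fmt.Println(strconv.FormatFloat(0.9, 'f', -1, 64)) // 0.9 — precision -1 keeps minimal digits
}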
@@ -734,7 +711,6 @@ func ListModelsHandler(c *gin.Context) {
		}

		return api.ModelResponse{
-			Model:  model.ShortName,
			Name:   model.ShortName,
			Size:   model.Size,
			Digest: model.Digest,
@@ -744,15 +720,13 @@ func ListModelsHandler(c *gin.Context) {
	walkFunc := func(path string, info os.FileInfo, _ error) error {
		if !info.IsDir() {
-			path, tag := filepath.Split(path)
-			model := strings.Trim(strings.TrimPrefix(path, manifestsPath), string(os.PathSeparator))
-			modelPath := strings.Join([]string{model, tag}, ":")
-			canonicalModelPath := strings.ReplaceAll(modelPath, string(os.PathSeparator), "/")
+			dir, file := filepath.Split(path)
+			dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
+			tag := strings.Join([]string{dir, file}, ":")

-			resp, err := modelResponse(canonicalModelPath)
+			resp, err := modelResponse(tag)
			if err != nil {
-				slog.Info(fmt.Sprintf("skipping file: %s", canonicalModelPath))
-				// nolint: nilerr
+				log.Printf("skipping file: %s", fp)
				return nil
			}
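Both sides of walkFunc turn a manifest file path into a model:tag string by splitting off the final path element; the removed version additionally canonicalizes separators to /. A worked example with an illustrative manifest root (the real root comes from GetManifestPath):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func main() {
	// Illustrative layout, not the actual on-disk location.
	manifestsPath := "/models/manifests"
	path := "/models/manifests/registry.ollama.ai/library/llama2/latest"

	dir, file := filepath.Split(path) // dir keeps a trailing separator, file == "latest"
	model := strings.Trim(strings.TrimPrefix(dir, manifestsPath), "/")
	tag := strings.Join([]string{model, file}, ":")

	fmt.Println(tag) // registry.ollama.ai/library/llama2:latest
}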
@@ -763,7 +737,7 @@ func ListModelsHandler(c *gin.Context) {
		return nil
	}

-	if err := filepath.Walk(manifestsPath, walkFunc); err != nil {
+	if err := filepath.Walk(fp, walkFunc); err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	}
@@ -863,7 +837,6 @@ func (s *Server) GenerateRoutes() http.Handler {
	config := cors.DefaultConfig()
	config.AllowWildcard = true
-	config.AllowBrowserExtensions = true

	config.AllowOrigins = origins
	for _, allowOrigin := range defaultAllowOrigins {
@@ -911,13 +884,6 @@ func Serve(ln net.Listener) error {
}

func Serve(ln net.Listener) error {
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
-		var programLevel = new(slog.LevelVar)
-		h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: programLevel, AddSource: true})
-		slog.SetDefault(slog.New(h))
-		programLevel.Set(slog.LevelDebug)
-		slog.Debug("Debug logging enabled")
-	}
	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
		// clean up unused layers and manifests
		if err := PruneLayers(); err != nil {
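The block removed from Serve enables structured debug logging when OLLAMA_DEBUG is set. A standalone sketch of the same log/slog pattern (Go 1.21+):

package main

import (
	"log/slog"
	"os"
)

func main() {
	if os.Getenv("OLLAMA_DEBUG") != "" {
		level := new(slog.LevelVar)
		h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: level, AddSource: true})
		slog.SetDefault(slog.New(h))
		level.Set(slog.LevelDebug)
	}

	slog.Debug("only visible when OLLAMA_DEBUG is set")
	slog.Info("always visible at the default level")
}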
@@ -940,7 +906,7 @@ func Serve(ln net.Listener) error {
	}

	r := s.GenerateRoutes()
-	slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
+	log.Printf("Listening on %s (version %s)", ln.Addr(), version.Version)
	srvr := &http.Server{
		Handler: r,
	}
@@ -963,7 +929,7 @@ func Serve(ln net.Listener) error {
	if runtime.GOOS == "linux" { // TODO - windows too
		// check compatibility to log warnings
		if _, err := gpu.CheckVRAM(); err != nil {
-			slog.Info(err.Error())
+			log.Print(err.Error())
		}
	}
@@ -1005,14 +971,14 @@ func streamResponse(c *gin.Context, ch chan any) {
		bts, err := json.Marshal(val)
		if err != nil {
-			slog.Info(fmt.Sprintf("streamResponse: json.Marshal failed with %s", err))
+			log.Printf("streamResponse: json.Marshal failed with %s", err)
			return false
		}

		// Delineate chunks with new-line delimiter
		bts = append(bts, '\n')
		if _, err := w.Write(bts); err != nil {
-			slog.Info(fmt.Sprintf("streamResponse: w.Write failed with %s", err))
+			log.Printf("streamResponse: w.Write failed with %s", err)
			return false
		}

View File

@@ -9,7 +9,6 @@ import (
	"net/http"
	"net/http/httptest"
	"os"
-	"sort"
	"strings"
	"testing"
@@ -51,7 +50,7 @@ func Test_Routes(t *testing.T) {
	createTestModel := func(t *testing.T, name string) {
		fname := createTestFile(t, "ollama-model")

-		modelfile := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
+		modelfile := strings.NewReader(fmt.Sprintf("FROM %s", fname))
		commands, err := parser.Parse(modelfile)
		assert.Nil(t, err)
		fn := func(resp api.ProgressResponse) {
@@ -168,42 +167,6 @@ func Test_Routes(t *testing.T) {
			assert.Equal(t, "beefsteak:latest", model.ShortName)
		},
	},
-	{
-		Name:   "Show Model Handler",
-		Method: http.MethodPost,
-		Path:   "/api/show",
-		Setup: func(t *testing.T, req *http.Request) {
-			createTestModel(t, "show-model")
-			showReq := api.ShowRequest{Model: "show-model"}
-			jsonData, err := json.Marshal(showReq)
-			assert.Nil(t, err)
-			req.Body = io.NopCloser(bytes.NewReader(jsonData))
-		},
-		Expected: func(t *testing.T, resp *http.Response) {
-			contentType := resp.Header.Get("Content-Type")
-			assert.Equal(t, contentType, "application/json; charset=utf-8")
-			body, err := io.ReadAll(resp.Body)
-			assert.Nil(t, err)
-
-			var showResp api.ShowResponse
-			err = json.Unmarshal(body, &showResp)
-			assert.Nil(t, err)
-
-			var params []string
-			paramsSplit := strings.Split(showResp.Parameters, "\n")
-			for _, p := range paramsSplit {
-				params = append(params, strings.Join(strings.Fields(p), " "))
-			}
-			sort.Strings(params)
-			expectedParams := []string{
-				"seed 42",
-				"stop \"bar\"",
-				"stop \"foo\"",
-				"top_p 0.9",
-			}
-			assert.Equal(t, expectedParams, params)
-		},
-	},
	}

	s, err := setupServer(t)
@@ -230,12 +193,13 @@ func Test_Routes(t *testing.T) {
		}

		resp, err := httpSrv.Client().Do(req)
-		assert.Nil(t, err)
		defer resp.Body.Close()
+		assert.Nil(t, err)

		if tc.Expected != nil {
			tc.Expected(t, resp)
		}
	}
}
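Ordering matters in this last hunk: checking err before deferring resp.Body.Close() is the safe sequence, because when the request fails the *http.Response is typically nil and the deferred Close would panic. The conventional pattern outside of tests:

package main

import (
	"fmt"
	"net/http"
)

func fetch(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		// Check the error first: resp is nil here, so deferring
		// resp.Body.Close() before this check would panic.
		return err
	}
	defer resp.Body.Close()

	fmt.Println("status:", resp.Status)
	return nil
}

func main() {
	if err := fetch("http://127.0.0.1:1/unreachable"); err != nil {
		fmt.Println("request failed:", err)
	}
}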

View File

@@ -7,7 +7,7 @@ import (
	"fmt"
	"hash"
	"io"
-	"log/slog"
+	"log"
	"math"
	"net/http"
	"net/url"
@@ -88,7 +88,7 @@ func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *RegistryOptions) error {
		return nil
	}

-	size := b.Total / numUploadParts
+	var size = b.Total / numUploadParts
	switch {
	case size < minUploadPartSize:
		size = minUploadPartSize
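Aside from the := versus var spelling, both sides compute the per-part upload size the same way: divide the blob size by the part count, then clamp to a floor and ceiling. Worked numbers with stand-in constants (the real numUploadParts, minUploadPartSize, and maxUploadPartSize are defined elsewhere in upload.go):

package main

import "fmt"

const (
	numUploadParts    = 64                 // assumed part count
	minUploadPartSize = 100 * 1000 * 1000  // assumed 100 MB floor
	maxUploadPartSize = 1000 * 1000 * 1000 // assumed 1000 MB ceiling
)

func partSize(total int64) int64 {
	size := total / numUploadParts
	switch {
	case size < minUploadPartSize:
		size = minUploadPartSize
	case size > maxUploadPartSize:
		size = maxUploadPartSize
	}
	return size
}

func main() {
	fmt.Println(partSize(4_000_000_000))   // 62.5 MB raw -> clamped up to 100000000
	fmt.Println(partSize(100_000_000_000)) // 1562.5 MB raw -> clamped down to 1000000000
}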
@@ -107,7 +107,7 @@ func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *RegistryOptions) error {
		offset += size
	}

-	slog.Info(fmt.Sprintf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
+	log.Printf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size))

	requestURL, err = url.Parse(location)
	if err != nil {
@@ -156,7 +156,7 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
				return err
			case err != nil:
				sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
-				slog.Info(fmt.Sprintf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep))
+				log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)
				time.Sleep(sleep)
				continue
			}
@@ -200,7 +200,7 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
			break
		} else if err != nil {
			sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
-			slog.Info(fmt.Sprintf("%s complete upload attempt %d failed: %v, retrying in %s", b.Digest[7:19], try, err, sleep))
+			log.Printf("%s complete upload attempt %d failed: %v, retrying in %s", b.Digest[7:19], try, err, sleep)
			time.Sleep(sleep)
			continue
		}
@@ -265,7 +265,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *url.URL,
		return err
	case err != nil:
		sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
-		slog.Info(fmt.Sprintf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep))
+		log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)
		time.Sleep(sleep)
		continue
	}
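Every retry site in this file sleeps for time.Second * 2^try, so successive attempts back off at 1s, 2s, 4s, and so on. A quick check of that arithmetic:

package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	for try := 0; try < 4; try++ {
		sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
		fmt.Printf("attempt %d: sleep %s\n", try, sleep) // 1s, 2s, 4s, 8s
	}
}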
@@ -395,7 +395,6 @@ func uploadBlob(ctx context.Context, mp ModelPath, layer *Layer, opts *RegistryOptions,
		return err
	}

-	// nolint: contextcheck
	go upload.Run(context.Background(), opts)
}
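The removed // nolint: contextcheck directive existed because upload.Run is deliberately started with context.Background() rather than the inbound request context, presumably so an upload can outlive the HTTP request that kicked it off. A minimal sketch of that detachment (the run function here is illustrative, not part of this codebase):

package main

import (
	"context"
	"fmt"
	"time"
)

// run stands in for a long-running upload that should not stop
// when the originating request's context is canceled.
func run(ctx context.Context) {
	select {
	case <-ctx.Done():
		fmt.Println("upload canceled:", ctx.Err())
	case <-time.After(50 * time.Millisecond):
		fmt.Println("upload finished")
	}
}

func main() {
	reqCtx, cancel := context.WithCancel(context.Background())
	cancel() // simulate the request ending immediately

	go run(reqCtx)               // tied to the request: prints "upload canceled"
	go run(context.Background()) // detached: prints "upload finished"

	time.Sleep(100 * time.Millisecond) // let both goroutines report
}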