get api models

2024-08-07 14:10:18 -07:00
89 changed files with 1403 additions and 3094 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,2 @@
 llm/ext_server/* linguist-vendored
-* text=auto
-*.go text eol=lf
+* text eol=lf
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -31,7 +31,7 @@ jobs:
          security set-keychain-settings -lut 3600 build.keychain
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: Build Darwin
        env:
@@ -87,7 +87,7 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - run: go get ./...
      - run: |
@@ -141,7 +141,7 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: 'Install ROCm'
        run: |
@@ -187,13 +187,6 @@ jobs:
  generate-windows-cuda:
    environment: release
    runs-on: windows
-    strategy:
-      matrix:
-        cuda:
-          - version: "11"
-            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
-          - version: "12"
-            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
@@ -225,13 +218,13 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
-      - name: 'Install CUDA ${{ matrix.cuda.version }}'
+      - name: 'Install CUDA'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
+          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
          write-host "Installing CUDA"
          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
          write-host "Completed CUDA"
@@ -263,16 +256,15 @@ jobs:
          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
      - uses: actions/upload-artifact@v4
        with:
-          name: generate-windows-cuda-${{ matrix.cuda.version }}
+          name: generate-windows-cuda
          path: |
            llm/build/**/bin/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
-          name: windows-cuda-deps-${{ matrix.cuda.version }}
+          name: windows-cuda-deps
          path: dist/deps/*

-
  # Import the prior generation steps and build the final windows assets
  build-windows:
    environment: release
@@ -314,7 +306,7 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - run: go get
      - uses: actions/download-artifact@v4
@@ -322,16 +314,10 @@ jobs:
          name: generate-windows-cpu
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-11
+          name: generate-windows-cuda
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-12
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-11
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-12
+          name: windows-cuda-deps
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
@@ -377,6 +363,7 @@ jobs:
      - run: |
          ./scripts/build_linux.sh
          ./scripts/build_docker.sh
+          mv dist/deps/* dist/
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-amd64
@@ -472,10 +459,7 @@ jobs:
          merge-multiple: true
      - run: |
          ls -lh dist/
-          (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
-          mv sha256sum.txt dist/
-          mv dist/linux-???64 .
-          mv dist/linux-amd64-rocm .
+          (cd dist; sha256sum * > sha256sum.txt)
          cat dist/sha256sum.txt
      - name: Create or update Release
        run: |
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -63,7 +63,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - run: go get ./...
      - run: |
@@ -163,7 +163,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: 'Install ROCm'
        run: |
@@ -200,7 +200,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: 'Install CUDA'
        run: |
@@ -255,7 +255,7 @@ jobs:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: false
      - run: |
          case ${{ matrix.arch }} in
@@ -297,7 +297,7 @@ jobs:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - run: |
          case ${{ matrix.arch }} in
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -24,6 +24,7 @@ linters:
    - nosprintfhostport
    - staticcheck
    - tenv
+    - testifylint
    - unconvert
    - unused
    - usestdlibvars
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,37 +0,0 @@
-# Contributing to Ollama
-
-Thank you for your interest in contributing to Ollama! Here are a few guidelines to help get you started.
-
-## Set up
-
-See the [development documentation](./docs/development.md) for instructions on how to build and run Ollama locally.
-
-## Pull requests
-
-### Ideal issues
-
-* [Bugs](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Abug): issues where Ollama stops working or where it results in an unexpected error.
-* [Performance](https://github.com/ollama/ollama/issues?q=is%3Aissue+is%3Aopen+label%3Aperformance): issues to make Ollama faster at model inference, downloading or uploading.
-* [Security](https://github.com/ollama/ollama/blob/main/SECURITY.md): issues that could lead to a security vulnerability. As mentioned in [SECURITY.md](https://github.com/ollama/ollama/blob/main/SECURITY.md), please do not disclose security vulnerabilities publicly.
-
-### Issues that are harder to review
-
-* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
-* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
-* Documentation: small updates to fill in or dorrect missing documentation is helpful, however large documentation additions can be hard to maintain over time.
-
-### Issues that may not be accepted
-
-* Changes that break backwards compatibility in Ollama's API (including the OpenAI-compatible API)
-* Changes that add significant friction to the user experience
-* Changes that create a large future maintenance burden for maintainers and contributors
-
-### Best practices
-
-* Commit messages: please leave both a title and a description in your commit messages. The title should be a short summary of the changes, with a leading word that explains the section of the code being changed (e.g. `api: fix parsing of prompt field`) . In the description, leave a short 2-3 sentences that explain more about the change and its impact.
-* Tests: please add test coverage to changes where possible.
-* Minimize dependencies: avoid adding new dependencies unless absolutely necessary.
-
-## Need help?
-
-If you need help with anything, feel free to reach out to us on our [Discord server](https://discord.gg/ollama).
--- a/131
+++ b/131
@@ -1,9 +1,7 @@
 ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
-ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-ARG CUDA_VERSION_12=12.4.0
-ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
+# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
+ARG CUDA_VERSION=11.3.1
 ARG ROCM_VERSION=6.1.2

 # Copy the minimal context we need to run the generate scripts
@@ -12,7 +10,7 @@ COPY .git .git
 COPY .gitmodules .gitmodules
 COPY llm llm

-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -20,34 +18,9 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH amd64 
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
-    CUDA_VARIANT="_v11" \
-    bash gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH amd64 
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
-    CUDA_VARIANT="_v12" \
-    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
-    bash gen_linux.sh
-
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -55,32 +28,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
-ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH arm64 
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
-    CUDA_VARIANT="_v11" \
-    bash gen_linux.sh
-
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH arm64 
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
-    CUDA_VARIANT="_v12" \
-    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
-    bash gen_linux.sh
-
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
@@ -92,11 +40,15 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
-ENV GOARCH amd64 
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
-RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
-    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - )
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN mkdir /tmp/scratch && \
+    for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
+        cp ${dep} /tmp/scratch/ || exit 1 ; \
+    done && \
+    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
+    mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
+    (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
+

 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
 ARG CMAKE_VERSION
@@ -107,21 +59,16 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
-ENV GOARCH amd64 
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
+RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh

 FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
 ARG CMAKE_VERSION
@@ -132,15 +79,12 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
-ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
+RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh


 # Intermediate stage used for ./scripts/build_linux.sh
@@ -151,16 +95,12 @@ COPY . .
 COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
+RUN go build -trimpath .

 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -169,36 +109,23 @@ ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-
-# Strip out ROCm dependencies to keep the primary image lean
-FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
-RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* 
+RUN go build -trimpath .

 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
-COPY --from=amd64-libs-without-rocm /scratch/ /lib/
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
 FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama

 # Radeon images are much larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
 RUN update-pciids
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-RUN ln -s /opt/rocm/lib /lib/ollama
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0

--- a/README.md
+++ b/README.md
@@ -325,7 +325,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [tlm](https://github.com/yusufcanb/tlm)
 - [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 - [gollama](https://github.com/sammcj/gollama)
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)

 ### Database

--- a/api/client.go
+++ b/api/client.go
@@ -298,7 +298,7 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	return &lr, nil
 }

-// ListRunning lists running models.
+// ListRunning lists running models.
 func (c *Client) ListRunning(ctx context.Context) (*ProcessResponse, error) {
 	var lr ProcessResponse
 	if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil {
@@ -333,7 +333,7 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err
 	return &resp, nil
 }

-// Heartbeat checks if the server has started and is responsive; if yes, it
+// Heartbeat checks if the server has started and is responsive; if yes, it
 // returns nil, otherwise an error.
 func (c *Client) Heartbeat(ctx context.Context) error {
 	if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -87,11 +87,20 @@ DialogFontSize=12

 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
-Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
+Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs
+#if DirExists("..\dist\windows-amd64\cuda")
+  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\oneapi")
+  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\rocm")
+  Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
+#endif
+

 [Icons]
 Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
@@ -99,7 +108,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
 Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"

 [Run]
-Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
+Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden

 [UninstallRun]
 ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
@@ -134,8 +143,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi

 [Registry]
 Root: HKCU; Subkey: "Environment"; \
-    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
-    Check: NeedsAddPath('{app}\bin')
+    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
+    Check: NeedsAddPath('{app}')

 [Code]

--- a/app/tray/wintray/menus.go
+++ b/app/tray/wintray/menus.go
@@ -11,12 +11,12 @@ import (
 )

 const (
-	updateAvailableMenuID = 1
-	updateMenuID          = updateAvailableMenuID + 1
-	separatorMenuID       = updateMenuID + 1
-	diagLogsMenuID        = separatorMenuID + 1
-	diagSeparatorMenuID   = diagLogsMenuID + 1
-	quitMenuID            = diagSeparatorMenuID + 1
+	updatAvailableMenuID = 1
+	updateMenuID         = updatAvailableMenuID + 1
+	separatorMenuID      = updateMenuID + 1
+	diagLogsMenuID       = separatorMenuID + 1
+	diagSeparatorMenuID  = diagLogsMenuID + 1
+	quitMenuID           = diagSeparatorMenuID + 1
 )

 func (t *winTray) initMenus() error {
@@ -35,7 +35,7 @@ func (t *winTray) initMenus() error {
 func (t *winTray) UpdateAvailable(ver string) error {
 	if !t.updateNotified {
 		slog.Debug("updating menu and sending notification for new update")
-		if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
+		if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
 		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@@ -11,7 +11,6 @@ import (
 	"path/filepath"
 	"sort"
 	"sync"
-	"syscall"
 	"unsafe"

 	"golang.org/x/sys/windows"
@@ -434,12 +433,7 @@ func (t *winTray) setIcon(src string) error {
 	t.muNID.Lock()
 	defer t.muNID.Unlock()
 	t.nid.Icon = h
-	t.nid.Flags |= NIF_ICON | NIF_TIP
-	if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
-		copy(t.nid.Tip[:], toolTipUTF16)
-	} else {
-		return err
-	}
+	t.nid.Flags |= NIF_ICON
 	t.nid.Size = uint32(unsafe.Sizeof(*t.nid))

 	return t.nid.modify()
--- a/app/tray/wintray/w32api.go
+++ b/app/tray/wintray/w32api.go
@@ -61,7 +61,6 @@ const (
 	MIIM_SUBMENU        = 0x00000004
 	MIM_APPLYTOSUBMENUS = 0x80000000
 	NIF_ICON            = 0x00000002
-	NIF_TIP             = 0x00000004
 	NIF_INFO            = 0x00000010
 	NIF_MESSAGE         = 0x00000001
 	SW_HIDE             = 0
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -22,7 +22,6 @@ import (
 	"runtime"
 	"slices"
 	"strings"
-	"sync/atomic"
 	"syscall"
 	"time"

@@ -79,7 +78,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	status := "transferring model data"
 	spinner := progress.NewSpinner(status)
 	p.Add(status, spinner)
-	defer p.Stop()

 	for i := range modelfile.Commands {
 		switch modelfile.Commands[i].Name {
@@ -114,7 +112,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 				path = tempfile
 			}

-			digest, err := createBlob(cmd, client, path, spinner)
+			digest, err := createBlob(cmd, client, path)
 			if err != nil {
 				return err
 			}
@@ -204,12 +202,6 @@ func tempZipFiles(path string) (string, error) {
 		// safetensors files might be unresolved git lfs references; skip if they are
 		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
 		files = append(files, st...)
-	} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
-		// covers adapters.safetensors
-		files = append(files, st...)
-	} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
-		// covers adapter_model.safetensors
-		files = append(files, st...)
 	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
@@ -229,14 +221,6 @@ func tempZipFiles(path string) (string, error) {
 	}
 	files = append(files, js...)

-	// bert models require a nested config.json
-	// TODO(mxyng): merge this with the glob above
-	js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
-	if err != nil {
-		return "", err
-	}
-	files = append(files, js...)
-
 	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
 		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
 		// tokenizer.model might be a unresolved git lfs reference; error if it is
@@ -266,11 +250,6 @@ func tempZipFiles(path string) (string, error) {
 			return "", err
 		}

-		zfi.Name, err = filepath.Rel(path, file)
-		if err != nil {
-			return "", err
-		}
-
 		zf, err := zipfile.CreateHeader(zfi)
 		if err != nil {
 			return "", err
@@ -284,20 +263,13 @@ func tempZipFiles(path string) (string, error) {
 	return tempfile.Name(), nil
 }

-func createBlob(cmd *cobra.Command, client *api.Client, path string, spinner *progress.Spinner) (string, error) {
+func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
 	bin, err := os.Open(path)
 	if err != nil {
 		return "", err
 	}
 	defer bin.Close()

-	// Get file info to retrieve the size
-	fileInfo, err := bin.Stat()
-	if err != nil {
-		return "", err
-	}
-	fileSize := fileInfo.Size()
-
 	hash := sha256.New()
 	if _, err := io.Copy(hash, bin); err != nil {
 		return "", err
@@ -307,43 +279,13 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string, spinner *pr
 		return "", err
 	}

-	var pw progressWriter
-	status := "transferring model data 0%"
-	spinner.SetMessage(status)
-
-	done := make(chan struct{})
-	defer close(done)
-
-	go func() {
-		ticker := time.NewTicker(60 * time.Millisecond)
-		defer ticker.Stop()
-		for {
-			select {
-			case <-ticker.C:
-				spinner.SetMessage(fmt.Sprintf("transferring model data %d%%", int(100*pw.n.Load()/fileSize)))
-			case <-done:
-				spinner.SetMessage("transferring model data 100%")
-				return
-			}
-		}
-	}()
-
 	digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
-	if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
+	if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
 		return "", err
 	}
 	return digest, nil
 }

-type progressWriter struct {
-	n atomic.Int64
-}
-
-func (w *progressWriter) Write(p []byte) (n int, err error) {
-	w.n.Add(int64(len(p)))
-	return len(p), nil
-}
-
 func RunHandler(cmd *cobra.Command, args []string) error {
 	interactive := true

@@ -1144,7 +1086,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	return nil
 }

-func RunServer(_ *cobra.Command, _ []string) error {
+func RunServer(cmd *cobra.Command, _ []string) error {
 	if err := initializeKeypair(); err != nil {
 		return err
 	}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -7,27 +7,16 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
-	"strings"

 	"github.com/ollama/ollama/llm"
 )

-type ModelParameters struct {
+type Parameters struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     uint32   `json:"vocab_size"`
 }

-type AdapterParameters struct {
-	Alpha          uint32 `json:"lora_alpha"`
-	LoraLayers     uint32 `json:"lora_layers"`
-	LoraParameters struct {
-		Rank  uint32  `json:"rank"`
-		Alpha float32 `json:"alpha"`
-		Scale float32 `json:"scale"`
-	} `json:"lora_parameters"`
-}
-
-func (ModelParameters) KV(t *Tokenizer) llm.KV {
+func (Parameters) KV(t *Tokenizer) llm.KV {
 	kv := llm.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
@@ -38,10 +27,6 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
 		"tokenizer.ggml.token_type":    t.Vocabulary.Types,
 	}

-	if len(t.Merges) > 0 {
-		kv["tokenizer.ggml.merges"] = t.Merges
-	}
-
 	if t.Template != "" {
 		kv["tokenizer.chat_template"] = t.Template
 	}
@@ -54,119 +39,40 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p AdapterParameters) KV() llm.KV {
-	var alpha float32
-	if p.LoraParameters.Alpha == 0 {
-		alpha = float32(p.Alpha)
-	} else {
-		alpha = p.LoraParameters.Alpha
-	}
-
-	kv := llm.KV{
-		"adapter.lora.alpha": alpha,
-		"adapter.type":       "lora",
-		"general.file_type":  uint32(1),
-		"general.type":       "adapter",
-		"general.version":    "v0.2",
-	}
-
-	return kv
-}
-
-func (ModelParameters) specialTokenTypes() []string {
+func (Parameters) specialTokenTypes() []string {
 	return []string{
 		"bos", "eos", "unk", "sep", "pad", "cls", "mask",
 	}
 }

-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
 	return llm.WriteGGUF(ws, kv, ts)
 }

-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
-}
-
-type ModelConverter interface {
+type Converter interface {
 	// KV maps parameters to LLM key-values
 	KV(*Tokenizer) llm.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
 	Tensors([]Tensor) []llm.Tensor
-	// Replacements returns a list of string pairs to replace in tensor names.
-	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
-	Replacements() []string

+	// tensorName returns the LLM tensor name for a specific input name
+	tensorName(string) string
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
-	// writeFile writes the model to the provided io.WriteSeeker
 	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

-type moreParser interface {
-	parseMore(fs.FS) error
-}
-
-type AdapterConverter interface {
-	// KV maps parameters to LLM key-values
-	KV(llm.KV) llm.KV
-	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
-	// Replacements returns a list of string pairs to replace in tensor names.
-	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
-	Replacements() []string
-
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
-}
-
-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
-	bts, err := fs.ReadFile(fsys, "adapter_config.json")
-	if err != nil {
-		return err
-	}
-
-	var p AdapterParameters
-	if err := json.Unmarshal(bts, &p); err != nil {
-		return err
-	}
-
-	arch, ok := baseKV["general.architecture"]
-	if !ok {
-		return errors.New("architecture not set for the base model")
-	}
-
-	var conv AdapterConverter
-	switch arch {
-	case "llama":
-		conv = &llamaAdapter{}
-	case "gemma2":
-		conv = &gemma2Adapter{}
-	default:
-		return errors.New("unsupported architecture")
-	}
-
-	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
-	if err != nil {
-		return err
-	}
-
-	if err := json.Unmarshal(bts, conv); err != nil {
-		return err
-	}
-
-	return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
-}
-
 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
+func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
 		return err
 	}

-	var p ModelParameters
+	var p Parameters
 	if err := json.Unmarshal(bts, &p); err != nil {
 		return err
 	}
@@ -175,20 +81,14 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		return errors.New("unknown architecture")
 	}

-	var conv ModelConverter
+	var conv Converter
 	switch p.Architectures[0] {
 	case "LlamaForCausalLM", "MistralForCausalLM":
-		conv = &llamaModel{}
+		conv = &llama{}
 	case "MixtralForCausalLM":
-		conv = &mixtralModel{}
+		conv = &mixtral{}
 	case "GemmaForCausalLM":
-		conv = &gemmaModel{}
-	case "Gemma2ForCausalLM":
-		conv = &gemma2Model{}
-	case "Phi3ForCausalLM":
-		conv = &phi3Model{}
-	case "BertModel":
-		conv = &bertModel{}
+		conv = &gemma{}
 	default:
 		return errors.New("unsupported architecture")
 	}
@@ -197,12 +97,6 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		return err
 	}

-	if t, ok := conv.(moreParser); ok {
-		if err := t.parseMore(fsys); err != nil {
-			return err
-		}
-	}
-
 	t, err := parseTokenizer(fsys, conv.specialTokenTypes())
 	if err != nil {
 		return err
@@ -219,7 +113,7 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}

-	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
+	ts, err := parseTensors(fsys)
 	if err != nil {
 		return err
 	}
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -1,174 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"encoding/json"
-	"io/fs"
-	"path/filepath"
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type bertModel struct {
-	ModelParameters
-	NLayers               uint32  `json:"n_layers"`
-	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
-	NLayer                uint32  `json:"n_layer"`
-	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
-	NCtx                  uint32  `json:"n_ctx"`
-	HiddenSize            uint32  `json:"hidden_size"`
-	NEmbd                 uint32  `json:"n_embd"`
-	IntermediateSize      uint32  `json:"intermediate_size"`
-	NInner                uint32  `json:"n_inner"`
-	NumAttentionHeads     uint32  `json:"num_attention_heads"`
-	NHead                 uint32  `json:"n_head"`
-	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
-	LayerNormEPS          float32 `json:"layer_norm_eps"`
-	LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
-	NormEpsilon           float32 `json:"norm_epsilon"`
-
-	PoolingType uint32
-}
-
-var (
-	_ ModelConverter = (*bertModel)(nil)
-	_ moreParser     = (*bertModel)(nil)
-)
-
-func (p *bertModel) parseMore(fsys fs.FS) error {
-	bts, err := fs.ReadFile(fsys, "modules.json")
-	if err != nil {
-		return err
-	}
-
-	var modules []struct {
-		Type string `json:"type"`
-		Path string `json:"path"`
-	}
-
-	if err := json.Unmarshal(bts, &modules); err != nil {
-		return err
-	}
-
-	var pooling string
-	for _, m := range modules {
-		if m.Type == "sentence_transformers.models.Pooling" {
-			pooling = m.Path
-			break
-		}
-	}
-
-	if pooling != "" {
-		bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
-		if err != nil {
-			return err
-		}
-
-		var pc struct {
-			PoolingModeCLSToken   bool `json:"pooling_mode_cls_token"`
-			PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
-		}
-
-		if err := json.Unmarshal(bts, &pc); err != nil {
-			return err
-		}
-
-		if pc.PoolingModeMeanTokens {
-			p.PoolingType = 1
-		} else if pc.PoolingModeCLSToken {
-			p.PoolingType = 2
-		}
-	}
-
-	return nil
-}
-
-func (p *bertModel) KV(t *Tokenizer) llm.KV {
-	kv := p.ModelParameters.KV(t)
-	kv["general.architecture"] = "bert"
-	kv["bert.attention.causal"] = false
-	kv["bert.pooling_type"] = p.PoolingType
-
-	kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
-
-	if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 {
-		kv["bert.context_length"] = contextLength
-	}
-
-	if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 {
-		kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
-	}
-
-	if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 {
-		kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner)
-	}
-
-	if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 {
-		kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead)
-	}
-
-	if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 {
-		kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon
-	}
-
-	kv["tokenizer.ggml.model"] = "bert"
-	kv["tokenizer.ggml.token_type_count"] = uint32(2)
-
-	// convert to phantom space tokens
-	for i, e := range t.Tokens {
-		if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") {
-			// noop
-		} else if strings.HasPrefix(e, "##") {
-			t.Tokens[i] = e[2:]
-		} else {
-			t.Tokens[i] = "\u2581" + e
-		}
-	}
-
-	kv["tokenizer.ggml.tokens"] = t.Tokens
-
-	return kv
-}
-
-func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
-	for _, t := range ts {
-		if slices.Contains([]string{
-			"embeddings.position_ids",
-			"pooler.dense.weight",
-			"pooler.dense.bias",
-		}, t.Name()) {
-			continue
-		}
-
-		out = append(out, llm.Tensor{
-			Name:     t.Name(),
-			Kind:     t.Kind(),
-			Shape:    t.Shape(),
-			WriterTo: t,
-		})
-	}
-
-	return out
-}
-
-func (bertModel) Replacements() []string {
-	return []string{
-		"encoder.layer", "blk",
-		"encoder.layers", "blk",
-		"embeddings.word_embeddings", "token_embd",
-		"embeddings.token_type_embeddings", "token_types",
-		"embeddings.LayerNorm", "token_embd_norm",
-		"embeddings.position_embeddings", "position_embd",
-		"attention.self.query", "attn_q",
-		"attention.self.key", "attn_k",
-		"attention.self.value", "attn_v",
-		"attention.output.dense", "attn_output",
-		"attention.output.LayerNorm", "attn_output_norm",
-		"intermediate.dense", "ffn_up",
-		"output.dense", "ffn_down",
-		"output.LayerNorm", "layer_output_norm",
-	}
-}
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -9,8 +9,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type gemmaModel struct {
-	ModelParameters
+type gemma struct {
+	Parameters
 	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
 	HiddenSize            uint32  `json:"hidden_size"`
 	HiddenLayers          uint32  `json:"num_hidden_layers"`
@@ -21,11 +21,12 @@ type gemmaModel struct {
 	HeadDim               uint32  `json:"head_dim"`
 }

-var _ ModelConverter = (*gemmaModel)(nil)
+var _ Converter = (*gemma)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
-	kv := p.ModelParameters.KV(t)
+func (p *gemma) KV(t *Tokenizer) llm.KV {
+	kv := p.Parameters.KV(t)
 	kv["general.architecture"] = "gemma"
+	kv["general.name"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
 	kv["gemma.embedding_length"] = p.HiddenSize
 	kv["gemma.block_count"] = p.HiddenLayers
@@ -42,15 +43,16 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "_norm.weight") {
+		name := p.tensorName(t.Name())
+		if strings.HasSuffix(name, "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

 		out = append(out, llm.Tensor{
-			Name:     t.Name(),
+			Name:     name,
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@@ -60,8 +62,8 @@ func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *gemmaModel) Replacements() []string {
-	return []string{
+func (p *gemma) tensorName(n string) string {
+	return strings.NewReplacer(
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
 		"model.layers", "blk",
@@ -74,10 +76,11 @@ func (p *gemmaModel) Replacements() []string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-	}
+		"block_sparse_moe.gate", "ffn_inp",
+	).Replace(n)
 }

-func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
+func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
 	n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
 	ones := tensor.Ones(tensor.Float32, int(shape[0]))

--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@@ -1,43 +0,0 @@
-package convert
-
-import (
-	"github.com/ollama/ollama/llm"
-)
-
-type gemma2Model struct {
-	gemmaModel
-	SlidingWindow         uint32  `json:"sliding_window"`
-	AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
-	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
-}
-
-func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
-	kv := p.ModelParameters.KV(t)
-	kv["general.architecture"] = "gemma2"
-	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
-	kv["gemma2.embedding_length"] = p.HiddenSize
-	kv["gemma2.block_count"] = p.HiddenLayers
-	kv["gemma2.feed_forward_length"] = p.IntermediateSize
-	kv["gemma2.attention.head_count"] = p.NumAttentionHeads
-	kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads
-	kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
-	kv["gemma2.attention.key_length"] = p.HeadDim
-	kv["gemma2.attention.value_length"] = p.HeadDim
-	kv["gemma2.attention.sliding_window"] = p.SlidingWindow
-	kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap
-	kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap
-	kv["tokenizer.ggml.eot_token_id"] = uint32(107)
-	kv["tokenizer.ggml.middle_token_id"] = uint32(68)
-	kv["tokenizer.ggml.prefix_token_id"] = uint32(67)
-	kv["tokenizer.ggml.suffix_token_id"] = uint32(69)
-	return kv
-}
-
-func (p *gemma2Model) Replacements() []string {
-	return append(
-		p.gemmaModel.Replacements(),
-		"post_attention_layernorm", "post_attention_norm",
-		"pre_feedforward_layernorm", "ffn_norm",
-		"post_feedforward_layernorm", "post_ffw_norm",
-	)
-}
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -1,91 +0,0 @@
-package convert
-
-import (
-	"strings"
-
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type gemma2Adapter struct {
-	AdapterParameters
-}
-
-var _ AdapterConverter = (*gemma2Adapter)(nil)
-
-func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
-	kv := p.AdapterParameters.KV()
-	kv["general.architecture"] = "gemma2"
-	return kv
-}
-
-func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
-	for _, t := range ts {
-		shape := t.Shape()
-		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
-			(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
-			shape[0], shape[1] = shape[1], shape[0]
-			t.SetRepacker(p.repack)
-		}
-
-		out = append(out, llm.Tensor{
-			Name:     t.Name(),
-			Kind:     t.Kind(),
-			Shape:    t.Shape(),
-			WriterTo: t,
-		})
-	}
-
-	return out
-}
-
-func (p *gemma2Adapter) Replacements() []string {
-	return []string{
-		"base_model.model.", "",
-		"model.layers", "blk",
-		"self_attn.q_proj", "attn_q",
-		"self_attn.k_proj", "attn_k",
-		"self_attn.v_proj", "attn_v",
-		"self_attn.o_proj", "attn_output",
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.down_proj", "ffn_down",
-		"mlp.up_proj", "ffn_up",
-		"lora_A.weight", "weight.lora_a",
-		"lora_B.weight", "weight.lora_b",
-		"lora_a", "weight.lora_a",
-		"lora_b", "weight.lora_b",
-	}
-}
-
-func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
-	dims := []int{int(shape[1]), int(shape[0])}
-
-	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-
-	if err := n.T(1, 0); err != nil {
-		return nil, err
-	}
-
-	if err := n.Reshape(dims...); err != nil {
-		return nil, err
-	}
-
-	if err := n.Transpose(); err != nil {
-		return nil, err
-	}
-
-	ts, err := native.SelectF32(n, 1)
-	if err != nil {
-		return nil, err
-	}
-
-	var f32s []float32
-	for _, t := range ts {
-		f32s = append(f32s, t...)
-	}
-
-	return f32s, nil
-}
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -3,7 +3,6 @@ package convert
 import (
 	"cmp"
 	"fmt"
-	"math"
 	"strings"

 	"github.com/pdevine/tensor"
@@ -12,8 +11,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type llamaModel struct {
-	ModelParameters
+type llama struct {
+	Parameters
 	NLayers               uint32  `json:"n_layers"`
 	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
 	NLayer                uint32  `json:"n_layer"`
@@ -28,14 +27,8 @@ type llamaModel struct {
 	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
 	RopeTheta             float32 `json:"rope_theta"`
 	RopeScaling           struct {
-		Type                            string  `json:"type"`
-		RopeType                        string  `json:"rope_type"`
-		Factor                          float32 `json:"factor"`
-		LowFrequencyFactor              float32 `json:"low_freq_factor"`
-		HighFrequencyFactor             float32 `json:"high_freq_factor"`
-		OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
-
-		factors ropeFactor
+		Type   string  `json:"type"`
+		Factor float32 `json:"factor"`
 	} `json:"rope_scaling"`
 	RMSNormEPS       float32 `json:"rms_norm_eps"`
 	LayerNormEPS     float32 `json:"layer_norm_eps"`
@@ -44,11 +37,12 @@ type llamaModel struct {
 	HeadDim          uint32  `json:"head_dim"`
 }

-var _ ModelConverter = (*llamaModel)(nil)
+var _ Converter = (*llama)(nil)

-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
-	kv := p.ModelParameters.KV(t)
+func (p *llama) KV(t *Tokenizer) llm.KV {
+	kv := p.Parameters.KV(t)
 	kv["general.architecture"] = "llama"
+	kv["general.name"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize

 	kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@@ -77,27 +71,6 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 	if p.RopeScaling.Type == "linear" {
 		kv["llama.rope.scaling.type"] = p.RopeScaling.Type
 		kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
-	} else if p.RopeScaling.RopeType == "llama3" {
-		dim := p.HiddenSize / p.NumAttentionHeads
-		for i := uint32(0); i < dim; i += 2 {
-			factor := cmp.Or(p.RopeScaling.Factor, 8.0)
-			factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
-			factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
-
-			original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
-			lambdaLow := float32(original) / factorLow
-			lambdaHigh := float32(original) / factorHigh
-
-			lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
-			if lambda < float64(lambdaHigh) {
-				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
-			} else if lambda > float64(lambdaLow) {
-				p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
-			} else {
-				smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
-				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
-			}
-		}
 	}

 	if p.NumKeyValueHeads > 0 {
@@ -117,29 +90,24 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 		kv["llama.attention.value_length"] = p.HeadDim
 	}

+	if len(t.Merges) > 0 {
+		kv["tokenizer.ggml.merges"] = t.Merges
+	}
+
 	return kv
 }

-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
-
-	if p.RopeScaling.factors != nil {
-		out = append(out, llm.Tensor{
-			Name:     "rope_freqs.weight",
-			Kind:     0,
-			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
-			WriterTo: p.RopeScaling.factors,
-		})
-	}
-
 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "attn_q.weight") ||
-			strings.HasSuffix(t.Name(), "attn_k.weight") {
+		name := p.tensorName(t.Name())
+		if strings.HasSuffix(name, "attn_q.weight") ||
+			strings.HasSuffix(name, "attn_k.weight") {
 			t.SetRepacker(p.repack)
 		}

 		out = append(out, llm.Tensor{
-			Name:     t.Name(),
+			Name:     name,
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@@ -149,8 +117,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *llamaModel) Replacements() []string {
-	return []string{
+func (p *llama) tensorName(n string) string {
+	return strings.NewReplacer(
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
@@ -164,19 +132,21 @@ func (p *llamaModel) Replacements() []string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-	}
+		// mixtral embeds llama and reuses this replacer for its MoE gate tensor
+		"block_sparse_moe.gate", "ffn_gate_inp",
+	).Replace(n)
 }

-func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
 	var dims []int
 	for _, dim := range shape {
 		dims = append(dims, int(dim))
 	}

 	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight") {
+	if strings.HasSuffix(name, "q_proj.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight") {
+	} else if strings.HasSuffix(name, "k_proj.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -1,169 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"strings"
-
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type llamaAdapter struct {
-	AdapterParameters
-	NumAttentionHeads uint32 `json:"num_attention_heads"`
-	NumKeyValueHeads  uint32 `json:"num_key_value_heads"`
-}
-
-var _ AdapterConverter = (*llamaAdapter)(nil)
-
-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
-	kv := p.AdapterParameters.KV()
-	kv["general.architecture"] = "llama"
-	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
-	kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
-
-	p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
-
-	return kv
-}
-
-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
-	for _, t := range ts {
-		shape := t.Shape()
-		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
-			(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
-			shape[0], shape[1] = shape[1], shape[0]
-			t.SetRepacker(p.repackAndTranspose)
-		} else {
-			t.SetRepacker(p.repack)
-		}
-
-		out = append(out, llm.Tensor{
-			Name:     t.Name(),
-			Kind:     t.Kind(),
-			Shape:    shape,
-			WriterTo: t,
-		})
-	}
-
-	return out
-}
-
-func (p *llamaAdapter) Replacements() []string {
-	return []string{
-		"base_model.model.", "",
-		"model.layers", "blk",
-		"self_attn.q_proj", "attn_q",
-		"self_attn.k_proj", "attn_k",
-		"self_attn.v_proj", "attn_v",
-		"self_attn.o_proj", "attn_output",
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.down_proj", "ffn_down",
-		"mlp.up_proj", "ffn_up",
-		"lora_A.weight", "weight.lora_a",
-		"lora_B.weight", "weight.lora_b",
-		"lora_a", "weight.lora_a",
-		"lora_b", "weight.lora_b",
-	}
-}
-
-func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
-	dims := []int{int(shape[1]), int(shape[0])}
-
-	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
-		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
-		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
-	} else {
-		return data, nil
-	}
-
-	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-
-	if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
-		return nil, err
-	}
-
-	if err := n.T(0, 2, 1, 3); err != nil {
-		return nil, err
-	}
-
-	if err := n.Reshape(dims...); err != nil {
-		return nil, err
-	}
-
-	if err := n.Transpose(); err != nil {
-		return nil, err
-	}
-
-	ts, err := native.SelectF32(n, 1)
-	if err != nil {
-		return nil, err
-	}
-
-	var f32s []float32
-	for _, t := range ts {
-		f32s = append(f32s, t...)
-	}
-
-	return f32s, nil
-}
-
-func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
-	dims := []int{int(shape[1]), int(shape[0])}
-
-	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-
-	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
-		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
-		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
-	}
-
-	if heads > 0 {
-		if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
-			return nil, err
-		}
-
-		if err := n.T(0, 2, 1, 3); err != nil {
-			return nil, err
-		}
-
-		if err := n.Reshape(dims...); err != nil {
-			return nil, err
-		}
-
-		if err := n.Transpose(); err != nil {
-			return nil, err
-		}
-	}
-
-	if err := n.T(1, 0); err != nil {
-		return nil, err
-	}
-
-	if err := n.Reshape(dims...); err != nil {
-		return nil, err
-	}
-
-	if err := n.Transpose(); err != nil {
-		return nil, err
-	}
-
-	ts, err := native.SelectF32(n, 1)
-	if err != nil {
-		return nil, err
-	}
-
-	var f32s []float32
-	for _, t := range ts {
-		f32s = append(f32s, t...)
-	}
-
-	return f32s, nil
-}
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -9,14 +9,16 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type mixtralModel struct {
-	llamaModel
+type mixtral struct {
+	llama
 	NumLocalExperts    uint32 `json:"num_local_experts"`
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
-	kv := p.llamaModel.KV(t)
+var _ Converter = (*mixtral)(nil)
+
+func (p *mixtral) KV(t *Tokenizer) llm.KV {
+	kv := p.llama.KV(t)

 	if p.NumLocalExperts > 0 {
 		kv["llama.expert_count"] = p.NumLocalExperts
@@ -29,7 +31,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -67,14 +69,7 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 		})
 	}

-	return append(out, p.llamaModel.Tensors(ts)...)
-}
-
-func (p *mixtralModel) Replacements() []string {
-	return append(
-		p.llamaModel.Replacements(),
-		"block_sparse_moe.gate", "ffn_gate_inp",
-	)
+	return append(out, p.llama.Tensors(ts)...)
 }

 type experts []Tensor
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -1,123 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"encoding/binary"
-	"io"
-	"math"
-	"strings"
-	"sync"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type phi3Model struct {
-	ModelParameters
-	NumHiddenLayers   uint32  `json:"num_hidden_layers"`
-	NLayers           uint32  `json:"n_layers"`
-	HiddenSize        uint32  `json:"hidden_size"`
-	NEmbd             uint32  `json:"n_embd"`
-	IntermediateSize  uint32  `json:"intermediate_size"`
-	NumAttentionHeads uint32  `json:"num_attention_heads"`
-	NHead             uint32  `json:"n_head"`
-	NumKeyValueHeads  uint32  `json:"num_key_value_heads"`
-	NHeadKV           uint32  `json:"n_head_kv"`
-	RopeTheta         float32 `json:"rope_theta"`
-	RopeScaling       struct {
-		Type        string     `json:"type"`
-		LongFactor  ropeFactor `json:"long_factor"`
-		ShortFactor ropeFactor `json:"short_factor"`
-	} `json:"rope_scaling"`
-	RMSNormEPS                    float32 `json:"rms_norm_eps"`
-	NPositions                    uint32  `json:"n_positions"`
-	MaxPositionEmbeddings         uint32  `json:"max_position_embeddings"`
-	OriginalMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
-	SlidingWindow                 uint32  `json:"sliding_window"`
-}
-
-var _ ModelConverter = (*phi3Model)(nil)
-
-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
-	kv := p.ModelParameters.KV(t)
-	kv["general.architecture"] = "phi3"
-	kv["phi3.context_length"] = p.MaxPositionEmbeddings
-	kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
-	kv["phi3.feed_forward_length"] = p.IntermediateSize
-	kv["phi3.block_count"] = cmp.Or(p.NumHiddenLayers, p.NLayers)
-	kv["phi3.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead)
-	kv["phi3.attention.head_count_kv"] = cmp.Or(p.NumKeyValueHeads, p.NHeadKV)
-	kv["phi3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
-	kv["phi3.rope.dimension_count"] = p.HiddenSize / cmp.Or(p.NumAttentionHeads, p.NHead)
-	kv["phi3.rope.freq_base"] = p.RopeTheta
-	kv["phi3.rope.scaling.original_context_length"] = p.OriginalMaxPositionEmbeddings
-	kv["phi3.attention.sliding_window"] = p.SlidingWindow
-
-	scale := float64(p.MaxPositionEmbeddings) / float64(p.OriginalMaxPositionEmbeddings)
-
-	switch p.RopeScaling.Type {
-	case "":
-		// no scaling
-	case "su", "longrope":
-		kv["phi3.rope.scaling.attn_factor"] = float32(max(math.Sqrt(1+math.Log(scale)/math.Log(float64(p.OriginalMaxPositionEmbeddings))), 1.0))
-	case "yarn":
-		kv["phi3.rope.scaling.attn_factor"] = float32(max(0.1*math.Log(scale)+1.0, 1.0))
-	default:
-		panic("unknown rope scaling type")
-	}
-
-	return kv
-}
-
-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
-	var addRopeFactors sync.Once
-
-	out := make([]llm.Tensor, 0, len(ts)+2)
-	for _, t := range ts {
-		if strings.HasPrefix(t.Name(), "blk.0.") {
-			addRopeFactors.Do(func() {
-				out = append(out, llm.Tensor{
-					Name:     "rope_factors_long.weight",
-					Kind:     0,
-					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
-					WriterTo: p.RopeScaling.LongFactor,
-				}, llm.Tensor{
-					Name:     "rope_factors_short.weight",
-					Kind:     0,
-					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
-					WriterTo: p.RopeScaling.ShortFactor,
-				})
-			})
-		}
-
-		out = append(out, llm.Tensor{
-			Name:     t.Name(),
-			Kind:     t.Kind(),
-			Shape:    t.Shape(),
-			WriterTo: t,
-		})
-	}
-
-	return out
-}
-
-func (p *phi3Model) Replacements() []string {
-	return []string{
-		"lm_head", "output",
-		"model.embed_tokens", "token_embd",
-		"model.norm", "output_norm",
-		"model.layers", "blk",
-		"input_layernorm", "attn_norm",
-		"self_attn.qkv_proj", "attn_qkv",
-		"self_attn.o_proj", "attn_output",
-		"mlp.down_proj", "ffn_down",
-		"mlp.gate_up_proj", "ffn_up",
-		"post_attention_layernorm", "ffn_norm",
-	}
-}
-
-type ropeFactor []float32
-
-func (r ropeFactor) WriteTo(w io.Writer) (int64, error) {
-	err := binary.Write(w, binary.LittleEndian, r)
-	return 0, err
-}
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -1,9 +1,7 @@
 package convert

 import (
-	"bytes"
 	"crypto/sha256"
-	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
 	"flag"
@@ -31,7 +29,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	}
 	defer f.Close()

-	if err := ConvertModel(fsys, f); err != nil {
+	if err := Convert(fsys, f); err != nil {
 		t.Fatal(err)
 	}

@@ -53,34 +51,6 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
-	actual := make(map[string]string)
-	for k, v := range kv {
-		if s, ok := v.(json.Marshaler); !ok {
-			actual[k] = fmt.Sprintf("%v", v)
-		} else {
-			bts, err := json.Marshal(s)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
-		}
-	}
-
-	for _, tensor := range tensors.Items {
-		sha256sum := sha256.New()
-		sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
-		if _, err := io.Copy(sha256sum, sr); err != nil {
-			t.Fatal(err)
-		}
-
-		actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
-	}
-
-	return actual
-}
-
 func TestMain(m *testing.M) {
 	var level slog.Level
 	flag.TextVar(&level, "level", slog.LevelInfo, "log level")
@@ -92,14 +62,9 @@ func TestMain(m *testing.M) {
 func TestConvertFull(t *testing.T) {
 	cases := []string{
 		"Meta-Llama-3-8B-Instruct",
-		"Meta-Llama-3.1-8B-Instruct",
 		"Mistral-7B-Instruct-v0.2",
 		"Mixtral-8x7B-Instruct-v0.1",
 		"gemma-2b-it",
-		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
-		"Phi-3-mini-128k-instruct",
-		"all-MiniLM-L6-v2",
-		"gemma-2-9b-it",
 	}

 	for i := range cases {
@@ -115,7 +80,29 @@ func TestConvertFull(t *testing.T) {
 			}

 			f, kv, tensors := convertFull(t, os.DirFS(p))
-			actual := generateResultsJSON(t, f, kv, tensors)
+			actual := make(map[string]string)
+			for k, v := range kv {
+				if s, ok := v.(json.Marshaler); !ok {
+					actual[k] = fmt.Sprintf("%v", v)
+				} else {
+					bts, err := json.Marshal(s)
+					if err != nil {
+						t.Fatal(err)
+					}
+
+					actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
+				}
+			}
+
+			for _, tensor := range tensors.Items {
+				sha256sum := sha256.New()
+				sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
+				if _, err := io.Copy(sha256sum, sr); err != nil {
+					t.Fatal(err)
+				}
+
+				actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
+			}

 			expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
 			if err != nil {
@@ -139,209 +126,3 @@ func TestConvertFull(t *testing.T) {
 		})
 	}
 }
-
-func TestConvertAdapter(t *testing.T) {
-	type AdapterCase struct {
-		Name     string
-		BaseKV   map[string]any
-		Expected map[string]string
-	}
-
-	cases := []AdapterCase{
-		{
-			Name: "discollama",
-			BaseKV: map[string]any{
-				"general.architecture":          "llama",
-				"llama.attention.head_count":    uint32(32),
-				"llama.attention.head_count_kv": uint32(8),
-			},
-			Expected: map[string]string{
-				"general.architecture":          "llama",
-				"general.file_type":             "1",
-				"general.parameter_count":       "106496",
-				"general.type":                  "adapter",
-				"general.version":               "v0.2",
-				"adapter.lora.alpha":            "16",
-				"adapter.type":                  "lora",
-				"llama.attention.head_count":    "32",
-				"llama.attention.head_count_kv": "8",
-				"blk.31.attn_q.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
-				"blk.31.attn_q.weight.lora_b":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
-				"blk.31.attn_v.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
-				"blk.31.attn_v.weight.lora_b":   "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
-			},
-		},
-	}
-
-	for _, c := range cases {
-		t.Run(c.Name, func(t *testing.T) {
-			t.Parallel()
-
-			f, err := os.CreateTemp(t.TempDir(), "f16")
-			if err != nil {
-				t.Fatal(err)
-			}
-			defer f.Close()
-
-			tempDir := t.TempDir()
-			generateLoraTestData(t, tempDir)
-
-			if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
-				t.Fatal(err)
-			}
-
-			r, err := os.Open(f.Name())
-			if err != nil {
-				t.Fatal(err)
-			}
-			defer r.Close()
-
-			m, _, err := llm.DecodeGGML(r, math.MaxInt)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			if _, err := r.Seek(0, io.SeekStart); err != nil {
-				t.Fatal(err)
-			}
-
-			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
-
-			keys := maps.Keys(c.Expected)
-			slices.Sort(keys)
-			for _, k := range keys {
-				if v, ok := actual[k]; !ok {
-					t.Errorf("missing %s", k)
-				} else if v != c.Expected[k] {
-					t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
-				}
-			}
-		})
-	}
-}
-
-func generateLoraTestData(t *testing.T, tempDir string) {
-	type tensorData struct {
-		Offsets []int  `json:"data_offsets"`
-		Type    string `json:"dtype"`
-		Shape   []int  `json:"shape"`
-	}
-	offset := 4096 * 8 * 4
-
-	td := map[string]*tensorData{"__metadata__": nil}
-	td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
-		Offsets: []int{0, offset},
-		Type:    "F32",
-		Shape:   []int{4096, 8},
-	}
-	td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
-		Offsets: []int{offset, offset * 2},
-		Type:    "F32",
-		Shape:   []int{8, 4096},
-	}
-	td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
-		Offsets: []int{offset * 2, offset * 3},
-		Type:    "F32",
-		Shape:   []int{4096, 8},
-	}
-	td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
-		Offsets: []int{offset * 3, offset*3 + 8*1024*4},
-		Type:    "F32",
-		Shape:   []int{8, 1024},
-	}
-
-	data, err := json.Marshal(td)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var buf bytes.Buffer
-
-	l := int64(len(data))
-	err = binary.Write(&buf, binary.LittleEndian, l)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	_, err = buf.Write(data)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	// write some data for the tensors
-
-	ones := make([]float32, 4096*8)
-	for i := range ones {
-		ones[i] = float32(1)
-	}
-
-	for range 3 {
-		err = binary.Write(&buf, binary.LittleEndian, ones)
-		if err != nil {
-			t.Fatal(err)
-		}
-	}
-
-	ones = make([]float32, 1024*8)
-	for i := range ones {
-		ones[i] = float32(1)
-	}
-
-	err = binary.Write(&buf, binary.LittleEndian, ones)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer fdata.Close()
-
-	_, err = fdata.Write(buf.Bytes())
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	configData := `
-{
-    "adapter_path": "adapters-test",
-    "batch_size": 8,
-    "config": "config-tiny.json",
-    "data": "../discollama-completion",
-    "grad_checkpoint": null,
-    "iters": 1000,
-    "learning_rate": 1e-05,
-    "lora_layers": 1,
-    "lora_parameters": {
-        "rank": 8,
-        "alpha": 16,
-        "dropout": 0.0,
-        "scale": 2.0
-    },
-    "lr_schedule": null,
-    "max_seq_length": 2048,
-    "model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
-    "resume_adapter_file": null,
-    "save_every": 100,
-    "seed": 0,
-    "steps_per_eval": 200,
-    "steps_per_report": 10,
-    "test": false,
-    "test_batches": 500,
-    "train": true,
-    "use_dora": false,
-    "val_batches": 25
-}
-`
-	f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	_, err = f.WriteString(configData)
-	if err != nil {
-		t.Fatal(err)
-	}
-}
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -35,9 +35,7 @@ const (
 )

 func (t tensorBase) Kind() uint32 {
-	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
-		t.name == "token_types.weight" {
-		// these tensors are always F32
+	if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") {
 		return 0
 	}

@@ -57,15 +55,13 @@ func (t *tensorBase) SetRepacker(fn repacker) {

 type repacker func(string, []float32, []uint64) ([]float32, error)

-func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
+func parseTensors(fsys fs.FS) ([]Tensor, error) {
 	patterns := []struct {
 		Pattern string
-		Func    func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
+		Func    func(fs.FS, ...string) ([]Tensor, error)
 	}{
 		{"model-*-of-*.safetensors", parseSafetensors},
 		{"model.safetensors", parseSafetensors},
-		{"adapters.safetensors", parseSafetensors},
-		{"adapter_model.safetensors", parseSafetensors},
 		{"pytorch_model-*-of-*.bin", parseTorch},
 		{"pytorch_model.bin", parseTorch},
 		{"consolidated.*.pth", parseTorch},
@@ -78,7 +74,7 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 		}

 		if len(matches) > 0 {
-			return pattern.Func(fsys, replacer, matches...)
+			return pattern.Func(fsys, matches...)
 		}
 	}

--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -8,7 +8,6 @@ import (
 	"io"
 	"io/fs"
 	"slices"
-	"strings"

 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
@@ -21,7 +20,7 @@ type safetensorMetadata struct {
 	Offsets []int64  `json:"data_offsets"`
 }

-func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
+func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		f, err := fsys.Open(p)
@@ -57,7 +56,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
 					offset: safetensorsPad(n, value.Offsets[0]),
 					size:   safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
 					tensorBase: &tensorBase{
-						name:  replacer.Replace(key),
+						name:  key,
 						shape: value.Shape,
 					},
 				})
--- a/convert/reader_torch.go
+++ b/convert/reader_torch.go
@@ -3,13 +3,12 @@ package convert
 import (
 	"io"
 	"io/fs"
-	"strings"

 	"github.com/nlpodyssey/gopickle/pytorch"
 	"github.com/nlpodyssey/gopickle/types"
 )

-func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
+func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		pt, err := pytorch.Load(p)
@@ -28,7 +27,7 @@ func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor,
 			ts = append(ts, torch{
 				storage: t.(*pytorch.Tensor).Source,
 				tensorBase: &tensorBase{
-					name:  replacer.Replace(k.(string)),
+					name:  k.(string),
 					shape: shape,
 				},
 			})
--- a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json
+++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json
@@ -1,3 +0,0 @@
-{
-  "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
-}
--- a/convert/testdata/Phi-3-mini-128k-instruct.json
+++ b/convert/testdata/Phi-3-mini-128k-instruct.json
@@ -1,225 +0,0 @@
-{
-  "general.architecture": "phi3",
-  "general.file_type": "1",
-  "general.quantization_version": "2",
-  "phi3.block_count": "32",
-  "phi3.context_length": "131072",
-  "phi3.embedding_length": "3072",
-  "phi3.feed_forward_length": "8192",
-  "phi3.rope.scaling.original_context_length": "4096",
-  "phi3.rope.dimension_count": "96",
-  "phi3.rope.freq_base": "10000",
-  "phi3.rope.scaling.attn_factor": "1.1902381",
-  "phi3.attention.head_count": "32",
-  "phi3.attention.head_count_kv": "32",
-  "phi3.attention.layer_norm_rms_epsilon": "1e-05",
-  "phi3.attention.sliding_window": "262144",
-  "tokenizer.ggml.model": "llama",
-  "tokenizer.ggml.pre": "default",
-  "tokenizer.ggml.add_bos_token": "false",
-  "tokenizer.ggml.add_eos_token": "false",
-  "tokenizer.ggml.bos_token_id": "1",
-  "tokenizer.ggml.eos_token_id": "32000",
-  "tokenizer.ggml.unknown_token_id": "0",
-  "tokenizer.ggml.padding_token_id": "32000",
-  "tokenizer.ggml.scores": "6e37bcde2adc7e350e87c496eddd7a2124329c1dc66c5bf3ad3997253e4f7a62",
-  "tokenizer.ggml.token_type": "b6ecf55ec64ee67d87750bdb8d757a2c58bf78377e9f4219f5689a6c4dea57ce",
-  "tokenizer.ggml.tokens": "d168da3ddd3eee820916945fcb9baf24dd3cde42f606cffa2d19e7c8a8743918",
-  "blk.0.attn_norm.weight": "216aeb2c9e0c271f899e1ef2a63cceeb8f41e97642e84fada54b1d3c1c11cf25",
-  "blk.0.attn_output.weight": "b597d56f7188ffc1fafc273fadc59d41738cffd677ae98c61a62c3285b3a3099",
-  "blk.0.attn_qkv.weight": "d28a6b44e13f59be5483e4be2bedb544e346168d720aca27f47d1a5a722be91e",
-  "blk.0.ffn_down.weight": "4a691370e5a61fcbbf540fbcbf4c0f1d15dec0364528c0e916d0744f6262b63b",
-  "blk.0.ffn_norm.weight": "0c00af2b4a3128bec64a0cbb1084b042fdbe13d9ad0d03bd577f9449dfead338",
-  "blk.0.ffn_up.weight": "b32b52f790c1c083bfb8a3126dc1111cfeeb28dc8c584a930a1e5334cb176bf4",
-  "blk.1.attn_norm.weight": "68748011503c6c029e8e69a84a8e5a89338f378769627b6dbf7f93d715c292e1",
-  "blk.1.attn_output.weight": "2267344add13b048ca59e4377c86dc512be8046a57156901fa32a20fa74e4ee0",
-  "blk.1.attn_qkv.weight": "9109d2e3d7a2eacfda5226587b8be124a3bf44b972da7ebb17aa15795897eacc",
-  "blk.1.ffn_down.weight": "d675df4df4dd039c0c339ad6445d39eddd2004db6bf35bed6314c7497245a633",
-  "blk.1.ffn_norm.weight": "3b5767ae977bc8baaa06b06efdbea193b6b3ba605ce76d77a76ce317e935500c",
-  "blk.1.ffn_up.weight": "80dfd6d9d234b00334c89b8e0a02f81899c2efd377321c34ba5ba51a5f61b5ff",
-  "blk.2.attn_norm.weight": "6a6743b057e5088f145bc179e92c9bfb41163e7295d7b81c62e23dd89d2b59c4",
-  "blk.2.attn_output.weight": "bc5491ea54e0db81462d7d9b7d25cbdda380c2db8de041bd1c4ab7b76a1d19c3",
-  "blk.2.attn_qkv.weight": "a61287a9852e2f5aca9c100b471d98398b2913a3497c743de3c70ec9ddd7087f",
-  "blk.2.ffn_down.weight": "4fddcc382c8dceeab027fe43d8d44e67edb5e8ce4b9a1b7f773c87770380ade1",
-  "blk.2.ffn_norm.weight": "07e05f82b3f63f711db3b684ca79aed25c0657917e66f88af47348a82065c227",
-  "blk.2.ffn_up.weight": "4835a682ef1826c12df01ae7663fc45f9c82bc8e64b665f13fb7da8e201ec0fb",
-  "blk.3.attn_norm.weight": "f22aba7c03999ba7136f39cda747a39715e498699dc1716cd97fc5dfc58d1b1c",
-  "blk.3.attn_output.weight": "53b579855366fd786c5126b2b30aac4d583ca7bda56833c4865f5cadb5c18c6d",
-  "blk.3.attn_qkv.weight": "bb56aba78158123140fcea59c69ac562ca208f6d3086819417cdad8c50f333ad",
-  "blk.3.ffn_down.weight": "97280897a7cd86db2830c004bccc5bc094f50e293baded0189159a2019145a6e",
-  "blk.3.ffn_norm.weight": "10a8c99f8b57a960e8e0a1133c4a26f9148403d1b9bff2eff114917de996f3b5",
-  "blk.3.ffn_up.weight": "7324046c915e75d621b2043597a245a428d8eea31869135e6257a861491d8dcc",
-  "blk.4.attn_norm.weight": "507d8e164de94646edbfe33def8e8fbf7c9a6ee3fbaedb5000f72d9f51ec5e36",
-  "blk.4.attn_output.weight": "bbb3429e6efa98c150e0fdbf48c16180cbf0d0cbc1b3c253c6c319d78f4593a2",
-  "blk.4.attn_qkv.weight": "b95ee5be0786d3901273d806c339fe6c20e6bfffd2a20672a9f56af80921e8ab",
-  "blk.4.ffn_down.weight": "806bbf91df92a5a22bd5aa1ffb7fc2869f7293ffc7704771c290ecc583b27975",
-  "blk.4.ffn_norm.weight": "cfc2930a81df7aee3a5e7f726a15c1182233e868bf0d9d37f6b6ae6d8c15c234",
-  "blk.4.ffn_up.weight": "c3390c69533de2c8424e8069323ccc5d0c4543111535da04cf2c7d26745576aa",
-  "blk.5.attn_norm.weight": "0d71c4fbcefabbd021569442853d2fe90668b19409ae2805a718a829ca60beab",
-  "blk.5.attn_output.weight": "10ebd93629112bf2df5c30dd0953a4a5e9020306768283181ed426934d47e14f",
-  "blk.5.attn_qkv.weight": "5cb05633369f12d4b00e0ff787736bd846856682115720ebc6cce05270c334f6",
-  "blk.5.ffn_down.weight": "e28bcc5094212eafc7476dbc5b7a520d25b79578cbf4229d698e2655956a80ad",
-  "blk.5.ffn_norm.weight": "b6f2c4cf9f34bb4d59989f96165c14a67dc1e266ad0a6d0fcc49f1add929e6ff",
-  "blk.5.ffn_up.weight": "0f9ef99423cc07ebedc0e9cfa95809f2d7108d910bb4ef97ebc0b0309c440750",
-  "blk.6.attn_norm.weight": "b3edcc47a42218234f7564d7470611b49401a41ae8cd42123f86557c69f5d7f2",
-  "blk.6.attn_output.weight": "eb9b7d257b388bb5b8fe0515e5c6873317239cb94cda236e4b6ada2a6c57c65c",
-  "blk.6.attn_qkv.weight": "eb968081f478c52f07bd9c2761741e982dba33cc4eeadeea3557d391b9ac2106",
-  "blk.6.ffn_down.weight": "1b8588bb7463206290322695577dcfced300895d6e6f4b26966c53a9ae2f0f84",
-  "blk.6.ffn_norm.weight": "1219c04b7770983c77814200eefe743f46d15328ea2b12711e44f8103eab08d3",
-  "blk.6.ffn_up.weight": "197ef287239fec47c55677f0fbb66eaf0644f775bc382de843971730721394f6",
-  "blk.7.attn_norm.weight": "b630ad08c80d564ed1c024384818e9fd3f22a36cd7a14aa96e7e2759a8285099",
-  "blk.7.attn_output.weight": "970255aa750828a47d6b9d399f9612b5bf25aefe7dadbcba41fc416d0d4067c1",
-  "blk.7.attn_qkv.weight": "ebb157c880293e6de8d629f263ba8853ed1dbdc02c311d43432bb8cfbb310739",
-  "blk.7.ffn_down.weight": "24bcd4db4cba844c89f878b81843c373dbbc0675e889d32c5b12e63384a7b670",
-  "blk.7.ffn_norm.weight": "b9c6f71001808ee873ce7db8056e4b53fb4cccec8b7f0f312899b575fae39d39",
-  "blk.7.ffn_up.weight": "979f1828d227455c26015a2a11afe9dd05f2bb97a8ba6b38c8dab3f50e627401",
-  "blk.8.attn_norm.weight": "4e8e347e3775010b7112ee630f2f4f2383be7ff64e6ca6154b9b22566552eaa6",
-  "blk.8.attn_output.weight": "65a44babf44a435a1829945211b3168f9ec78ac3cb7a049a733e93d11f0d6659",
-  "blk.8.attn_qkv.weight": "343ed07671da400b040812a4058482fa38284b5d9af9becfed07417fe26ce747",
-  "blk.8.ffn_down.weight": "7fb7e073e3c2c503c4e9d60efa0988fed7398d900cc003695fe3fffd3e188b82",
-  "blk.8.ffn_norm.weight": "b07c1f655d8593e3892a2cf73f8a0c19ce8e5cb613fafbe7cbd430da8ce4c57d",
-  "blk.8.ffn_up.weight": "8b26e14de54b3fdc2e2d3ea41720f9d9c236a93688c3b7fd7bf43f5fbb327c9b",
-  "blk.9.attn_norm.weight": "46394d408a8e316916177e6aa261de32e137a82d729c0b1800b072f0c38c39b6",
-  "blk.9.attn_output.weight": "d57f3d46107947a7073373a0b35d6ecf7759b5df15406f4a3590a60666af6b16",
-  "blk.9.attn_qkv.weight": "14bb8ace8c5453148f4b536e9f4279c813f31136716947256f5cca333448639c",
-  "blk.9.ffn_down.weight": "2b8d98e2b5ed68338f6e4de43bf7de0c4858cc69103cd5177725f7444eec7694",
-  "blk.9.ffn_norm.weight": "41a499dfd418cc4c6b8c12313f673f7e2cd4a3f9c4065eb6c4feb5eed02fb542",
-  "blk.9.ffn_up.weight": "143aab7533a64b17fbe201490a6f674bc7f0bd370c094500b2e100419073d1c2",
-  "blk.10.attn_norm.weight": "ebb670aafd36816a794347287269d8f1a5b19c1e3c0a1e38023bc19fdba9b073",
-  "blk.10.attn_output.weight": "b5d65bbc0ed5e49fdd9d754bc18163cd042a285024d0cf6f954c503bc8c877cb",
-  "blk.10.attn_qkv.weight": "f06b15bac88da798fa34a62b03eaac0dbe8b846020516603c387541f2d8dd672",
-  "blk.10.ffn_down.weight": "fb091fcd1b4de25d1bea94d1755e255cb02914a030d23e3a234e57b8d46bde6e",
-  "blk.10.ffn_norm.weight": "eb347bdf9c40414af87e13a8e72e40b31f004b50f7cb366f1a219ced60a61355",
-  "blk.10.ffn_up.weight": "ed2d52fc881a173f404fe8a1067862c9856d6c3e0d2e90a330a7aa394e3f84d1",
-  "blk.11.attn_norm.weight": "64e252603cf010a0e502ca39fdf8d0a196a79aec67c0d2bb9213fc0cb80c47d4",
-  "blk.11.attn_output.weight": "228e33e21c69f52efc74fdfc831bc9af271e44b2a29a3dced1d64e667ce36eb5",
-  "blk.11.attn_qkv.weight": "ab9ce6d4ef9e42ee0da3f20a7708a3bbc5e79e967b05fa86ba946a05e2eb63eb",
-  "blk.11.ffn_down.weight": "0ca133b7835c98dc77c25d64e4eb7873778bdb5e4d22d8b80f920f46865b43bd",
-  "blk.11.ffn_norm.weight": "02455741a0dfd161c79aa1ecc381901721f229fdcda5615622a629631fb61cfd",
-  "blk.11.ffn_up.weight": "9fecdcc099fbb8e23c6b1ea9294702a027f4a58d265543ec5e7be79b8f63b354",
-  "blk.12.attn_norm.weight": "783bb459911b1b3609a9b2bdfe272f1670add73b5471da738e07ac47e2e07dfd",
-  "blk.12.attn_output.weight": "1e1a914c9e48b857206ac5a1f7cead994bc1ea91d5d4fff8c834d73f2e38ef5d",
-  "blk.12.attn_qkv.weight": "5953e7185ccb87fb4dae8f9426ec86315d4c7794326e8ab59b3a95d4af2189f0",
-  "blk.12.ffn_down.weight": "a3eecf0f394f86e2cfb48a5940a5c50ca86d71883b2f79fcc642a935fabce0d4",
-  "blk.12.ffn_norm.weight": "0a4272e41373c23bd72f10d2d82930aa3a1480aac75832bfbf01cebf0b86b6a4",
-  "blk.12.ffn_up.weight": "06f42776de3a7ceac3025f26a7a8bd20e062233cce2bdaa2183470dc4b30b87d",
-  "blk.13.attn_norm.weight": "5915da60fb03e201fa649faba780e5fdf1c761c262b206e5415cf83181f65780",
-  "blk.13.attn_output.weight": "4dbf6eab074fa3835fd32bd631a8208e511037d5056d2fd3015735cca7674ef7",
-  "blk.13.attn_qkv.weight": "d3d8339a1c4782d9e73d77fdebe154d3c5b83ac40c9175b3e91a4977d08f876b",
-  "blk.13.ffn_down.weight": "de6772b46a55e1fd42b007637dfbf68b6598e5d5b61622da0935002e1e192d3a",
-  "blk.13.ffn_norm.weight": "5a640ea3b8c7be49c95a58a2327e10d8e8d9d142504bde5c8091613e5b961d7a",
-  "blk.13.ffn_up.weight": "f35e3545e4bd3531b2e843b5efd31dee0c13c807ee6386e65473ba67bbec30d0",
-  "blk.14.attn_norm.weight": "9b34986450b7c98b4927e81e61a816f9e84b1addc7c14926402100037aad6678",
-  "blk.14.attn_output.weight": "155d52efb23d366016d861a251d4d1f4a0c13699188c50d50dba016a0d8bfcd9",
-  "blk.14.attn_qkv.weight": "8e1415084e1f33c73a777f19e752489f4dd312cca047733e5ea643cd4a955e04",
-  "blk.14.ffn_down.weight": "a2a142226b94baa01ccb65bdea2b7418e49085c1d9c3c63e544e3112c58a25da",
-  "blk.14.ffn_norm.weight": "8aecfd9b0ae6affaea31a80c5c9a4a14b31deaa0db7bd8f6da2a64d23447921c",
-  "blk.14.ffn_up.weight": "0c1407237b8c1bd02f193346b5681926fe698a5055eac6a7450451b0f991707c",
-  "blk.15.attn_norm.weight": "e037bd19880bfa83d983200fb0c7866f8ad16c3ff5cc4b4f3a37ca7373870ff6",
-  "blk.15.attn_output.weight": "045fe4fc95cc129a1b92771b179c11b12845c4c088786c607f17bd98857e68e1",
-  "blk.15.attn_qkv.weight": "7621b7559705cab1d4dea1c69f76dbf9dc1c8837a203b656f484703b9c1b70ce",
-  "blk.15.ffn_down.weight": "7e5ac20e290bc60761e1cd972354fde225b7fa861048d44d9a0dd9b046d55f58",
-  "blk.15.ffn_norm.weight": "b6d830d88f1db1825687973c8c2b1a24c6fa84f07af8d0e3ef9c86009baca0b2",
-  "blk.15.ffn_up.weight": "dcda0957cd04fc45476774dba2bbf9aa89d6b05d5ca7b10ae6f73ad2c49b1cd3",
-  "blk.16.attn_norm.weight": "4ee9b70ba15cb2a08240f93990e90f5068c48fceb481f8e2186bec8b7214eb3f",
-  "blk.16.attn_output.weight": "315cfe5536658d2498192b2980eade15b2c9a4ff220e4011911457b1727fa103",
-  "blk.16.attn_qkv.weight": "3c8122e3ad637583b9dcde8ff3a323267d3014bb1f0f9771e5322260ca9ecc8d",
-  "blk.16.ffn_down.weight": "3b5fbebd5ee2b86cad96fb8a9b45a8770d08f82c1c8b74d7061e866f7020a18d",
-  "blk.16.ffn_norm.weight": "ffab69f20bda372de6e5878f0539163e2fc6ba113621ded95705fc3b1465c9f0",
-  "blk.16.ffn_up.weight": "0935ea3d258da42d6258406365f39f58ddaabfe97ea5977580db3635188f24a1",
-  "blk.17.attn_norm.weight": "f030441733f3d147b4a06a1eb4aeb8465c7c24d9c53bf4c48fe7e134d3629803",
-  "blk.17.attn_output.weight": "07a955ef09e8dc766ac0df647d0b2c69f23c4c69a7137654b4aad80303ed0eda",
-  "blk.17.attn_qkv.weight": "1c10688061e21e2fe12ad0cb54bf03895c1f83c3b0df743a42f548b52cbca1b2",
-  "blk.17.ffn_down.weight": "ebb9cc9836f41d88fdae2aa9a4355514e4edaec8d1577ffeb947a35204e77f52",
-  "blk.17.ffn_norm.weight": "50aff44f6528b13db5389f2ddcdb7676244947610bd7ffbff3f881c968c2a0d4",
-  "blk.17.ffn_up.weight": "d716537949582be33bde6b02e38f5a70081c9642a9fb05a61312126718b8d148",
-  "blk.18.attn_norm.weight": "0ea695c4e53d637902f46663a6ee42adc493c36794476acc7dbddaa05b13840d",
-  "blk.18.attn_output.weight": "5fd35b500221a612eb4f4bddf0e9b6b7db4d7733032a75f8802fb2d884647c2e",
-  "blk.18.attn_qkv.weight": "b0da37fd030fe69581f990bf23bfd35467a1bbe558af6de7c0924f6b72e92317",
-  "blk.18.ffn_down.weight": "b355c33f44b328f4bb977567de8f7544db4b005d7a8fbded658518ecf3c5a153",
-  "blk.18.ffn_norm.weight": "58b3fe9094079989a86e0387143259e1cc35952d24dc3df290c4ba6df44f5c51",
-  "blk.18.ffn_up.weight": "2ce530954c342c30ed2ead5353f931960bfae1d278868504c0efb973560fabbe",
-  "blk.19.attn_norm.weight": "533e9aed66feea8f0392aa81f9e293240e1f009a5334253915fb60c2749b615d",
-  "blk.19.attn_output.weight": "84f2d00f98a4113a779d3b5d1c3e7c914eb47784d3ab13b290367c124c2994aa",
-  "blk.19.attn_qkv.weight": "fbe6b9f53b07fa7537d3b3d452d20a9bc666f9fd41ec2091dd28bc2f70fc668f",
-  "blk.19.ffn_down.weight": "b30199e098c8bb3f890183d8b18471e80b62b604729b277ad62488dd71e1206b",
-  "blk.19.ffn_norm.weight": "c81373e41cd340b7badb19f9517c77c4250b4eb9a02dc758b8b49b652487d7ff",
-  "blk.19.ffn_up.weight": "5a5cb083ca7725720e3a890f7fa46354760e8007a8188849a092e305694a75e3",
-  "blk.20.attn_norm.weight": "4953091b4477e354357a8e743ba0a1900633e52f1599ee082a0c9b0b2b5cd978",
-  "blk.20.attn_output.weight": "62d54f7749cd6856097b2632066a322b0296df915fe66f382c5b5981be0d4f23",
-  "blk.20.attn_qkv.weight": "406de9e35b0729ebe902d7a47905cc7fb29a921431ed35dbef0c03e5690a1329",
-  "blk.20.ffn_down.weight": "62fb678b0d1261e19a4903a2b347d67afcc8acff01feb33a687a35a2d1e6f9a5",
-  "blk.20.ffn_norm.weight": "cd9d36b7e71e55c8925b97bb09c28219f182626bcff094878ae39c3db887a14b",
-  "blk.20.ffn_up.weight": "b9276771d79d3e932e73ccc520c3f8476342b9ef312ed2ee1e0da822e6e3ad18",
-  "blk.21.attn_norm.weight": "66d8c8a35e13ce9c2a0e75b670150e2c31484a55c2316df46075312196178ed3",
-  "blk.21.attn_output.weight": "12ab46c9382648f9b3350fdd92a6be6352743d62d6b520d7e2024e0c838588f5",
-  "blk.21.attn_qkv.weight": "a7909676ee1675ca23cd29a5fdd226df8dd9d68f94c6c9bbb51dd9fd38504008",
-  "blk.21.ffn_down.weight": "6fb317279c6542e82f97d5a12a60fac1bd0fa0405154f9fbe265e2fe39bd49cc",
-  "blk.21.ffn_norm.weight": "c0f703eb3ff161b5ba4490d87d8684b8a6c47a8f433e12f418333b9db439010a",
-  "blk.21.ffn_up.weight": "6dbdb80ef0c35e364bbce12d40d5e74c7963c7b55d58d9579567a07ffce7b863",
-  "blk.22.attn_norm.weight": "f94237433bf03d675cb2f655b81ca91a1ce2447bc6b00b13d6b0ccfe2d411eff",
-  "blk.22.attn_output.weight": "e821f95995ce497c01e63ca64f737713b1b65f11df1903e51d444aa516f33f71",
-  "blk.22.attn_qkv.weight": "1b0f717c73afb5eb4c82a1708c4e85c969e8a2a8770d9ddb78b1870a2d8a781e",
-  "blk.22.ffn_down.weight": "0f33f7a3cdc685484be99aa0c03642b0b20850a27d1fddbe054b13a9382f3ccb",
-  "blk.22.ffn_norm.weight": "9df285cf211ddd7df2b36a50489af574755c7d4d98b29a05cd04566ae613c8dc",
-  "blk.22.ffn_up.weight": "63ac300e1efb34041dd0136cf43ea622fac6f0caccce1cd9262f5e08d2cf179c",
-  "blk.23.attn_norm.weight": "5f72d9e88689b4027b28f5f8f26cd3abb03635ceea7ec98a4c91a9fc691f6707",
-  "blk.23.attn_output.weight": "6ecf04ff61125c5fc768f8656497152149373daf321ee9c957e8f7245a1184d1",
-  "blk.23.attn_qkv.weight": "a9d9978806724c2959f2cf386c233831f08e1e933dbf2b32665e788d9d512ea4",
-  "blk.23.ffn_down.weight": "72c7d17886a3da17fa0daa456aa5e877b2ef5b8b403182b870d9ca5ca9c70347",
-  "blk.23.ffn_norm.weight": "971e4b712e3025a13419b5b57d674b5e4ab7f18f74b57b9afc4671623da90c4b",
-  "blk.23.ffn_up.weight": "df2b5c7dbd5834545b815073af0c7355b065124e6d6f0fee78d8fa5b2076dc3e",
-  "blk.24.attn_norm.weight": "c41957c4a79ad3b16f6e11daec1c7f530b9f3f4b618e1e4367c3b67787ac4ab6",
-  "blk.24.attn_output.weight": "ef7d61f5fc88ac6f31bf60cb5f4d2d6b8df42d38825807112361a7224b0dee3b",
-  "blk.24.attn_qkv.weight": "3e6a58fe7d49c90bb6971efbad3371c32256881173ea5aee4b0c296cb206490f",
-  "blk.24.ffn_down.weight": "f43619144047de42fed81dfa495f1815d3cb771330e574043e2b67620819292c",
-  "blk.24.ffn_norm.weight": "5501d4a2a98c8ca6b42e77b53b221dbc08f530f6a067256d787534ec6fe028bd",
-  "blk.24.ffn_up.weight": "d64c8b0e509e2b1118f6000176f8956cacecdbb200c7e95ed93fb78b6e26c84a",
-  "blk.25.attn_norm.weight": "502fa3c302d371f61c5791f4615b73018ffb1daa09b6499b227116581244c5d4",
-  "blk.25.attn_output.weight": "ad8391d4e9c980856f2547aa945b2b6a407a6382158dc1ddd4f08d94ecc24be6",
-  "blk.25.attn_qkv.weight": "42e8983780d4a01a02c54ad23d4df21eea437f119a10af5a9c12a76a42d308c1",
-  "blk.25.ffn_down.weight": "302dd010d4e0ab4eeaee89090409ea0dddeeeed3236415eb8f97c942497eea91",
-  "blk.25.ffn_norm.weight": "fb34c1ee5bca96986c08834df0a0c047ba041c1123ac1f563e9d64312bf82d6a",
-  "blk.25.ffn_up.weight": "10739a8de156816d93c92b935386540bfa976bdbef204f0312960f6fc657582f",
-  "blk.26.attn_norm.weight": "7036c711609128c4e55968ff3681d3043338879a5737efd6c2ac9e1a2a61f1a0",
-  "blk.26.attn_output.weight": "db5db45dead5cb911fa01da59832f121b7c18b2d167bf53741c40819f24d346c",
-  "blk.26.attn_qkv.weight": "cae34c6b7f82ed14348d5ed30a79919c383737c1694a9cb9c0de609d3b0c1d0a",
-  "blk.26.ffn_down.weight": "491ec3a4da9b4f49f8ebc6be658ce397a9b801ae9fb35e82177e47808c65e5d0",
-  "blk.26.ffn_norm.weight": "fd7059d75d7f0e5288511ddeeb0f772eb3cae3ccfe4226b877015834edc3c386",
-  "blk.26.ffn_up.weight": "ea1ee1274c56458ce056d2205e5bb6e5422ce4cb0ad58006b8141749b97a0c39",
-  "blk.27.attn_norm.weight": "cc362c9a937609265052cd38544af17a1a7448cea086d4c801139e1fc865832d",
-  "blk.27.attn_output.weight": "ba757a81dabde9cb1b069d1bb616fe79649a1724f756567ec61caed1304fe6cf",
-  "blk.27.attn_qkv.weight": "1ab8d7d02d87756c12c2275636823aa5ede3d683178225c4cac4bd892c319bd4",
-  "blk.27.ffn_down.weight": "deb1c711c8a66acf4dcd2d088e1548f8e08f296f755e4067d6557fa55afde88c",
-  "blk.27.ffn_norm.weight": "fc6242d8cb8a4a37a8ddb7e41e7e60a63d4a89edf36acb35df052f10b9c91ece",
-  "blk.27.ffn_up.weight": "8df39b09c4801f343aca78f2918a1f6db78c8c55e591eda4c69eadb74c26e180",
-  "blk.28.attn_norm.weight": "75b539308f77e3cefdc6d98484d8b5cbf0538f0c2869a77b7373a145a18bc850",
-  "blk.28.attn_output.weight": "ae128940eb60a6d2e121762ef4b3e9dcf9eb3e105b249507fa7f12de0e19822c",
-  "blk.28.attn_qkv.weight": "bdda781c288e9326c240e33905f8e621b6a2ad902e620739d34f93fcd6f933de",
-  "blk.28.ffn_down.weight": "f1d6e6d1c286b1138bfd7e53fe477f399ae93bc2c04e35416f84218ed7247965",
-  "blk.28.ffn_norm.weight": "3f837ce82c8b9bde0d61d08b6f5fe5574886ea5328dbdc53f2929f18da8b4087",
-  "blk.28.ffn_up.weight": "2af027002e31d1b6cfedbdb30a2b9d7213f3aa691167c353913adfd48fda31e4",
-  "blk.29.attn_norm.weight": "61e8003b5329462ffe0fe172f2b160260de006aed858332d49d75504b6b6aa7a",
-  "blk.29.attn_output.weight": "ca44542a72a37476dc73dbdcc01f5b7497cb3ebc4ea230a55c9634ccd8e56ad4",
-  "blk.29.attn_qkv.weight": "abb3d9d6abe57872ae3daa51935d43264093ded5ce63b49d1e280ee5758be0e4",
-  "blk.29.ffn_down.weight": "6764b895fce881df097489c263446f0106de36217997660c15984b3ee22a5a06",
-  "blk.29.ffn_norm.weight": "89e03e9a33fc0e6e31ba9f0c2bd7c5734a118c5602bb90148793e08a80e8d0ae",
-  "blk.29.ffn_up.weight": "fa7ad57a84954f4121653152efed1a871d8adb20a1ea9086e3e849ce359d7d2e",
-  "blk.30.attn_norm.weight": "91a697aca1e42af54f806a20211031c3369e8d0bd58df1b0147fe24954e1f5a4",
-  "blk.30.attn_output.weight": "36063fcf766c89ac75be56f688cc63cefe5f2c733fbf4378ea9956ad386fa148",
-  "blk.30.attn_qkv.weight": "2cacd1161f1121a2c0b979930134f4666f73fb8d7237b3b0659ae091b15955a6",
-  "blk.30.ffn_down.weight": "9f3fcb6217100595850c05dc98f9ab2a263afdb6ab28df2fcb08aeff512057d7",
-  "blk.30.ffn_norm.weight": "6c600bc1fc7de39d4f8917b81fc7d1d5ed2a9b56492234c13a4bd6028c30d880",
-  "blk.30.ffn_up.weight": "73cabd1bb011956b2689ea3338bb76642ef3a57c197377d666d2ab5f56317668",
-  "blk.31.attn_norm.weight": "72d3e1cc771380645fa75a899858c95f39857a4f3f1ed60fe1578df383b8bc53",
-  "blk.31.attn_output.weight": "40089cdd29994dc19a1d89fa15902a89cfeca3540f12dc9bf4d00ef82506e456",
-  "blk.31.attn_qkv.weight": "1d0bb40e9258071ae14290a53c619a8e331dda07354d2a02ef45766c029ae5e4",
-  "blk.31.ffn_down.weight": "8defa0e06335b793fa8be03883f0a322d6c5b33f52c69c943c35c60d16e42c0a",
-  "blk.31.ffn_norm.weight": "33c55d9d0c496ccfb130361fe131649346e098abaaac39c0519507e5d846721d",
-  "blk.31.ffn_up.weight": "599f6503f61c692c1f82001973d35119f9688db5e6be9d9c298411491c93f09b",
-  "output.weight": "14b8dc662bfa3308ebb2e102c562d8e52c15670e538f20f3216a9c310ca9dd41",
-  "output_norm.weight": "7f2294ba94ce65681df6c7ddd8698799199b9d77dc83c10bdad5c3999f0fdb82",
-  "rope_factors_long.weight": "e34d378664e354652c38f47d10dafb0498ccc2fb042d39ff7fef768146fff22b",
-  "rope_factors_short.weight": "9379146a4988f373d362fe47b06c75e7fe7c54aa4dc9558758df79b7a87471fd",
-  "token_embd.weight": "19a03c1fb5ac0baee93b0a7d8b0f26e9a9b011e229b694afc50ebfc13d84f8bf"
-}
--- a/convert/testdata/all-MiniLM-L6-v2.json
+++ b/convert/testdata/all-MiniLM-L6-v2.json
@@ -1,124 +0,0 @@
-{
-  "general.architecture": "bert",
-  "general.file_type": "1",
-  "general.quantization_version": "2",
-  "bert.attention.causal": "false",
-  "bert.attention.head_count": "12",
-  "bert.attention.layer_norm_epsilon": "1e-12",
-  "bert.block_count": "6",
-  "bert.context_length": "512",
-  "bert.embedding_length": "384",
-  "bert.feed_forward_length": "1536",
-  "bert.pooling_type": "1",
-  "tokenizer.ggml.model": "bert",
-  "tokenizer.ggml.padding_token_id": "0",
-  "tokenizer.ggml.unknown_token_id": "100",
-  "tokenizer.ggml.cls_token_id": "101",
-  "tokenizer.ggml.seperator_token_id": "102",
-  "tokenizer.ggml.mask_token_id": "103",
-  "tokenizer.ggml.token_type_count": "2",
-  "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
-  "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
-  "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
-  "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
-  "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
-  "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
-  "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
-  "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
-  "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
-  "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
-  "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
-  "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
-  "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
-  "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
-  "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
-  "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
-  "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
-  "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
-  "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
-  "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
-  "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
-  "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
-  "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
-  "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
-  "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
-  "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
-  "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
-  "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
-  "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
-  "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
-  "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
-  "blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
-  "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
-  "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
-  "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
-  "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
-  "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
-  "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
-  "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
-  "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
-  "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
-  "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
-  "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
-  "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
-  "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
-  "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
-  "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
-  "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
-  "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
-  "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
-  "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
-  "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
-  "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
-  "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
-  "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
-  "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
-  "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
-  "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
-  "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
-  "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
-  "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
-  "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
-  "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
-  "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
-  "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
-  "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
-  "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
-  "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
-  "blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
-  "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
-  "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
-  "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
-  "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
-  "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
-  "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
-  "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
-  "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
-  "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
-  "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
-  "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
-  "blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
-  "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
-  "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
-  "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
-  "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
-  "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
-  "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
-  "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
-  "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
-  "blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
-  "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
-  "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
-  "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
-  "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
-  "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
-  "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
-  "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
-  "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
-  "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
-  "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
-  "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
-  "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
-  "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
-  "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
-}
--- a/convert/testdata/gemma-2-9b-it.json
+++ b/convert/testdata/gemma-2-9b-it.json
@@ -1,6 +0,0 @@
-{
-  "general.architecture": "gemma2",
-  "gemma2.attention.sliding_window": "4096",
-  "gemma2.attn_logit_softcapping": "50",
-  "gemma2.final_logit_softcapping": "30"
-}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -1,6 +1,7 @@
 package convert

 import (
+	"cmp"
 	"crypto/sha256"
 	"encoding/hex"
 	"encoding/json"
@@ -10,8 +11,6 @@ import (
 	"log/slog"
 	"os"
 	"slices"
-
-	"golang.org/x/exp/maps"
 )

 const (
@@ -185,32 +184,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		return nil, err
 	}

-	tokens := make(map[int]token, len(t.Model.Vocab))
+	var tokens []token
 	for k, v := range t.Model.Vocab {
-		tokens[v] = token{
+		tokens = append(tokens, token{
 			ID:      v,
 			Content: k,
-		}
+		})
 	}

-	for _, token := range t.AddedTokens {
-		token.UserDefined = true
-		tokens[token.ID] = token
+	for _, t := range t.AddedTokens {
+		t.UserDefined = true
+		tokens = append(tokens, t)
 	}

-	keys := maps.Keys(tokens)
-	slices.Sort(keys)
+	slices.SortFunc(tokens, func(i, j token) int {
+		return cmp.Compare(i.ID, j.ID)
+	})

 	v := Vocabulary{Model: "gpt2"}
-	for _, k := range keys {
-		token := tokens[k]
-		v.Tokens = append(v.Tokens, token.Content)
-		v.Scores = append(v.Scores, float32(token.ID))
+	for _, t := range tokens {
+		v.Tokens = append(v.Tokens, t.Content)
+		v.Scores = append(v.Scores, float32(t.ID))

 		switch {
-		case token.Special:
+		case t.Special:
 			v.Types = append(v.Types, tokenTypeControl)
-		case token.UserDefined:
+		case t.UserDefined:
 			v.Types = append(v.Types, tokenTypeUserDefined)
 		default:
 			v.Types = append(v.Types, tokenTypeNormal)
--- a/convert/tokenizer_spm.go
+++ b/convert/tokenizer_spm.go
@@ -15,11 +15,6 @@ import (
 )

 func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
-	ast, err := parseAdditionalSpecialTokens(fsys)
-	if err != nil {
-		return nil, err
-	}
-
 	bts, err := fs.ReadFile(fsys, "tokenizer.model")
 	if err != nil {
 		return nil, err
@@ -42,12 +37,7 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 			sentencepiece.ModelProto_SentencePiece_BYTE:
 			v.Types = append(v.Types, int32(t))
 		default:
-			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
-			if slices.Contains(ast, piece.GetPiece()) {
-				tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
-			}
-
-			v.Types = append(v.Types, tt)
+			v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL))
 		}
 	}

@@ -91,23 +81,3 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {

 	return &v, nil
 }
-
-func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
-	f, err := fsys.Open("special_tokens_map.json")
-	if errors.Is(err, os.ErrNotExist) {
-		return nil, nil
-	} else if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	var m struct {
-		AdditionalSpecialTokens []string `json:"additional_special_tokens"`
-	}
-
-	if err := json.NewDecoder(f).Decode(&m); err != nil {
-		return nil, err
-	}
-
-	return m.AdditionalSpecialTokens, nil
-}
--- a/docs/api.md
+++ b/docs/api.md
@@ -669,7 +669,7 @@ curl http://localhost:11434/api/chat -d '{

 ```
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.1",
+  "model": "mistral",
  "messages": [
    {
      "role": "user",
@@ -708,7 +708,7 @@ curl http://localhost:11434/api/chat -d '{

 ```json
 {
-  "model": "llama3.1",
+  "model": "mistral:7b-instruct-v0.3-q4_K_M",
  "created_at": "2024-07-22T20:33:28.123648Z",
  "message": {
    "role": "assistant",
@@ -1175,10 +1175,7 @@ curl http://localhost:11434/api/embed -d '{
  "embeddings": [[
    0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
    0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
-  ]],
-  "total_duration": 14143917,
-  "load_duration": 1019500,
-  "prompt_eval_count": 8
+  ]]
 }
 ```

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -111,10 +111,7 @@ On Windows, Ollama inherits your user and system environment variables.

 ## How do I use Ollama behind a proxy?

-Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
-
-> [!NOTE]
-> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
+Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` is configured. When using either variable, ensure it is set where `ollama serve` can access the value. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.

 ### How do I use Ollama behind a proxy in Docker?

--- a/docs/images/ollama-keys.png
+++ b/docs/images/ollama-keys.png
--- a/docs/images/signup.png
+++ b/docs/images/signup.png
--- a/docs/import.md
+++ b/docs/import.md
@@ -1,129 +1,42 @@
-# Importing a model
+# Import

-## Table of Contents
+GGUF models and select Safetensors models can be imported directly into Ollama.

-  * [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights)
-  * [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights)
-  * [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter)
-  * [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom)
+## Import GGUF

-## Importing a fine tuned adapter from Safetensors weights
-
-First, create a `Modelfile` with a `FROM` command pointing at the base model you used for fine tuning, and an `ADAPTER` command which points to the directory with your Safetensors adapter:
-
-```dockerfile
-FROM <base model name>
-ADAPTER /path/to/safetensors/adapter/directory
-```
-
-Make sure that you use the same base model in the `FROM` command as you used to create the adapter otherwise you will get erratic results. Most frameworks use different quantization methods, so it's best to use non-quantized (i.e. non-QLoRA) adapters. If your adapter is in the same directory as your `Modelfile`, use `ADAPTER .` to specify the adapter path.
-
-Now run `ollama create` from the directory where the `Modelfile` was created:
-
-```bash
-ollama create my-model
-```
-
-Lastly, test the model:
-
-```bash
-ollama run my-model
-```
-
-Ollama supports importing adapters based on several different model architectures including:
-
-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
-  * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
-  * Gemma (including Gemma 1 and Gemma 2)
-
-You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
-
-  * Hugging Face [fine tuning framework] (https://huggingface.co/docs/transformers/en/training)
-  * [Unsloth](https://github.com/unslothai/unsloth)
-  * [MLX](https://github.com/ml-explore/mlx)
-
-
-## Importing a model from Safetensors weights
-
-First, create a `Modelfile` with a `FROM` command which points to the directory containing your Safetensors weights:
-
-```dockerfile
-FROM /path/to/safetensors/directory
-```
-
-If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.
-
-Now run the `ollama create` command from the directory where you created the `Modelfile`:
-
-```shell
-ollama create my-model
-```
-
-Lastly, test the model:
-
-```shell
-ollama run my-model
-```
-
-Ollama supports importing models for several different architectures including:
-
-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
-  * Mistral (including Mistral 1, Mistral 2, and Mixtral);
-  * Gemma (including Gemma 1 and Gemma 2); and
-  * Phi3
-
-This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
-
-
-## Importing a GGUF based model or adapter
-
-If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
-
-  * converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp; 
-  * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
-  * downloading a model or adapter from a place such as HuggingFace
-
-To import a GGUF model, create a `Modelfile` containg:
+A binary GGUF file can be imported directly into Ollama through a Modelfile.

 ```dockerfile
 FROM /path/to/file.gguf
 ```

-For a GGUF adapter, create the `Modelfile` with:
+## Import Safetensors
+
+If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile:
+
+ - LlamaForCausalLM
+ - MistralForCausalLM
+ - GemmaForCausalLM

 ```dockerfile
-FROM <model name>
-ADAPTER /path/to/file.gguf
+FROM /path/to/safetensors/directory
 ```

-When importing a GGUF adapter, it's important to use the same base model as the base model that the adapter was created with. You can use:
+For architectures not directly convertible by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf).

- * a model from Ollama
- * a GGUF file
- * a Safetensors based model 
+## Automatic Quantization

-Once you have created your `Modelfile`, use the `ollama create` command to build the model.
+> [!NOTE]
+> Automatic quantization requires v0.1.35 or higher.

-```shell
-ollama create my-model
-```
-
-## Quantizing a Model
-
-Quantizing a model allows you to run models faster and with less memory consumption but at reduced accuracy. This allows you to run a model on more modest hardware.
-
-Ollama can quantize FP16 and FP32 based models into different quantization levels using the `-q/--quantize` flag with the `ollama create` command.
-
-First, create a Modelfile with the FP16 or FP32 based model you wish to quantize.
+Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`.

 ```dockerfile
 FROM /path/to/my/gemma/f16/model
 ```

-Use `ollama create` to then create the quantized model.
-
 ```shell
-$ ollama create --quantize q4_K_M mymodel
+$ ollama create -q Q4_K_M mymodel
 transferring model data
 quantizing F16 model to Q4_K_M
 creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd
@@ -134,53 +47,42 @@ success

 ### Supported Quantizations

- `q4_0`
- `q4_1`
- `q5_0`
- `q5_1`
- `q8_0`
+- `Q4_0`
+- `Q4_1`
+- `Q5_0`
+- `Q5_1`
+- `Q8_0`

 #### K-means Quantizations

- `q3_K_S`
- `q3_K_M`
- `q3_K_L`
- `q4_K_S`
- `q4_K_M`
- `q5_K_S`
- `q5_K_M`
- `q6_K`
+- `Q3_K_S`
+- `Q3_K_M`
+- `Q3_K_L`
+- `Q4_K_S`
+- `Q4_K_M`
+- `Q5_K_S`
+- `Q5_K_M`
+- `Q6_K`

+## Template Detection

-## Sharing your model on ollama.com
+> [!NOTE]
+> Template detection requires v0.1.42 or higher.

-You can share any model you have created by pushing it to [ollama.com](https://ollama.com) so that other users can try it out.
+Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing.

-First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.
-
-![Sign-Up](images/signup.png)
-
-The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.
-
-Now that you have created an account and are signed-in, go to the [Ollama Keys Settings](https://ollama.com/settings/keys) page.
-
-Follow the directions on the page to determine where your Ollama Public Key is located.
-
-![Ollama Key](images/ollama-keys.png)
-
-Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.
-
-To push a model to [ollama.com](https://ollama.com), first make sure that it is named correctly with your username. You may have to use the `ollama cp` command to copy
-your model to give it the correct name. Once you're happy with your model's name, use the `ollama push` command to push it to [ollama.com](https://ollama.com).
-
-```shell
-ollama cp mymodel myuser/mymodel
-ollama push myuser/mymodel
+```dockerfile
+FROM /path/to/my/gemma/model
 ```

-Once your model has been pushed, other users can pull and run it by using the command:
-
 ```shell
-ollama run myuser/mymodel
+$ ollama create mymodel
+transferring model data
+using autodetected template gemma-instruct
+creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
+creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb
+writing manifest
+success
 ```

+Defining a template in the Modelfile will disable this feature, which may be useful if you want to use a different template than the autodetected one.
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -20,12 +20,13 @@ GPU.

 ## Manual install

-### Download `ollama`
+### Download the `ollama` binary

-Download and extract the Linux package:
+Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:

 ```bash
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
+sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
+sudo chmod +x /usr/bin/ollama
 ```

 ### Adding Ollama as a startup service (recommended)
@@ -95,7 +96,8 @@ curl -fsSL https://ollama.com/install.sh | sh
 Or by downloading the ollama binary:

 ```bash
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
+sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
+sudo chmod +x /usr/bin/ollama
 ```

 ## Installing specific versions
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -182,6 +182,7 @@ curl http://localhost:11434/v1/embeddings \
 - [x] Reproducible outputs
 - [x] Vision
 - [x] Tools (streaming support coming soon)
+- [ ] Vision
 - [ ] Logprobs

 #### Supported request fields
--- a/docs/template.md
+++ b/docs/template.md
@@ -112,9 +112,15 @@ Keep the following tips and best practices in mind when working with Go template
 ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2.

 ```gotmpl
+{{- if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}
 {{- range .Messages }}<|im_start|>{{ .Role }}
 {{ .Content }}<|im_end|>
 {{ end }}<|im_start|>assistant
+{{ else }}
+{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
 ```

 ### Example Tools
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -174,7 +174,7 @@ func RunnersDir() (p string) {

 	defer func() {
 		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
+			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
 		}
 	}()

@@ -190,17 +190,17 @@ func RunnersDir() (p string) {
 	}

 	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
+	for _, root := range []string{filepath.Dir(exe), cwd} {
 		paths = append(paths,
 			root,
-			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "windows-"+runtime.GOARCH),
+			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
 		)
 	}

 	// Try a few variations to improve developer experience when building from source in the local tree
 	for _, path := range paths {
-		candidate := filepath.Join(path, "lib", "ollama", "runners")
+		candidate := filepath.Join(path, "ollama_runners")
 		if _, err := os.Stat(candidate); err == nil {
 			p = candidate
 			break
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/ollama/ollama

-go 1.22.5
+go 1.22.0

 require (
 	github.com/containerd/console v1.0.3
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
+	rocmTargetDir := filepath.Join(appDir, "rocm")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -49,9 +49,13 @@ func PayloadsDir() (string, error) {
 		}

 		// Track our pid so we can clean up orphaned tmpdirs
-		n := filepath.Join(tmpDir, "ollama.pid")
-		if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
-			return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
+		pidFilePath := filepath.Join(tmpDir, "ollama.pid")
+		pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
+		if err != nil {
+			return "", err
+		}
+		if _, err := pidFile.Write([]byte(strconv.Itoa(os.Getpid()))); err != nil {
+			return "", err
 		}

 		// We create a distinct subdirectory for payloads within the tmpdir
@@ -63,44 +67,37 @@ func PayloadsDir() (string, error) {

 // Best effort to clean up prior tmpdirs
 func cleanupTmpDirs() {
-	matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
+	dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*"))
 	if err != nil {
 		return
 	}
-
-	for _, match := range matches {
-		raw, err := os.ReadFile(match)
-		if errors.Is(err, os.ErrNotExist) {
-			slog.Debug("not a ollama runtime directory, skipping", "path", match)
+	for _, d := range dirs {
+		info, err := os.Stat(d)
+		if err != nil || !info.IsDir() {
 			continue
-		} else if err != nil {
-			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
+		}
+		raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
+		if err != nil {
+			slog.Warn("failed to read ollama.pid", "path", d, "error", err)
+			// No pid, ignore this tmpdir
 			continue
 		}

 		pid, err := strconv.Atoi(string(raw))
 		if err != nil {
-			slog.Warn("invalid pid, skipping", "path", match, "error", err)
+			slog.Warn("failed to parse pid", "path", d, "error", err)
 			continue
 		}

-		p, err := os.FindProcess(pid)
-		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-			slog.Warn("process still running, skipping", "pid", pid, "path", match)
+		proc, err := os.FindProcess(pid)
+		if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
+			slog.Warn("found running ollama", "pid", pid, "path", d)
+			// Another running ollama, ignore this tmpdir
 			continue
 		}

-		if err := os.Remove(match); err != nil {
-			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
-		}
-
-		runners := filepath.Join(filepath.Dir(match), "runners")
-		if err := os.RemoveAll(runners); err != nil {
-			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
-		}
-
-		if err := os.Remove(filepath.Dir(match)); err != nil {
-			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
+		if err := os.Remove(d); err != nil {
+			slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
 		}
 	}
 }
--- a/gpu/cuda_common.go
+++ b/gpu/cuda_common.go
@@ -4,17 +4,9 @@ package gpu

 import (
 	"log/slog"
-	"os"
-	"regexp"
-	"runtime"
-	"strconv"
 	"strings"
 )

-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
 func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	ids := []string{}
 	for _, info := range gpuInfo {
@@ -27,38 +19,3 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	}
 	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
 }
-
-func cudaVariant(gpuInfo CudaGPUInfo) string {
-	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
-		if CudaTegra != "" {
-			ver := strings.Split(CudaTegra, ".")
-			if len(ver) > 0 {
-				return "jetpack" + ver[0]
-			}
-		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
-			r := regexp.MustCompile(` R(\d+) `)
-			m := r.FindSubmatch(data)
-			if len(m) != 2 {
-				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
-			} else {
-				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
-					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
-					// https://developer.nvidia.com/embedded/jetpack-archive
-					switch l4t {
-					case 35:
-						return "jetpack5"
-					case 36:
-						return "jetpack6"
-					default:
-						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
-					}
-				}
-			}
-		}
-	}
-
-	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
-		return "v11"
-	}
-	return "v12"
-}
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -64,6 +64,10 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@@ -211,7 +215,7 @@ func GetGPUInfo() GpuInfoList {
 				GpuInfo: GpuInfo{
 					memInfo: mem,
 					Library: "cpu",
-					Variant: cpuCapability.String(),
+					Variant: cpuCapability,
 					ID:      "0",
 				},
 			},
@@ -225,7 +229,11 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}

-		depPath := LibraryDir()
+		// On windows we bundle the nvidia library one level above the runner dir
+		depPath := ""
+		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
+			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
+		}

 		// Load ALL libraries
 		cHandles = initCudaHandles()
@@ -261,23 +269,11 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-				gpuInfo.computeMajor = int(memInfo.major)
-				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
+				gpuInfo.DependencyPath = depPath
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
-				variant := cudaVariant(gpuInfo)
-				if depPath != "" {
-					gpuInfo.DependencyPath = depPath
-					// Check for variant specific directory
-					if variant != "" {
-						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
-						}
-					}
-				}
-				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.Variant = variant

 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
@@ -309,34 +305,38 @@ func GetGPUInfo() GpuInfoList {
 		// Intel
 		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
-			if oHandles != nil && oHandles.oneapi != nil {
-				for d := range oHandles.oneapi.num_drivers {
-					if oHandles.oneapi == nil {
-						// shouldn't happen
-						slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
-						continue
-					}
-					devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
-					for i := range devCount {
-						gpuInfo := OneapiGPUInfo{
-							GpuInfo: GpuInfo{
-								Library: "oneapi",
-							},
-							driverIndex: int(d),
-							gpuIndex:    int(i),
-						}
-						// TODO - split bootstrapping from updating free memory
-						C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
-						// TODO - convert this to MinimumMemory based on testing...
-						var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
-						memInfo.free = C.uint64_t(totalFreeMem)
-						gpuInfo.TotalMemory = uint64(memInfo.total)
-						gpuInfo.FreeMemory = uint64(memInfo.free)
-						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-						gpuInfo.DependencyPath = depPath
-						oneapiGPUs = append(oneapiGPUs, gpuInfo)
+			// On windows we bundle the oneapi library one level above the runner dir
+			depPath = ""
+			if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
+				depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
+			}
+
+			// The oneapi handle may be nil (e.g. library failed to load); only read the driver count once that is ruled out
+			numDrivers := 0
+			if oHandles != nil && oHandles.oneapi != nil {
+				numDrivers = int(oHandles.oneapi.num_drivers)
+			}
+			for d := range numDrivers {
+				devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
+				for i := range devCount {
+					gpuInfo := OneapiGPUInfo{
+						GpuInfo: GpuInfo{
+							Library: "oneapi",
+						},
+						driverIndex: int(d),
+						gpuIndex:    int(i),
 					}
+					// TODO - split bootstrapping from updating free memory
+					C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
+					// TODO - convert this to MinimumMemory based on testing...
+					var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+					memInfo.free = C.uint64_t(totalFreeMem)
+					gpuInfo.TotalMemory = uint64(memInfo.total)
+					gpuInfo.FreeMemory = uint64(memInfo.free)
+					gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+					gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+					gpuInfo.DependencyPath = depPath
+					oneapiGPUs = append(oneapiGPUs, gpuInfo)
 				}
 			}
 		}
@@ -464,12 +464,10 @@ func GetGPUInfo() GpuInfoList {
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
+	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)

-	// Start with our bundled libraries
-	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
-
 	switch runtime.GOOS {
 	case "windows":
 		ldPaths = strings.Split(os.Getenv("PATH"), ";")
@@ -478,14 +476,13 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	default:
 		return gpuLibPaths
 	}
-
-	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
+	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
 	for _, ldPath := range ldPaths {
 		d, err := filepath.Abs(ldPath)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName))
+		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
 	}
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
@@ -641,31 +638,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
-
-func LibraryDir() string {
-	// On Windows/linux we bundle the dependencies at the same level as the executable
-	appExe, err := os.Executable()
-	if err != nil {
-		slog.Warn("failed to lookup executable path", "error", err)
-	}
-	cwd, err := os.Getwd()
-	if err != nil {
-		slog.Warn("failed to lookup working directory", "error", err)
-	}
-	// Scan for any of our dependeices, and pick first match
-	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} {
-		libDep := filepath.Join("lib", "ollama")
-		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
-			return filepath.Join(root, libDep)
-		}
-		// Developer mode, local build
-		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
-			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
-		}
-		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
-			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
-		}
-	}
-	slog.Warn("unable to locate gpu dependency libraries")
-	return ""
-}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUCapability().String(),
+				Variant: GetCPUCapability(),
 				memInfo: mem,
 			},
 		}
@@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability().String(),
+			Variant: GetCPUCapability(),
 			memInfo: mem,
 		},
 	}
--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@@ -47,7 +47,7 @@ var (
 	CudartMgmtName = "libcudart.so*"
 	NvcudaMgmtName = "libcuda.so*"
 	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so*"
+	OneapiMgmtName = "libze_intel_gpu.so"
 )

 func GetCPUMem() (memInfo, error) {
--- a/gpu/gpu_test.go
+++ b/gpu/gpu_test.go
@@ -32,29 +32,4 @@ func TestCPUMemInfo(t *testing.T) {
 	}
 }

-func TestByLibrary(t *testing.T) {
-	type testCase struct {
-		input  []GpuInfo
-		expect int
-	}
-
-	testCases := map[string]*testCase{
-		"empty":                    {input: []GpuInfo{}, expect: 0},
-		"cpu":                      {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
-		"cpu + GPU":                {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
-		"cpu + 2 GPU no variant":   {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
-		"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
-		"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
-	}
-
-	for k, v := range testCases {
-		t.Run(k, func(t *testing.T) {
-			resp := (GpuInfoList)(v.input).ByLibrary()
-			if len(resp) != v.expect {
-				t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
-			}
-		})
-	}
-}
-
 // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -19,7 +19,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`

 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant string `json:"variant"`
+	Variant CPUCapability `json:"variant"`

 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -53,10 +53,8 @@ type CPUInfo struct {

 type CudaGPUInfo struct {
 	GpuInfo
-	OSOverhead   uint64 // Memory overhead between the driver library and management library
-	index        int    //nolint:unused,nolintlint
-	computeMajor int    //nolint:unused,nolintlint
-	computeMinor int    //nolint:unused,nolintlint
+	OSOverhead uint64 // Memory overhead between the driver library and management library
+	index      int    //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo

@@ -83,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != CPUCapabilityNone.String() {
-			requested += "_" + info.Variant
+		if info.Variant != CPUCapabilityNone {
+			requested += "_" + info.Variant.String()
 		}
 		for i, lib := range libs {
 			if lib == requested {
@@ -94,7 +92,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 			}
 		}
 		if !found {
-			libs = append(libs, requested)
+			libs = append(libs, requested)
 			resp = append(resp, []GpuInfo{info})
 		}
 	}
@@ -107,7 +105,6 @@ func (l GpuInfoList) LogDetails() {
 		slog.Info("inference compute",
 			"id", g.ID,
 			"library", g.Library,
-			"variant", g.Variant,
 			"compute", g.Compute,
 			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
 			"name", g.Name,
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}

-	if res.PromptEvalCount != 6 {
-		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 8 {
+		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}

-	if res.PromptEvalCount != 12 {
-		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 16 {
+		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -1,13 +1,12 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1223,7 +1223,9 @@ struct llama_server_context

                res.result_json = json
                {
+                    {"id", res.id},
                    {"embedding", std::vector<float>(embd, embd + n_embd)},
+                    {"timings",             slot.get_formated_timings()},
                };
            }
        }
@@ -1429,13 +1431,7 @@ struct llama_server_context
        switch (task.type)
        {
            case TASK_TYPE_COMPLETION: {
-                server_slot *slot = nullptr;
-                if (task.embedding_mode) {
-                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
-                    slot = slots[0].available() ? &slots[0] : nullptr;
-                } else {
-                    slot = prefix_slot(task.data["prompt"]);
-                }
+                server_slot *slot = prefix_slot(task.data["prompt"]);
                if (slot == nullptr)
                {
                    // if no slot is available, we defer this task for processing later
@@ -3198,17 +3194,41 @@ int main(int argc, char **argv) {
                    prompt = "";
                }

+                if (prompt.size() == 1) {
+                    prompt = prompt[0];
+                }
+
                // create and queue the task
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, {{"prompt", prompt}}, true, -1);
+                json responses;
+                {
+                    const int id_task = llama.queue_tasks.get_new_id();
+                    llama.queue_results.add_waiting_task_id(id_task);
+                    llama.request_completion(id_task, {{"prompt", prompt}}, true, -1);

-                // get the result
-                task_result result = llama.queue_results.recv(task_id);
-                llama.queue_results.remove_waiting_task_id(task_id);
+                    // get the result
+                    task_result result = llama.queue_results.recv(id_task);
+                    llama.queue_results.remove_waiting_task_id(id_task);
+                    if (result.error) {
+                        return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
+                    }

-                // send the result
-                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
+                    responses = result.result_json.value("results", std::vector<json>{result.result_json});
+                    std::sort(responses.begin(), responses.end(), [](const json& a, const json& b) {
+                        return a["id"] < b["id"];
+                    });
+
+                    json embeddings = json::array();
+
+                    int prompt_n = 0;
+                    for (auto & elem : responses) {
+                        embeddings.push_back(elem.at("embedding"));
+                        prompt_n += elem.at("timings").at("prompt_n").get<int>();
+                    }
+
+                    // send the result
+                    json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
+                    return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
+                }
            });

    // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -9,14 +9,11 @@ init_vars() {
        ARCH="arm64"
        ;;
    *)
-        echo "GOARCH must be set"
-        echo "this script is meant to be run from within go generate"
-        exit 1
-        ;;
+        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
    esac

    LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
+    CMAKE_DEFS=""
    CMAKE_TARGETS="--target ollama_llama_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@@ -30,7 +27,6 @@ init_vars() {
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
-        DIST_BASE=../../dist/darwin-${GOARCH}/
        ;;
    "Linux")
        LIB_EXT="so"
@@ -39,7 +35,6 @@ init_vars() {

        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
-        DIST_BASE=../../dist/linux-${GOARCH}/
        ;;
    *)
        ;;
@@ -47,7 +42,6 @@ init_vars() {
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
-    GZIP=$(which pigz 2>/dev/null || echo "gzip")
 }

 git_module_setup() {
@@ -91,36 +85,26 @@ build() {

 compress() {
    echo "Compressing payloads to reduce overall binary size..."
+    pids=""
    rm -rf ${BUILD_DIR}/bin/*.gz
    for f in ${BUILD_DIR}/bin/* ; do
-        ${GZIP} -n --best -f ${f} &
-        compress_pids+=" $!"
+        gzip -n --best -f ${f} &
+        pids+=" $!"
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
-            ${GZIP} -n --best -f ${f} &
-            compress_pids+=" $!"
+            gzip -n --best -f ${f} &
+            pids+=" $!"
        done
    fi
    echo
-}
-
-wait_for_compress() {
-    for pid in ${compress_pids}; do
+    for pid in ${pids}; do
        wait $pid
    done
    echo "Finished compression"
 }

-install() {
-    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
-    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
-        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
-        cp -af "${lib}" "${BUILD_DIR}/bin/"
-    done
-}
-
 # Keep the local tree clean after we're done with the build
 cleanup() {
    (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -6,7 +6,6 @@

 set -ex
 set -o pipefail
-compress_pids=""
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
@@ -99,5 +98,4 @@ case "${GOARCH}" in
 esac

 cleanup
-wait_for_compress
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -13,7 +13,6 @@

 set -ex
 set -o pipefail
-compress_pids=""

 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
@@ -52,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
        export CUDACXX=$(command -v nvcc)
    fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -78,11 +77,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        BUILD_DIR="../build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
-        install
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -95,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake

-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -105,7 +103,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            BUILD_DIR="../build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
-            install
            compress
        fi

@@ -123,7 +120,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
                echo "Building AVX CPU"
                build
-                install
                compress
            fi

@@ -137,7 +133,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
                echo "Building AVX2 CPU"
                build
-                install
                compress
            fi
        fi
@@ -165,7 +160,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
-    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
+    if [ -n "${CUDA_MAJOR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    if [ "${ARCH}" == "arm64" ]; then
@@ -183,19 +178,29 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    fi
-    export CUDAFLAGS="-t8"
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
-    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
-    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
+    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build
-    install
-    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
-    mkdir -p "${CUDA_DIST_DIR}"
-    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
-        cp -a "${lib}" "${CUDA_DIST_DIR}"
+
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
+    #
+    # TODO - in the future we may shift to packaging these separately and conditionally
+    #        downloading them in the install script.
+    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
+    for lib in libcudart.so libcublas.so libcublasLt.so ; do
+        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
+        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
+            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
+        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
+            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
+        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
+            cp -d "${CUDART_LIB_DIR}/${lib}"* "${BUILD_DIR}/bin/"
+        else
+            cp -d "${CUDA_LIB_DIR}/${lib}"* "${BUILD_DIR}/bin/"
+        fi
    done
    compress

@@ -213,24 +218,21 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
    BUILD_DIR="../build/linux/${ARCH}/oneapi"
-    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
-    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
+    EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
    build

    # copy oneAPI dependencies
-    mkdir -p "${ONEAPI_DIST_DIR}"
    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
-        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
+        cp "${dep}" "${BUILD_DIR}/bin/"
    done
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
-    install
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
    compress
 fi

@@ -252,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
    init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
@@ -260,22 +262,23 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        echo "Building custom ROCM GPU"
    fi
    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    # ROCm dependencies are too large to fit into a unified bundle
-    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
-    # TODO figure out how to disable runpath (rpath)
-    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
-    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

-    # copy the ROCM dependencies
-    mkdir -p "${ROCM_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
-        cp -a "${dep}"* "${ROCM_DIST_DIR}"
+    # Record the ROCM dependencies
+    rm -f "${BUILD_DIR}/bin/deps.txt"
+    touch "${BUILD_DIR}/bin/deps.txt"
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
+        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
    done
-    install
+    # bomb out if for some reason we didn't get a few deps
+    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
+        cat "${BUILD_DIR}/bin/deps.txt"
+        echo "ERROR: deps file short"
+        exit 1
+    fi
    compress
 fi

 cleanup
-wait_for_compress
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -35,7 +35,7 @@ function init_vars {
        )
    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
+    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
    md "$script:DIST_BASE" -ea 0 > $null
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -117,7 +117,7 @@ function build {
    if ($cmakeDefs -contains "-G") {
        $extra=@("-j8")
    } else {
-        $extra= @("--", "/maxCpuCount:8")
+        $extra= @("--", "/p:CL_MPcount=8")
    }
    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
@@ -261,7 +261,7 @@ function build_cuda() {
    if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
        # Then build cuda as a dynamically loaded library
        $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-        $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
+        $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
        if ($null -ne $script:CUDA_VERSION) {
            $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
        }
@@ -273,9 +273,9 @@ function build_cuda() {
            "-DGGML_CUDA=ON",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
-            "-DCMAKE_CUDA_FLAGS=-t6",
-            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
-            "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
+            "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
+            "-DCMAKE_CUDA_FLAGS=-t8",
+            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
            )
        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
@@ -286,11 +286,12 @@ function build_cuda() {
        sign
        install

-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
+        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
    } else {
        write-host "Skipping CUDA generation step"
    }
@@ -324,17 +325,18 @@ function build_oneapi() {
    sign
    install

-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
  } else {
    Write-Host "Skipping oneAPI generation step"
  }
@@ -355,7 +357,7 @@ function build_rocm() {
            "-DCMAKE_C_COMPILER=clang.exe",
            "-DCMAKE_CXX_COMPILER=clang++.exe",
            "-DGGML_HIPBLAS=on",
-            "-DGGML_CUDA_NO_PEER_COPY=on",
+            "-DLLAMA_CUDA_NO_PEER_COPY=on",
            "-DHIP_PLATFORM=amd",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
@@ -384,11 +386,12 @@ function build_rocm() {
        sign
        install

-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
-        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
+        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
+        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
+        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
    } else {
        write-host "Skipping ROCm generation step"
    }
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -43,14 +43,6 @@ func (kv KV) Architecture() string {
 	return "unknown"
 }

-func (kv KV) Kind() string {
-	if s, ok := kv["general.type"].(string); ok {
-		return s
-	}
-
-	return "unknown"
-}
-
 func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }
@@ -165,14 +157,6 @@ type Tensor struct {
 	io.WriterTo `json:"-"`
 }

-func (t Tensor) block() (n int) {
-	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
-		return -1
-	}
-
-	return
-}
-
 func (t Tensor) blockSize() uint64 {
 	switch t.Kind {
 	case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -532,14 +532,15 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 		}
 	}

-	slices.SortStableFunc(ts, func(a, b Tensor) int {
-		if i, j := a.block(), b.block(); i < 0 && j > 0 {
-			return 1
-		} else if i > 0 && j < 0 {
-			return -1
-		} else {
-			return cmp.Compare(i, j)
+	slices.SortFunc(ts, func(a, b Tensor) int {
+		var i, j int
+		if n, err := fmt.Sscanf(a.Name, "blk.%d", &i); err != nil || n != 1 {
+			return cmp.Compare(a.Name, b.Name)
+		} else if n, err := fmt.Sscanf(b.Name, "blk.%d", &j); err != nil || n != 1 {
+			return cmp.Compare(a.Name, b.Name)
 		}
+
+		return cmp.Compare(i, j)
 	})

 	var s uint64
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -33,6 +33,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	assert.Len(t, tensors, inputLayerCount+1)
 	err = WriteGGUF(f, KV{
 		"general.architecture":          "llama",
+		"general.name":                  "name",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.block_count":             uint32(inputLayerCount),
--- a/llm/patches/08-pooling.diff
+++ b/llm/patches/08-pooling.diff
@@ -0,0 +1,60 @@
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 721b8f4e..cfe7ac40 100644
+--- a/src/llama.cpp
+++ b/src/llama.cpp
+@@ -8420,14 +8420,14 @@ struct llm_build_context {
+     }
+ 
+     struct ggml_tensor * build_inp_mean() {
+-        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
+         cb(lctx.inp_mean, "inp_mean", -1);
+         ggml_set_input(lctx.inp_mean);
+         return lctx.inp_mean;
+     }
+ 
+     struct ggml_tensor * build_inp_cls() {
+-        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
+         cb(lctx.inp_cls, "inp_cls", -1);
+         ggml_set_input(lctx.inp_cls);
+         return lctx.inp_cls;
+@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+ 
+         float * data = (float *) lctx.inp_mean->data;
+-        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+        memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
+ 
+         std::vector<uint64_t> sum(n_tokens, 0);
+         for (int i = 0; i < n_tokens; ++i) {
+             const llama_seq_id seq_id = batch.seq_id[i][0];
+-
+-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
+-
+             sum[seq_id] += 1;
+         }
+ 
+-        std::vector<float> div(n_tokens, 0.0f);
+-        for (int i = 0; i < n_tokens; ++i) {
+        std::vector<float> div(cparams.n_seq_max, 0.0f);
+        for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
+             const uint64_t s = sum[i];
+             if (s > 0) {
+                 div[i] = 1.0f/float(s);
+@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+ 
+         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+-        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+        memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
+ 
+         for (int i = 0; i < n_tokens; ++i) {
+             const llama_seq_id seq_id = batch.seq_id[i][0];
+             const llama_pos    pos    = batch.pos[i];
+-
+-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
+-
+             if (pos == 0) {
+                 data[seq_id] = i;
+             }
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := getAvailableServers()
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone.String() {
-		requested += "_" + info.Variant
+	if info.Variant != gpu.CPUCapabilityNone {
+		requested += "_" + info.Variant.String()
 	}

 	servers := []string{}
--- a/llm/server.go
+++ b/llm/server.go
@@ -33,7 +33,7 @@ type LlamaServer interface {
 	Ping(ctx context.Context) error
 	WaitUntilRunning(ctx context.Context) error
 	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-	Embedding(ctx context.Context, input string) ([]float32, error)
+	Embed(ctx context.Context, input []string) (*EmbedResponse, error)
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
@@ -125,9 +125,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 	}

-	// On linux and windows, over-allocating CPU memory will almost always result in an error
-	// Darwin has fully dynamic swap so has no direct concept of free swap space
-	if runtime.GOOS != "darwin" {
+	// On linux, over-allocating CPU memory will almost always result in an error
+	if runtime.GOOS == "linux" {
 		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
 		available := systemFreeMemory + systemSwapFreeMemory
 		if systemMemoryRequired > available {
@@ -258,7 +257,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mlock")
 	}

-	if gpu.IsNUMA() && gpus[0].Library == "cpu" {
+	if gpu.IsNUMA() {
 		numaMode := "distribute"
 		if runtime.GOOS == "linux" {
 			if _, err := exec.LookPath("numactl"); err == nil {
@@ -306,18 +305,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if runtime.GOOS == "windows" {
 			pathEnv = "PATH"
 		}
-		// Start with the server directory for the LD_LIBRARY_PATH/PATH
-		libraryPaths := []string{dir}
+		// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
+		libraryPaths := []string{dir, filepath.Dir(dir)}

 		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-			// favor our bundled library dependencies over system libraries
+			// Append our runner directory to the path
+			// This will favor system libraries over our bundled library dependencies
 			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
 		}

 		// Note: we always put the dependency path first
-		// since this was the exact version we compiled/linked against
+		// since this was the exact version we verified for AMD GPUs
+		// and we favor what the user had in their path
 		if gpus[0].DependencyPath != "" {
-			// assume gpus from the same library have the same dependency path
+			// TODO refine for multi-gpu support
 			libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
 		}

@@ -881,20 +882,24 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	return nil
 }

-type EmbeddingRequest struct {
-	Content string `json:"content"`
+type EmbedRequest struct {
+	Content []string `json:"content"`
 }

-type EmbeddingResponse struct {
-	Embedding []float32 `json:"embedding"`
+type EmbedResponse struct {
+	Embedding       [][]float32 `json:"embedding"`
+	PromptEvalCount int         `json:"prompt_n"`
 }

-func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
-	if err := s.sem.Acquire(ctx, 1); err != nil {
+func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
+	// each input will use a slot, so we need to acquire the semaphore for
+	// the number of inputs up to numParallel
+	slots := int64(min(len(input), s.numParallel))
+	if err := s.sem.Acquire(ctx, slots); err != nil {
 		slog.Error("Failed to acquire semaphore", "error", err)
 		return nil, err
 	}
-	defer s.sem.Release(1)
+	defer s.sem.Release(slots)

 	// Make sure the server is ready
 	status, err := s.getServerStatusRetry(ctx)
@@ -904,18 +909,18 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
 		return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
 	}

-	data, err := json.Marshal(EmbeddingRequest{Content: input})
+	data, err := json.Marshal(EmbedRequest{Content: input})
 	if err != nil {
 		return nil, fmt.Errorf("error marshaling embed data: %w", err)
 	}

-	r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
 	if err != nil {
 		return nil, fmt.Errorf("error creating embed request: %w", err)
 	}
-	r.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Content-Type", "application/json")

-	resp, err := http.DefaultClient.Do(r)
+	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, fmt.Errorf("do embedding request: %w", err)
 	}
@@ -931,12 +936,12 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
 		return nil, fmt.Errorf("%s", body)
 	}

-	var e EmbeddingResponse
+	var e EmbedResponse
 	if err := json.Unmarshal(body, &e); err != nil {
 		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
 	}

-	return e.Embedding, nil
+	return &e, nil
 }

 type TokenizeRequest struct {
--- a/llm/status.go
+++ b/llm/status.go
@@ -26,7 +26,6 @@ var errorPrefixes = []string{
 	"cudaMalloc failed",
 	"\"ERR\"",
 	"error loading model",
-	"GGML_ASSERT",
 }

 func (w *StatusWriter) Write(b []byte) (int, error) {
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -7,22 +7,27 @@ import (
 	"io"
 	"net/http"
 	"net/http/httptest"
-	"reflect"
 	"strings"
 	"testing"
 	"time"

 	"github.com/gin-gonic/gin"
+	"github.com/stretchr/testify/assert"

 	"github.com/ollama/ollama/api"
 )

 const (
-	prefix = `data:image/jpeg;base64,`
-	image  = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
+	prefix   = `data:image/jpeg;base64,`
+	image    = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
+	imageURL = prefix + image
 )

-var False = false
+func prepareRequest(req *http.Request, body any) {
+	bodyBytes, _ := json.Marshal(body)
+	req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
+	req.Header.Set("Content-Type", "application/json")
+}

 func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {
 	return func(c *gin.Context) {
@@ -38,136 +43,134 @@ func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {

 func TestChatMiddleware(t *testing.T) {
 	type testCase struct {
-		name string
-		body string
-		req  api.ChatRequest
-		err  ErrorResponse
+		Name     string
+		Setup    func(t *testing.T, req *http.Request)
+		Expected func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder)
 	}

 	var capturedRequest *api.ChatRequest

 	testCases := []testCase{
 		{
-			name: "chat handler",
-			body: `{
-				"model": "test-model",
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{
-						Role:    "user",
-						Content: "Hello",
-					},
-				},
-				Options: map[string]any{
-					"temperature": 1.0,
-					"top_p":       1.0,
-				},
-				Stream: &False,
+			Name: "chat handler",
+			Setup: func(t *testing.T, req *http.Request) {
+				body := ChatCompletionRequest{
+					Model:    "test-model",
+					Messages: []Message{{Role: "user", Content: "Hello"}},
+				}
+				prepareRequest(req, body)
 			},
-		},
-		{
-			name: "chat handler with image content",
-			body: `{
-				"model": "test-model",
-				"messages": [
-					{
-						"role": "user",
-						"content": [
-							{
-								"type": "text",
-								"text": "Hello"
-							},
-							{
-								"type": "image_url",
-								"image_url": {
-									"url": "` + prefix + image + `"
-								}
-							}
-						]
-					}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{
-						Role:    "user",
-						Content: "Hello",
-					},
-					{
-						Role: "user",
-						Images: []api.ImageData{
-							func() []byte {
-								img, _ := base64.StdEncoding.DecodeString(image)
-								return img
-							}(),
-						},
-					},
-				},
-				Options: map[string]any{
-					"temperature": 1.0,
-					"top_p":       1.0,
-				},
-				Stream: &False,
-			},
-		},
-		{
-			name: "chat handler with tools",
-			body: `{
-				"model": "test-model",
-				"messages": [
-					{"role": "user", "content": "What's the weather like in Paris Today?"},
-					{"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{
-						Role:    "user",
-						Content: "What's the weather like in Paris Today?",
-					},
-					{
-						Role: "assistant",
-						ToolCalls: []api.ToolCall{
-							{
-								Function: api.ToolCallFunction{
-									Name: "get_current_weather",
-									Arguments: map[string]interface{}{
-										"location": "Paris, France",
-										"format":   "celsius",
-									},
-								},
-							},
-						},
-					},
-				},
-				Options: map[string]any{
-					"temperature": 1.0,
-					"top_p":       1.0,
-				},
-				Stream: &False,
-			},
-		},
+			Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
+				if resp.Code != http.StatusOK {
+					t.Fatalf("expected 200, got %d", resp.Code)
+				}

+				if req.Messages[0].Role != "user" {
+					t.Fatalf("expected 'user', got %s", req.Messages[0].Role)
+				}
+
+				if req.Messages[0].Content != "Hello" {
+					t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content)
+				}
+			},
+		},
 		{
-			name: "chat handler error forwarding",
-			body: `{
-				"model": "test-model",
-				"messages": [
-					{"role": "user", "content": 2}
-				]
-			}`,
-			err: ErrorResponse{
-				Error: Error{
-					Message: "invalid message content type: float64",
-					Type:    "invalid_request_error",
-				},
+			Name: "chat handler with image content",
+			Setup: func(t *testing.T, req *http.Request) {
+				body := ChatCompletionRequest{
+					Model: "test-model",
+					Messages: []Message{
+						{
+							Role: "user", Content: []map[string]any{
+								{"type": "text", "text": "Hello"},
+								{"type": "image_url", "image_url": map[string]string{"url": imageURL}},
+							},
+						},
+					},
+				}
+				prepareRequest(req, body)
+			},
+			Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
+				if resp.Code != http.StatusOK {
+					t.Fatalf("expected 200, got %d", resp.Code)
+				}
+
+				if req.Messages[0].Role != "user" {
+					t.Fatalf("expected 'user', got %s", req.Messages[0].Role)
+				}
+
+				if req.Messages[0].Content != "Hello" {
+					t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content)
+				}
+
+				img, _ := base64.StdEncoding.DecodeString(imageURL[len(prefix):])
+
+				if req.Messages[1].Role != "user" {
+					t.Fatalf("expected 'user', got %s", req.Messages[1].Role)
+				}
+
+				if !bytes.Equal(req.Messages[1].Images[0], img) {
+					t.Fatalf("expected image encoding, got %s", req.Messages[1].Images[0])
+				}
+			},
+		},
+		{
+			Name: "chat handler with tools",
+			Setup: func(t *testing.T, req *http.Request) {
+				body := ChatCompletionRequest{
+					Model: "test-model",
+					Messages: []Message{
+						{Role: "user", Content: "What's the weather like in Paris Today?"},
+						{Role: "assistant", ToolCalls: []ToolCall{{
+							ID:   "id",
+							Type: "function",
+							Function: struct {
+								Name      string `json:"name"`
+								Arguments string `json:"arguments"`
+							}{
+								Name:      "get_current_weather",
+								Arguments: "{\"location\": \"Paris, France\", \"format\": \"celsius\"}",
+							},
+						}}},
+					},
+				}
+				prepareRequest(req, body)
+			},
+			Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
+				if resp.Code != 200 {
+					t.Fatalf("expected 200, got %d", resp.Code)
+				}
+
+				if req.Messages[0].Content != "What's the weather like in Paris Today?" {
+					t.Fatalf("expected What's the weather like in Paris Today?, got %s", req.Messages[0].Content)
+				}
+
+				if req.Messages[1].ToolCalls[0].Function.Arguments["location"] != "Paris, France" {
+					t.Fatalf("expected 'Paris, France', got %v", req.Messages[1].ToolCalls[0].Function.Arguments["location"])
+				}
+
+				if req.Messages[1].ToolCalls[0].Function.Arguments["format"] != "celsius" {
+					t.Fatalf("expected celsius, got %v", req.Messages[1].ToolCalls[0].Function.Arguments["format"])
+				}
+			},
+		},
+		{
+			Name: "chat handler error forwarding",
+			Setup: func(t *testing.T, req *http.Request) {
+				body := ChatCompletionRequest{
+					Model:    "test-model",
+					Messages: []Message{{Role: "user", Content: 2}},
+				}
+				prepareRequest(req, body)
+			},
+			Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
+				if resp.Code != http.StatusBadRequest {
+					t.Fatalf("expected 400, got %d", resp.Code)
+				}
+
+				if !strings.Contains(resp.Body.String(), "invalid message content type") {
+					t.Fatalf("error was not forwarded")
+				}
 			},
 		},
 	}
@@ -182,26 +185,16 @@ func TestChatMiddleware(t *testing.T) {
 	router.Handle(http.MethodPost, "/api/chat", endpoint)

 	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(tc.body))
-			req.Header.Set("Content-Type", "application/json")
+		t.Run(tc.Name, func(t *testing.T) {
+			req, _ := http.NewRequest(http.MethodPost, "/api/chat", nil)
+
+			tc.Setup(t, req)

 			resp := httptest.NewRecorder()
 			router.ServeHTTP(resp, req)

-			var errResp ErrorResponse
-			if resp.Code != http.StatusOK {
-				if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-					t.Fatal(err)
-				}
-			}
-			if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
-				t.Fatal("requests did not match")
-			}
+			tc.Expected(t, capturedRequest, resp)

-			if !reflect.DeepEqual(tc.err, errResp) {
-				t.Fatal("errors did not match")
-			}
 			capturedRequest = nil
 		})
 	}
@@ -209,52 +202,71 @@ func TestChatMiddleware(t *testing.T) {

 func TestCompletionsMiddleware(t *testing.T) {
 	type testCase struct {
-		name string
-		body string
-		req  api.GenerateRequest
-		err  ErrorResponse
+		Name     string
+		Setup    func(t *testing.T, req *http.Request)
+		Expected func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder)
 	}

 	var capturedRequest *api.GenerateRequest

 	testCases := []testCase{
 		{
-			name: "completions handler",
-			body: `{
-				"model": "test-model",
-				"prompt": "Hello",
-				"temperature": 0.8,
-				"stop": ["\n", "stop"],
-				"suffix": "suffix"
-			}`,
-			req: api.GenerateRequest{
-				Model:  "test-model",
-				Prompt: "Hello",
-				Options: map[string]any{
-					"frequency_penalty": 0.0,
-					"presence_penalty":  0.0,
-					"temperature":       1.6,
-					"top_p":             1.0,
-					"stop":              []any{"\n", "stop"},
-				},
-				Suffix: "suffix",
-				Stream: &False,
+			Name: "completions handler",
+			Setup: func(t *testing.T, req *http.Request) {
+				temp := float32(0.8)
+				body := CompletionRequest{
+					Model:       "test-model",
+					Prompt:      "Hello",
+					Temperature: &temp,
+					Stop:        []string{"\n", "stop"},
+					Suffix:      "suffix",
+				}
+				prepareRequest(req, body)
+			},
+			Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) {
+				if req.Prompt != "Hello" {
+					t.Fatalf("expected 'Hello', got %s", req.Prompt)
+				}
+
+				if req.Options["temperature"] != 1.6 {
+					t.Fatalf("expected 1.6, got %f", req.Options["temperature"])
+				}
+
+				stopTokens, ok := req.Options["stop"].([]any)
+
+				if !ok {
+					t.Fatalf("expected stop tokens to be a list")
+				}
+
+				if stopTokens[0] != "\n" || stopTokens[1] != "stop" {
+					t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens)
+				}
+
+				if req.Suffix != "suffix" {
+					t.Fatalf("expected 'suffix', got %s", req.Suffix)
+				}
 			},
 		},
 		{
-			name: "completions handler error forwarding",
-			body: `{
-				"model": "test-model",
-				"prompt": "Hello",
-				"temperature": null,
-				"stop": [1, 2],
-				"suffix": "suffix"
-			}`,
-			err: ErrorResponse{
-				Error: Error{
-					Message: "invalid type for 'stop' field: float64",
-					Type:    "invalid_request_error",
-				},
+			Name: "completions handler error forwarding",
+			Setup: func(t *testing.T, req *http.Request) {
+				body := CompletionRequest{
+					Model:       "test-model",
+					Prompt:      "Hello",
+					Temperature: nil,
+					Stop:        []int{1, 2},
+					Suffix:      "suffix",
+				}
+				prepareRequest(req, body)
+			},
+			Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) {
+				if resp.Code != http.StatusBadRequest {
+					t.Fatalf("expected 400, got %d", resp.Code)
+				}
+
+				if !strings.Contains(resp.Body.String(), "invalid type for 'stop' field") {
+					t.Fatalf("error was not forwarded")
+				}
 			},
 		},
 	}
@@ -269,27 +281,15 @@ func TestCompletionsMiddleware(t *testing.T) {
 	router.Handle(http.MethodPost, "/api/generate", endpoint)

 	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(tc.body))
-			req.Header.Set("Content-Type", "application/json")
+		t.Run(tc.Name, func(t *testing.T) {
+			req, _ := http.NewRequest(http.MethodPost, "/api/generate", nil)
+
+			tc.Setup(t, req)

 			resp := httptest.NewRecorder()
 			router.ServeHTTP(resp, req)

-			var errResp ErrorResponse
-			if resp.Code != http.StatusOK {
-				if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-					t.Fatal(err)
-				}
-			}
-
-			if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
-				t.Fatal("requests did not match")
-			}
-
-			if !reflect.DeepEqual(tc.err, errResp) {
-				t.Fatal("errors did not match")
-			}
+			tc.Expected(t, capturedRequest, resp)

 			capturedRequest = nil
 		})
@@ -298,47 +298,78 @@ func TestCompletionsMiddleware(t *testing.T) {

 func TestEmbeddingsMiddleware(t *testing.T) {
 	type testCase struct {
-		name string
-		body string
-		req  api.EmbedRequest
-		err  ErrorResponse
+		Name     string
+		Setup    func(t *testing.T, req *http.Request)
+		Expected func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder)
 	}

 	var capturedRequest *api.EmbedRequest

 	testCases := []testCase{
 		{
-			name: "embed handler single input",
-			body: `{
-				"input": "Hello",
-				"model": "test-model"
-			}`,
-			req: api.EmbedRequest{
-				Input: "Hello",
-				Model: "test-model",
+			Name: "embed handler single input",
+			Setup: func(t *testing.T, req *http.Request) {
+				body := EmbedRequest{
+					Input: "Hello",
+					Model: "test-model",
+				}
+				prepareRequest(req, body)
+			},
+			Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
+				if req.Input != "Hello" {
+					t.Fatalf("expected 'Hello', got %s", req.Input)
+				}
+
+				if req.Model != "test-model" {
+					t.Fatalf("expected 'test-model', got %s", req.Model)
+				}
 			},
 		},
 		{
-			name: "embed handler batch input",
-			body: `{
-				"input": ["Hello", "World"],
-				"model": "test-model"
-			}`,
-			req: api.EmbedRequest{
-				Input: []any{"Hello", "World"},
-				Model: "test-model",
+			Name: "embed handler batch input",
+			Setup: func(t *testing.T, req *http.Request) {
+				body := EmbedRequest{
+					Input: []string{"Hello", "World"},
+					Model: "test-model",
+				}
+				prepareRequest(req, body)
+			},
+			Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
+				input, ok := req.Input.([]any)
+
+				if !ok {
+					t.Fatalf("expected input to be a list")
+				}
+
+				if input[0].(string) != "Hello" {
+					t.Fatalf("expected 'Hello', got %s", input[0])
+				}
+
+				if input[1].(string) != "World" {
+					t.Fatalf("expected 'World', got %s", input[1])
+				}
+
+				if req.Model != "test-model" {
+					t.Fatalf("expected 'test-model', got %s", req.Model)
+				}
 			},
 		},
 		{
-			name: "embed handler error forwarding",
-			body: `{
-				"model": "test-model"
-			}`,
-			err: ErrorResponse{
-				Error: Error{
-					Message: "invalid input",
-					Type:    "invalid_request_error",
-				},
+			Name: "embed handler error forwarding",
+			Setup: func(t *testing.T, req *http.Request) {
+				body := EmbedRequest{
+					Model: "test-model",
+				}
+				prepareRequest(req, body)
+			},
+			Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
+				if resp.Code != http.StatusBadRequest {
+					t.Fatalf("expected 400, got %d", resp.Code)
+				}
+
+				if !strings.Contains(resp.Body.String(), "invalid input") {
+					t.Fatalf("error was not forwarded")
+				}
 			},
 		},
 	}
@@ -353,167 +384,116 @@ func TestEmbeddingsMiddleware(t *testing.T) {
 	router.Handle(http.MethodPost, "/api/embed", endpoint)

 	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(tc.body))
-			req.Header.Set("Content-Type", "application/json")
+		t.Run(tc.Name, func(t *testing.T) {
+			req, _ := http.NewRequest(http.MethodPost, "/api/embed", nil)
+
+			tc.Setup(t, req)

 			resp := httptest.NewRecorder()
 			router.ServeHTTP(resp, req)

-			var errResp ErrorResponse
-			if resp.Code != http.StatusOK {
-				if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-					t.Fatal(err)
-				}
-			}
-
-			if capturedRequest != nil && !reflect.DeepEqual(tc.req, *capturedRequest) {
-				t.Fatal("requests did not match")
-			}
-
-			if !reflect.DeepEqual(tc.err, errResp) {
-				t.Fatal("errors did not match")
-			}
+			tc.Expected(t, capturedRequest, resp)

 			capturedRequest = nil
 		})
 	}
 }

-func TestListMiddleware(t *testing.T) {
+func TestMiddlewareResponses(t *testing.T) {
 	type testCase struct {
-		name     string
-		endpoint func(c *gin.Context)
-		resp     string
+		Name     string
+		Method   string
+		Path     string
+		TestPath string
+		Handler  func() gin.HandlerFunc
+		Endpoint func(c *gin.Context)
+		Setup    func(t *testing.T, req *http.Request)
+		Expected func(t *testing.T, resp *httptest.ResponseRecorder)
 	}

 	testCases := []testCase{
 		{
-			name: "list handler",
-			endpoint: func(c *gin.Context) {
+			Name:     "list handler",
+			Method:   http.MethodGet,
+			Path:     "/api/tags",
+			TestPath: "/api/tags",
+			Handler:  ListMiddleware,
+			Endpoint: func(c *gin.Context) {
 				c.JSON(http.StatusOK, api.ListResponse{
 					Models: []api.ListModelResponse{
 						{
-							Name:       "test-model",
-							ModifiedAt: time.Unix(int64(1686935002), 0).UTC(),
+							Name: "Test Model",
 						},
 					},
 				})
 			},
-			resp: `{
-				"object": "list",
-				"data": [
-					{
-						"id": "test-model",
-						"object": "model",
-						"created": 1686935002,
-						"owned_by": "library"
-					}
-				]
-			}`,
-		},
-		{
-			name: "list handler empty output",
-			endpoint: func(c *gin.Context) {
-				c.JSON(http.StatusOK, api.ListResponse{})
+			Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
+				var listResp ListCompletion
+				if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil {
+					t.Fatal(err)
+				}
+
+				if listResp.Object != "list" {
+					t.Fatalf("expected list, got %s", listResp.Object)
+				}
+
+				if len(listResp.Data) != 1 {
+					t.Fatalf("expected 1, got %d", len(listResp.Data))
+				}
+
+				if listResp.Data[0].Id != "Test Model" {
+					t.Fatalf("expected Test Model, got %s", listResp.Data[0].Id)
+				}
 			},
-			resp: `{
-				"object": "list",
-				"data": null
-			}`,
 		},
-	}
-
-	gin.SetMode(gin.TestMode)
-
-	for _, tc := range testCases {
-		router := gin.New()
-		router.Use(ListMiddleware())
-		router.Handle(http.MethodGet, "/api/tags", tc.endpoint)
-		req, _ := http.NewRequest(http.MethodGet, "/api/tags", nil)
-
-		resp := httptest.NewRecorder()
-		router.ServeHTTP(resp, req)
-
-		var expected, actual map[string]any
-		err := json.Unmarshal([]byte(tc.resp), &expected)
-		if err != nil {
-			t.Fatalf("failed to unmarshal expected response: %v", err)
-		}
-
-		err = json.Unmarshal(resp.Body.Bytes(), &actual)
-		if err != nil {
-			t.Fatalf("failed to unmarshal actual response: %v", err)
-		}
-
-		if !reflect.DeepEqual(expected, actual) {
-			t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual)
-		}
-	}
-}
-
-func TestRetrieveMiddleware(t *testing.T) {
-	type testCase struct {
-		name     string
-		endpoint func(c *gin.Context)
-		resp     string
-	}
-
-	testCases := []testCase{
 		{
-			name: "retrieve handler",
-			endpoint: func(c *gin.Context) {
+			Name:     "retrieve model",
+			Method:   http.MethodGet,
+			Path:     "/api/show/:model",
+			TestPath: "/api/show/test-model",
+			Handler:  RetrieveMiddleware,
+			Endpoint: func(c *gin.Context) {
 				c.JSON(http.StatusOK, api.ShowResponse{
-					ModifiedAt: time.Unix(int64(1686935002), 0).UTC(),
+					ModifiedAt: time.Date(2024, 6, 17, 13, 45, 0, 0, time.UTC),
 				})
 			},
-			resp: `{
-				"id":"test-model",
-				"object":"model",
-				"created":1686935002,
-				"owned_by":"library"}
-			`,
-		},
-		{
-			name: "retrieve handler error forwarding",
-			endpoint: func(c *gin.Context) {
-				c.JSON(http.StatusBadRequest, gin.H{"error": "model not found"})
-			},
-			resp: `{
-				"error": {
-				  "code": null,
-				  "message": "model not found",
-				  "param": null,
-				  "type": "api_error"
+			Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
+				var retrieveResp Model
+				if err := json.NewDecoder(resp.Body).Decode(&retrieveResp); err != nil {
+					t.Fatal(err)
 				}
-			}`,
+
+				if retrieveResp.Object != "model" {
+					t.Fatalf("Expected object to be model, got %s", retrieveResp.Object)
+				}
+
+				if retrieveResp.Id != "test-model" {
+					t.Fatalf("Expected id to be test-model, got %s", retrieveResp.Id)
+				}
+			},
 		},
 	}

 	gin.SetMode(gin.TestMode)
+	router := gin.New()

 	for _, tc := range testCases {
-		router := gin.New()
-		router.Use(RetrieveMiddleware())
-		router.Handle(http.MethodGet, "/api/show/:model", tc.endpoint)
-		req, _ := http.NewRequest(http.MethodGet, "/api/show/test-model", nil)
+		t.Run(tc.Name, func(t *testing.T) {
+			router = gin.New()
+			router.Use(tc.Handler())
+			router.Handle(tc.Method, tc.Path, tc.Endpoint)
+			req, _ := http.NewRequest(tc.Method, tc.TestPath, nil)

-		resp := httptest.NewRecorder()
-		router.ServeHTTP(resp, req)
+			if tc.Setup != nil {
+				tc.Setup(t, req)
+			}

-		var expected, actual map[string]any
-		err := json.Unmarshal([]byte(tc.resp), &expected)
-		if err != nil {
-			t.Fatalf("failed to unmarshal expected response: %v", err)
-		}
+			resp := httptest.NewRecorder()
+			router.ServeHTTP(resp, req)

-		err = json.Unmarshal(resp.Body.Bytes(), &actual)
-		if err != nil {
-			t.Fatalf("failed to unmarshal actual response: %v", err)
-		}
+			assert.Equal(t, http.StatusOK, resp.Code)

-		if !reflect.DeepEqual(expected, actual) {
-			t.Errorf("responses did not match\nExpected: %+v\nActual: %+v", expected, actual)
-		}
+			tc.Expected(t, resp)
+		})
 	}
 }
--- a/progress/spinner.go
+++ b/progress/spinner.go
@@ -3,12 +3,11 @@ package progress
 import (
 	"fmt"
 	"strings"
-	"sync/atomic"
 	"time"
 )

 type Spinner struct {
-	message      atomic.Value
+	message      string
 	messageWidth int

 	parts []string
@@ -22,25 +21,20 @@ type Spinner struct {

 func NewSpinner(message string) *Spinner {
 	s := &Spinner{
+		message: message,
 		parts: []string{
 			"⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏",
 		},
 		started: time.Now(),
 	}
-	s.SetMessage(message)
 	go s.start()
 	return s
 }

-func (s *Spinner) SetMessage(message string) {
-	s.message.Store(message)
-}
-
 func (s *Spinner) String() string {
 	var sb strings.Builder
-
-	if message, ok := s.message.Load().(string); ok && len(message) > 0 {
-		message := strings.TrimSpace(message)
+	if len(s.message) > 0 {
+		message := strings.TrimSpace(s.message)
 		if s.messageWidth > 0 && len(message) > s.messageWidth {
 			message = message[:s.messageWidth]
 		}
--- a/readline/buffer.go
+++ b/readline/buffer.go
@@ -62,7 +62,7 @@ func (b *Buffer) MoveLeft() {
 				rLength := runewidth.RuneWidth(r)

 				if b.DisplayPos%b.LineWidth == 0 {
-					fmt.Print(CursorUp + CursorBOL + CursorRightN(b.Width))
+					fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width))
 					if rLength == 2 {
 						fmt.Print(CursorLeft)
 					}
@@ -74,7 +74,7 @@ func (b *Buffer) MoveLeft() {
 						fmt.Print(CursorLeft)
 					}
 				} else {
-					fmt.Print(CursorLeftN(rLength))
+					fmt.Print(cursorLeftN(rLength))
 				}

 				b.Pos -= 1
@@ -115,15 +115,15 @@ func (b *Buffer) MoveRight() {
 				b.DisplayPos += rLength

 				if b.DisplayPos%b.LineWidth == 0 {
-					fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt())))
+					fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())))
 				} else if (b.DisplayPos-rLength)%b.LineWidth == b.LineWidth-1 && hasSpace {
-					fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt())+rLength))
+					fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())+rLength))
 					b.DisplayPos += 1
 				} else if b.LineHasSpace.Size() > 0 && b.DisplayPos%b.LineWidth == b.LineWidth-1 && hasSpace {
-					fmt.Print(CursorDown + CursorBOL + CursorRightN(len(b.Prompt.prompt())))
+					fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())))
 					b.DisplayPos += 1
 				} else {
-					fmt.Print(CursorRightN(rLength))
+					fmt.Print(cursorRightN(rLength))
 				}
 			}
 		}
@@ -154,7 +154,7 @@ func (b *Buffer) MoveToStart() {
 				fmt.Print(CursorUp)
 			}
 		}
-		fmt.Print(CursorBOL + CursorRightN(len(b.Prompt.prompt())))
+		fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())))
 		b.Pos = 0
 		b.DisplayPos = 0
 	}
@@ -169,9 +169,9 @@ func (b *Buffer) MoveToEnd() {
 				fmt.Print(CursorDown)
 			}
 			remainder := b.DisplaySize() % b.LineWidth
-			fmt.Print(CursorBOL + CursorRightN(len(b.Prompt.prompt())+remainder))
+			fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())+remainder))
 		} else {
-			fmt.Print(CursorRightN(b.DisplaySize() - b.DisplayPos))
+			fmt.Print(cursorRightN(b.DisplaySize() - b.DisplayPos))
 		}

 		b.Pos = b.Buf.Size()
@@ -286,7 +286,8 @@ func (b *Buffer) drawRemaining() {
 	remLength := runewidth.StringWidth(remainingText)

 	if len(currLine) > 0 {
-		fmt.Print(ClearToEOL + currLine + CursorLeftN(currLineSpace))
+		fmt.Print(ClearToEOL + currLine) // Print, not Printf: currLine is text to display, not a format string
+		fmt.Print(cursorLeftN(currLineSpace))
 	} else {
 		fmt.Print(ClearToEOL)
 	}
@@ -300,9 +301,9 @@ func (b *Buffer) drawRemaining() {
 	}

 	if (b.DisplayPos+currLineSpace)%b.LineWidth == 0 && currLine == remainingText {
-		fmt.Print(CursorRightN(currLineSpace))
+		fmt.Print(cursorRightN(currLineSpace))
 		fmt.Printf("\n%s", b.Prompt.AltPrompt)
-		fmt.Print(CursorUp + CursorBOL + CursorRightN(b.Width-currLineSpace))
+		fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width-currLineSpace))
 	}

 	// render the other lines
@@ -332,7 +333,9 @@ func (b *Buffer) drawRemaining() {
 			lineLength += runewidth.RuneWidth(c)
 			fmt.Printf("%c", c)
 		}
-		fmt.Print(ClearToEOL + CursorUpN(totalLines) + CursorBOL + CursorRightN(b.Width-currLineSpace))
+		fmt.Print(ClearToEOL)
+		fmt.Print(cursorUpN(totalLines))
+		fmt.Printf(CursorBOL + cursorRightN(b.Width-currLineSpace))

 		hasSpace := b.GetLineSpacing(b.DisplayPos / b.LineWidth)

@@ -354,7 +357,8 @@ func (b *Buffer) Remove() {
 				if b.DisplayPos%b.LineWidth == 0 {
 					// if the user backspaces over the word boundary, do this magic to clear the line
 					// and move to the end of the previous line
-					fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + CursorRightN(b.Width))
+					fmt.Printf(CursorBOL + ClearToEOL)
+					fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width))

 					if b.DisplaySize()%b.LineWidth < (b.DisplaySize()-rLength)%b.LineWidth {
 						b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1)
@@ -366,23 +370,24 @@ func (b *Buffer) Remove() {
 					}

 					if rLength == 2 {
-						fmt.Print(CursorLeft + "  " + CursorLeftN(2))
+						fmt.Print(CursorLeft + "  " + cursorLeftN(2))
 					} else {
 						fmt.Print(" " + CursorLeft)
 					}
 				} else if (b.DisplayPos-rLength)%b.LineWidth == 0 && hasSpace {
-					fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + CursorRightN(b.Width))
+					fmt.Printf(CursorBOL + ClearToEOL)
+					fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width))

 					if b.Pos == b.Buf.Size() {
 						b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1)
 					}
 					b.DisplayPos -= 1
 				} else {
-					fmt.Print(CursorLeftN(rLength))
+					fmt.Print(cursorLeftN(rLength))
 					for range rLength {
 						fmt.Print(" ")
 					}
-					fmt.Print(CursorLeftN(rLength))
+					fmt.Print(cursorLeftN(rLength))
 				}

 				var eraseExtraLine bool
@@ -400,9 +405,9 @@ func (b *Buffer) Remove() {
 					// are trailing characters which go over the line width boundary
 					if eraseExtraLine {
 						remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth
-						fmt.Print(CursorDownN(remainingLines+1) + CursorBOL + ClearToEOL)
+						fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL)
 						place := b.DisplayPos % b.LineWidth
-						fmt.Print(CursorUpN(remainingLines+1) + CursorRightN(place+len(b.Prompt.prompt())))
+						fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.prompt())))
 					}
 				}
 			}
@@ -417,9 +422,9 @@ func (b *Buffer) Delete() {
 		if b.DisplaySize()%b.LineWidth == 0 {
 			if b.DisplayPos != b.DisplaySize() {
 				remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth
-				fmt.Print(CursorDownN(remainingLines) + CursorBOL + ClearToEOL)
+				fmt.Printf(cursorDownN(remainingLines) + CursorBOL + ClearToEOL)
 				place := b.DisplayPos % b.LineWidth
-				fmt.Print(CursorUpN(remainingLines) + CursorRightN(place+len(b.Prompt.prompt())))
+				fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.prompt())))
 			}
 		}
 	}
@@ -466,17 +471,17 @@ func (b *Buffer) DeleteWord() {
 }

 func (b *Buffer) ClearScreen() {
-	fmt.Print(ClearScreen + CursorReset + b.Prompt.prompt())
+	fmt.Print(ClearScreen + CursorReset + b.Prompt.prompt()) // Print: the prompt must not be parsed as a format string
 	if b.IsEmpty() {
 		ph := b.Prompt.placeholder()
-		fmt.Print(ColorGrey + ph + CursorLeftN(len(ph)) + ColorDefault)
+		fmt.Print(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault) // Print: ph is placeholder text, not a format string
 	} else {
 		currPos := b.DisplayPos
 		currIndex := b.Pos
 		b.Pos = 0
 		b.DisplayPos = 0
 		b.drawRemaining()
-		fmt.Print(CursorReset + CursorRightN(len(b.Prompt.prompt())))
+		fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.prompt())))
 		if currPos > 0 {
 			targetLine := currPos / b.LineWidth
 			if targetLine > 0 {
@@ -486,10 +491,10 @@ func (b *Buffer) ClearScreen() {
 			}
 			remainder := currPos % b.LineWidth
 			if remainder > 0 {
-				fmt.Print(CursorRightN(remainder))
+				fmt.Print(cursorRightN(remainder))
 			}
 			if currPos%b.LineWidth == 0 {
-				fmt.Print(CursorBOL + b.Prompt.AltPrompt)
+				fmt.Print(CursorBOL + b.Prompt.AltPrompt) // Print: AltPrompt must not be parsed as a format string
 			}
 		}
 		b.Pos = currIndex
@@ -508,13 +513,13 @@ func (b *Buffer) Replace(r []rune) {

 	b.Buf.Clear()

-	fmt.Print(CursorBOL + ClearToEOL)
+	fmt.Printf(CursorBOL + ClearToEOL)

 	for range lineNums {
 		fmt.Print(CursorUp + CursorBOL + ClearToEOL)
 	}

-	fmt.Print(CursorBOL + b.Prompt.prompt())
+	fmt.Print(CursorBOL + b.Prompt.prompt()) // Print: the prompt must not be parsed as a format string

 	for _, c := range r {
 		b.Add(c)
@@ -540,3 +545,19 @@ func (b *Buffer) StringNM(n, m int) string {
 	}
 	return s
 }
+
+func cursorLeftN(n int) string { // fills n into the CursorLeftN escape template
+	return fmt.Sprintf(CursorLeftN, n)
+}
+
+func cursorRightN(n int) string { // fills n into the CursorRightN escape template
+	return fmt.Sprintf(CursorRightN, n)
+}
+
+func cursorUpN(n int) string { // fills n into the CursorUpN escape template
+	return fmt.Sprintf(CursorUpN, n)
+}
+
+func cursorDownN(n int) string { // fills n into the CursorDownN escape template
+	return fmt.Sprintf(CursorDownN, n)
+}
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -98,7 +98,7 @@ func (i *Instance) Readline() (string, error) {
 		showPlaceholder := !i.Pasting || i.Prompt.UseAlt
 		if buf.IsEmpty() && showPlaceholder {
 			ph := i.Prompt.placeholder()
-			fmt.Print(ColorGrey + ph + CursorLeftN(len(ph)) + ColorDefault)
+			fmt.Print(ColorGrey + ph + fmt.Sprintf(CursorLeftN, len(ph)) + ColorDefault) // Print: ph is placeholder text, not a format string
 		}

 		r, err := i.Terminal.Read()
--- a/readline/types.go
+++ b/readline/types.go
@@ -1,7 +1,5 @@
 package readline

-import "strconv"
-
 const (
 	CharNull      = 0
 	CharLineStart = 1
@@ -43,49 +41,34 @@ const (
 )

 const (
-	Esc = "\x1b"
+	CursorUp    = "\033[1A"
+	CursorDown  = "\033[1B"
+	CursorRight = "\033[1C"
+	CursorLeft  = "\033[1D"

-	CursorSave    = Esc + "[s"
-	CursorRestore = Esc + "[u"
+	CursorSave    = "\033[s"
+	CursorRestore = "\033[u"

-	CursorEOL  = Esc + "[E"
-	CursorBOL  = Esc + "[1G"
-	CursorHide = Esc + "[?25l"
-	CursorShow = Esc + "[?25h"
+	CursorUpN    = "\033[%dA"
+	CursorDownN  = "\033[%dB"
+	CursorRightN = "\033[%dC"
+	CursorLeftN  = "\033[%dD"

-	ClearToEOL  = Esc + "[K"
-	ClearLine   = Esc + "[2K"
-	ClearScreen = Esc + "[2J"
-	CursorReset = Esc + "[0;0f"
+	CursorEOL  = "\033[E"
+	CursorBOL  = "\033[1G"
+	CursorHide = "\033[?25l"
+	CursorShow = "\033[?25h"

-	ColorGrey    = Esc + "[38;5;245m"
-	ColorDefault = Esc + "[0m"
+	ClearToEOL  = "\033[K"
+	ClearLine   = "\033[2K"
+	ClearScreen = "\033[2J"
+	CursorReset = "\033[0;0f"

-	StartBracketedPaste = Esc + "[?2004h"
-	EndBracketedPaste   = Esc + "[?2004l"
-)
+	ColorGrey    = "\033[38;5;245m"
+	ColorDefault = "\033[0m"

-func CursorUpN(n int) string {
-	return Esc + "[" + strconv.Itoa(n) + "A"
-}
-
-func CursorDownN(n int) string {
-	return Esc + "[" + strconv.Itoa(n) + "B"
-}
-
-func CursorRightN(n int) string {
-	return Esc + "[" + strconv.Itoa(n) + "C"
-}
-
-func CursorLeftN(n int) string {
-	return Esc + "[" + strconv.Itoa(n) + "D"
-}
-
-var (
-	CursorUp    = CursorUpN(1)
-	CursorDown  = CursorDownN(1)
-	CursorRight = CursorRightN(1)
-	CursorLeft  = CursorLeftN(1)
+	StartBracketedPaste = "\033[?2004h"
+	EndBracketedPaste   = "\033[?2004l"
 )

 const (
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -4,7 +4,6 @@ set -eu

 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
-GZIP=$(which pigz 2>/dev/null || echo "gzip")

 BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
@@ -22,16 +21,11 @@ for TARGETARCH in ${BUILD_ARCH}; do
        -t builder:$TARGETARCH \
        .
    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
-    rm -rf ./dist/linux-$TARGETARCH
-    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
-    if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
-        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
+    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH
+
+    if [ "$TARGETARCH" = "amd64" ]; then
+        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/
    fi
+
    docker rm builder-$TARGETARCH
-    echo "Compressing final linux bundle..."
-    rm -f ./dist/ollama-linux-$TARGETARCH.tgz
-    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
-    if [ -d dist/linux-$TARGETARCH-rocm ]; then
-        (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
-    fi
 done
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -7,7 +7,6 @@
 $ErrorActionPreference = "Stop"

 function checkEnv() {
-    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
    $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
    Write-host "Building for ${script:TARGET_ARCH}"
    write-host "Locating required tools and paths"
@@ -16,23 +15,26 @@ function checkEnv() {
        $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
        $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
    }
-    # Locate CUDA versions
-    # Note: this assumes every version found will be built
-    $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
-    if ($cudaList.length -eq 0) {
+    # Try to find the CUDA dir
+    if ($null -eq $env:NVIDIA_DIR) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
-        if ($null -ne $d) {
-            $script:CUDA_DIRS=@($d| split-path -parent)
+        if ($null -ne $d) { # $null on the left so a collection-valued $d is not filtered element-wise
+            $script:NVIDIA_DIR=($d| split-path -parent)
+        } else {
+            $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
+            if ($cudaList.length -gt 0) { # -gt compares; '>' would redirect output to a file named "0"
+                $script:NVIDIA_DIR=$cudaList[0]
+            }
        }
    } else {
-        $script:CUDA_DIRS=$cudaList
+        $script:NVIDIA_DIR=$env:NVIDIA_DIR
    }
    
    $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]

    $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
    $env:CGO_ENABLED="1"
-    Write-Output "Checking version"
+    echo "Checking version"
    if (!$env:VERSION) {
        $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always)
        $pattern="v(.+)"
@@ -69,48 +71,7 @@ function checkEnv() {
 function buildOllama() {
    write-host "Building ollama CLI"
    if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
-        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-
-        # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle
-        #        which targets to build
-
-        # Start by skipping CUDA to build everything else
-        pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
-
-        # Then skip everyhting else and build all the CUDA variants
-        foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
-            write-host "Building CUDA ${env:CUDA_LIB_DIR}"
-
-            if ($env:CUDA_LIB_DIR.Contains("v12")) {
-                pwsh -Command {
-                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
-                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
-                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
-                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
-                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
-                    $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-                    $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
-                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
-                    $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
-                    & go generate ./...
-                }
-            } else {
-                pwsh -Command {
-                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
-                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
-                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
-                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
-                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
-                    $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-                    $env:OLLAMA_CUSTOM_CUDA_DEFS=""
-                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
-                    $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
-                    & go generate ./...
-                }
-            }
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
+        & go generate ./...
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
    } else {
        write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@@ -122,8 +83,8 @@ function buildOllama() {
            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    }
-    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force
-    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\
+    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
+    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
 }

 function buildApp() {
@@ -142,22 +103,22 @@ function buildApp() {
 function gatherDependencies() {
    write-host "Gathering runtime dependencies"
    cd "${script:SRC_DIR}"
-    md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null
+    md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null

    # TODO - this varies based on host build system and MSVC version - drive from dumpbin output
    # currently works for Win11 + MSVC 2019 + Cuda V11
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
    foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
-        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\"
+        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
    }


    cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
    if ("${env:KEY_CONTAINER}") {
        write-host "about to sign"
-        foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+        foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
            write-host "signing $file"
            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -63,36 +63,16 @@ if [ -n "$NEEDS" ]; then
    exit 1
 fi

+status "Downloading ollama..."
+curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
+
 for BINDIR in /usr/local/bin /usr/bin /bin; do
    echo $PATH | grep -q $BINDIR && break || continue
 done
-OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})

-status "Installing ollama to $OLLAMA_INSTALL_DIR"
+status "Installing ollama to $BINDIR..."
 $SUDO install -o0 -g0 -m755 -d $BINDIR
-$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
-if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
-    status "Downloading Linux ${ARCH} bundle"
-    curl --fail --show-error --location --progress-bar \
-        "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
-        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
-    BUNDLE=1
-    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
-        status "Making ollama accessible in the PATH in $BINDIR"
-        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
-    fi
-else
-    status "Downloading Linux ${ARCH} CLI"
-    curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
-    "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
-    $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
-    BUNDLE=0
-    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
-        status "Making ollama accessible in the PATH in $BINDIR"
-        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
-    fi
-fi
-
+$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama

 install_success() {
    status 'The Ollama API is now available at 127.0.0.1:11434.'
@@ -198,16 +178,6 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
 fi

 if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
-    if [ $BUNDLE -ne 0 ]; then
-        status "Downloading Linux ROCm ${ARCH} bundle"
-        curl --fail --show-error --location --progress-bar \
-            "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
-            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
-
-        install_success
-        status "AMD GPU ready."
-        exit 0
-    fi
    # Look for pre-existing ROCm v6 before downloading the dependencies
    for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
        if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
@@ -239,15 +209,15 @@ install_cuda_driver_yum() {
    case $PACKAGE_MANAGER in
        yum)
            $SUDO $PACKAGE_MANAGER -y install yum-utils
-            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then
-                $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo
+            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then
+                $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo
            else
                error $CUDA_REPO_ERR_MSG
            fi
            ;;
        dnf)
-            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then
-                $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo
+            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo" >/dev/null ; then
+                $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-$1$2.repo
            else
                error $CUDA_REPO_ERR_MSG
            fi
@@ -275,8 +245,8 @@ install_cuda_driver_yum() {
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
 install_cuda_driver_apt() {
    status 'Installing NVIDIA repository...'
-    if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb" >/dev/null ; then
-        curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb
+    if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb" >/dev/null ; then
+        curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m | sed -e 's/aarch64/sbsa/')/cuda-keyring_1.1-1_all.deb
    else
        error $CUDA_REPO_ERR_MSG
    fi
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@@ -3,7 +3,6 @@
 # Script for common Dockerfile dependency installation in redhat linux based images

 set -ex
-set -o pipefail
 MACHINE=$(uname -m)

 if grep -i "centos" /etc/system-release >/dev/null; then
@@ -30,7 +29,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
        dnf install -y rh-git227-git
        ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
    fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz
+    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
 elif grep -i "rocky" /etc/system-release >/dev/null; then
    # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
    cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@@ -44,21 +43,12 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
 EOF
    dnf install -y git \
        gcc-toolset-10-gcc-10.2.1-8.2.el8 \
-        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
-        pigz
+        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8
 else
    echo "ERROR Unexpected distro"
    exit 1
 fi

-if [ "${MACHINE}" = "x86_64" ] ; then
-    curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
-    mv /tmp/ccache /usr/local/bin/
-else
-    yum -y install epel-release
-    yum install -y ccache
-fi
-
 if [ -n "${CMAKE_VERSION}" ]; then
    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
 fi
--- a/server/download.go
+++ b/server/download.go
@@ -94,7 +94,7 @@ func (p *blobDownloadPart) UnmarshalJSON(b []byte) error {
 }

 const (
-	numDownloadParts          = 16
+	numDownloadParts          = 64
 	minDownloadPartSize int64 = 100 * format.MegaByte
 	maxDownloadPartSize int64 = 1000 * format.MegaByte
 )
@@ -216,7 +216,9 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 		return err
 	}
 	defer file.Close()
-	setSparse(file)
+	if err := setSparse(file); err != nil {
+		return err
+	}

 	_ = file.Truncate(b.Total)

@@ -233,7 +235,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis

 			newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error {
 				if len(via) > 10 {
-					return errors.New("maximum redirects exceeded (10) for directURL")
+					return errors.New("maximum redirects exceeded (10) for directURL")
 				}

 				// if the hostname is the same, allow the redirect
--- a/server/images.go
+++ b/server/images.go
@@ -215,20 +215,25 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) {
 		return nil, "", err
 	}

-	f, err := os.Open(fp)
+	if _, err = os.Stat(fp); err != nil {
+		return nil, "", err
+	}
+
+	var manifest *Manifest
+
+	bts, err := os.ReadFile(fp)
 	if err != nil {
-		return nil, "", err
+		return nil, "", fmt.Errorf("couldn't open file '%s'", fp)
 	}
-	defer f.Close()

-	sha256sum := sha256.New()
+	shaSum := sha256.Sum256(bts)
+	shaStr := hex.EncodeToString(shaSum[:])

-	var manifest Manifest
-	if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil {
+	if err := json.Unmarshal(bts, &manifest); err != nil {
 		return nil, "", err
 	}

-	return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil
+	return manifest, shaStr, nil
 }

 func GetModel(name string) (*Model, error) {
@@ -245,21 +250,19 @@ func GetModel(name string) (*Model, error) {
 		Template:  template.DefaultTemplate,
 	}

-	if manifest.Config.Digest != "" {
-		filename, err := GetBlobsPath(manifest.Config.Digest)
-		if err != nil {
-			return nil, err
-		}
+	filename, err := GetBlobsPath(manifest.Config.Digest)
+	if err != nil {
+		return nil, err
+	}

-		configFile, err := os.Open(filename)
-		if err != nil {
-			return nil, err
-		}
-		defer configFile.Close()
+	configFile, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	defer configFile.Close()

-		if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil {
-			return nil, err
-		}
+	if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil {
+		return nil, err
 	}

 	for _, layer := range manifest.Layers {
@@ -368,15 +371,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 	var messages []*api.Message
 	parameters := make(map[string]any)

-	var layers []Layer
-	var baseLayers []*layerGGML
+	var layers []*Layer
 	for _, c := range modelfile.Commands {
 		mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
-		command := c.Name

-		switch command {
+		switch c.Name {
 		case "model", "adapter":
-			if name := model.ParseName(c.Args); name.IsValid() && command == "model" {
+			var baseLayers []*layerGGML
+			if name := model.ParseName(c.Args); name.IsValid() {
 				baseLayers, err = parseFromModel(ctx, name, fn)
 				if err != nil {
 					return err
@@ -410,14 +412,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 				}
 				defer blob.Close()

-				baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn)
+				baseLayers, err = parseFromFile(ctx, blob, digest, fn)
 				if err != nil {
 					return err
 				}
 			} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
 				defer file.Close()

-				baseLayers, err = parseFromFile(ctx, command, baseLayers, file, "", fn)
+				baseLayers, err = parseFromFile(ctx, file, "", fn)
 				if err != nil {
 					return err
 				}
@@ -495,7 +497,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio

 			if c.Name != "license" {
 				// replace
-				layers = slices.DeleteFunc(layers, func(layer Layer) bool {
+				layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
 					if layer.MediaType != mediatype {
 						return false
 					}
@@ -541,7 +543,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 	}

 	var err2 error
-	layers = slices.DeleteFunc(layers, func(layer Layer) bool {
+	layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
 		switch layer.MediaType {
 		case "application/vnd.ollama.image.message":
 			// if there are new messages, remove the inherited ones
@@ -621,12 +623,12 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}

-	configLayer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
+	layer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
 	if err != nil {
 		return err
 	}

-	for _, layer := range append(layers, configLayer) {
+	for _, layer := range append(layers, layer) {
 		if layer.status != "" {
 			fn(api.ProgressResponse{Status: layer.status})
 		}
@@ -635,7 +637,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 	old, _ := ParseNamedManifest(name)

 	fn(api.ProgressResponse{Status: "writing manifest"})
-	if err := WriteManifest(name, configLayer, layers); err != nil {
+	if err := WriteManifest(name, layer, layers); err != nil {
 		return err
 	}

@@ -688,18 +690,44 @@ func CopyModel(src, dst model.Name) error {
 	return err
 }

-func deleteUnusedLayers(deleteMap map[string]struct{}) error {
-	manifests, err := Manifests()
+func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
+	fp, err := GetManifestPath()
 	if err != nil {
 		return err
 	}

-	for _, manifest := range manifests {
+	walkFunc := func(path string, info os.FileInfo, _ error) error {
+		if info.IsDir() {
+			return nil
+		}
+
+		dir, file := filepath.Split(path)
+		dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
+		tag := strings.Join([]string{dir, file}, ":")
+		fmp := ParseModelPath(tag)
+
+		// skip the manifest we're trying to delete
+		if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
+			return nil
+		}
+
+		// save (i.e. delete from the deleteMap) any files used in other manifests
+		manifest, _, err := GetManifest(fmp)
+		if err != nil {
+			//nolint:nilerr
+			return nil
+		}
+
 		for _, layer := range manifest.Layers {
 			delete(deleteMap, layer.Digest)
 		}

 		delete(deleteMap, manifest.Config.Digest)
+		return nil
+	}
+
+	if err := filepath.Walk(fp, walkFunc); err != nil {
+		return err
 	}

 	// only delete the files which are still in the deleteMap
@@ -752,9 +780,9 @@ func PruneLayers() error {

 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))

-	if err := deleteUnusedLayers(deleteMap); err != nil {
-		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
-		return nil
+	err = deleteUnusedLayers(nil, deleteMap)
+	if err != nil {
+		return err
 	}

 	slog.Info(fmt.Sprintf("total unused blobs removed: %d", len(deleteMap)))
@@ -809,11 +837,9 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		return err
 	}

-	var layers []Layer
+	var layers []*Layer
 	layers = append(layers, manifest.Layers...)
-	if manifest.Config.Digest != "" {
-		layers = append(layers, manifest.Config)
-	}
+	layers = append(layers, manifest.Config)

 	for _, layer := range layers {
 		if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
@@ -847,18 +873,23 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

+	var manifest *Manifest
+	var err error
+	var noprune string
+
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
-	manifest, _, err := GetManifest(mp)
-	if errors.Is(err, os.ErrNotExist) {
-		// noop
-	} else if err != nil && !errors.Is(err, os.ErrNotExist) {
-		return err
-	} else {
-		for _, l := range manifest.Layers {
-			deleteMap[l.Digest] = struct{}{}
+
+	if !envconfig.NoPrune() {
+		manifest, _, err = GetManifest(mp)
+		if err != nil && !errors.Is(err, os.ErrNotExist) {
+			return err
 		}
-		if manifest.Config.Digest != "" {
+
+		if manifest != nil {
+			for _, l := range manifest.Layers {
+				deleteMap[l.Digest] = struct{}{}
+			}
 			deleteMap[manifest.Config.Digest] = struct{}{}
 		}
 	}
@@ -874,11 +905,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		return fmt.Errorf("pull model manifest: %s", err)
 	}

-	var layers []Layer
+	var layers []*Layer
 	layers = append(layers, manifest.Layers...)
-	if manifest.Config.Digest != "" {
-		layers = append(layers, manifest.Config)
-	}
+	layers = append(layers, manifest.Config)

 	skipVerify := make(map[string]bool)
 	for _, layer := range layers {
@@ -938,10 +967,11 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		return err
 	}

-	if !envconfig.NoPrune() && len(deleteMap) > 0 {
-		fn(api.ProgressResponse{Status: "removing unused layers"})
-		if err := deleteUnusedLayers(deleteMap); err != nil {
-			fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
+	if noprune == "" {
+		fn(api.ProgressResponse{Status: "removing any unused layers"})
+		err = deleteUnusedLayers(nil, deleteMap)
+		if err != nil {
+			return err
 		}
 	}

@@ -961,12 +991,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
 	}
 	defer resp.Body.Close()

-	var m Manifest
+	var m *Manifest
 	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
 		return nil, err
 	}

-	return &m, err
+	return m, err
 }

 // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer
--- a/server/layer.go
+++ b/server/layer.go
@@ -2,7 +2,6 @@ package server

 import (
 	"crypto/sha256"
-	"errors"
 	"fmt"
 	"io"
 	"os"
@@ -16,15 +15,15 @@ type Layer struct {
 	status    string
 }

-func NewLayer(r io.Reader, mediatype string) (Layer, error) {
+func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
 	blobs, err := GetBlobsPath("")
 	if err != nil {
-		return Layer{}, err
+		return nil, err
 	}

 	temp, err := os.CreateTemp(blobs, "sha256-")
 	if err != nil {
-		return Layer{}, err
+		return nil, err
 	}
 	defer temp.Close()
 	defer os.Remove(temp.Name())
@@ -32,31 +31,28 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
 	sha256sum := sha256.New()
 	n, err := io.Copy(io.MultiWriter(temp, sha256sum), r)
 	if err != nil {
-		return Layer{}, err
+		return nil, err
 	}

 	if err := temp.Close(); err != nil {
-		return Layer{}, err
+		return nil, err
 	}

 	digest := fmt.Sprintf("sha256:%x", sha256sum.Sum(nil))
 	blob, err := GetBlobsPath(digest)
 	if err != nil {
-		return Layer{}, err
+		return nil, err
 	}

 	status := "using existing layer"
 	if _, err := os.Stat(blob); err != nil {
 		status = "creating new layer"
 		if err := os.Rename(temp.Name(), blob); err != nil {
-			return Layer{}, err
-		}
-		if err := os.Chmod(blob, 0o644); err != nil {
-			return Layer{}, err
+			return nil, err
 		}
 	}

-	return Layer{
+	return &Layer{
 		MediaType: mediatype,
 		Digest:    digest,
 		Size:      n,
@@ -64,22 +60,18 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
 	}, nil
 }

-func NewLayerFromLayer(digest, mediatype, from string) (Layer, error) {
-	if digest == "" {
-		return Layer{}, errors.New("creating new layer from layer with empty digest")
-	}
-
+func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) {
 	blob, err := GetBlobsPath(digest)
 	if err != nil {
-		return Layer{}, err
+		return nil, err
 	}

 	fi, err := os.Stat(blob)
 	if err != nil {
-		return Layer{}, err
+		return nil, err
 	}

-	return Layer{
+	return &Layer{
 		MediaType: mediatype,
 		Digest:    digest,
 		Size:      fi.Size(),
@@ -89,10 +81,6 @@ func NewLayerFromLayer(digest, mediatype, from string) (Layer, error) {
 }

 func (l *Layer) Open() (io.ReadSeekCloser, error) {
-	if l.Digest == "" {
-		return nil, errors.New("opening layer with empty digest")
-	}
-
 	blob, err := GetBlobsPath(l.Digest)
 	if err != nil {
 		return nil, err
@@ -102,10 +90,6 @@ func (l *Layer) Open() (io.ReadSeekCloser, error) {
 }

 func (l *Layer) Remove() error {
-	if l.Digest == "" {
-		return nil
-	}
-
 	ms, err := Manifests()
 	if err != nil {
 		return err
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -5,7 +5,6 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"errors"
-	"fmt"
 	"io"
 	"log/slog"
 	"os"
@@ -15,11 +14,12 @@ import (
 )

 type Manifest struct {
-	SchemaVersion int     `json:"schemaVersion"`
-	MediaType     string  `json:"mediaType"`
-	Config        Layer   `json:"config"`
-	Layers        []Layer `json:"layers"`
+	SchemaVersion int      `json:"schemaVersion"`
+	MediaType     string   `json:"mediaType"`
+	Config        *Layer   `json:"config"`
+	Layers        []*Layer `json:"layers"`

+	name     model.Name
 	filepath string
 	fi       os.FileInfo
 	digest   string
@@ -48,12 +48,10 @@ func (m *Manifest) Remove() error {

 func (m *Manifest) RemoveLayers() error {
 	for _, layer := range append(m.Layers, m.Config) {
-		if layer.Digest != "" {
-			if err := layer.Remove(); errors.Is(err, os.ErrNotExist) {
-				slog.Debug("layer does not exist", "digest", layer.Digest)
-			} else if err != nil {
-				return err
-			}
+		if err := layer.Remove(); errors.Is(err, os.ErrNotExist) {
+			slog.Debug("layer does not exist", "digest", layer.Digest)
+		} else if err != nil {
+			return err
 		}
 	}

@@ -72,7 +70,6 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) {

 	p := filepath.Join(manifests, n.Filepath())

-	var m Manifest
 	f, err := os.Open(p)
 	if err != nil {
 		return nil, err
@@ -84,11 +81,13 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) {
 		return nil, err
 	}

+	var m Manifest
 	sha256sum := sha256.New()
 	if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&m); err != nil {
 		return nil, err
 	}

+	m.name = n
 	m.filepath = p
 	m.fi = fi
 	m.digest = hex.EncodeToString(sha256sum.Sum(nil))
@@ -96,7 +95,7 @@ func ParseNamedManifest(n model.Name) (*Manifest, error) {
 	return &m, nil
 }

-func WriteManifest(name model.Name, config Layer, layers []Layer) error {
+func WriteManifest(name model.Name, config *Layer, layers []*Layer) error {
 	manifests, err := GetManifestPath()
 	if err != nil {
 		return err
@@ -151,16 +150,14 @@ func Manifests() (map[model.Name]*Manifest, error) {

 			n := model.ParseNameFromFilepath(rel)
 			if !n.IsValid() {
-				slog.Warn("bad manifest name", "path", rel)
+				slog.Warn("bad manifest name", "path", rel, "error", err)
 				continue
 			}

 			m, err := ParseNamedManifest(n)
-			if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
+			if err != nil {
 				slog.Warn("bad manifest", "name", n, "error", err)
 				continue
-			} else if err != nil {
-				return nil, fmt.Errorf("%s: %w", n, err)
 			}

 			ms[n] = m
--- a/server/model.go
+++ b/server/model.go
@@ -26,7 +26,7 @@ import (
 var intermediateBlobs map[string]string = make(map[string]string)

 type layerGGML struct {
-	Layer
+	*Layer
 	*llm.GGML
 }

@@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 	return layers, nil
 }

-func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 	fi, err := f.Stat()
 	if err != nil {
 		return nil, err
@@ -108,38 +108,16 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
 	defer t.Close()
 	defer os.Remove(t.Name())

-	var layerType string
-
-	switch command {
-	case "adapter":
-		var baseModel *llm.GGML
-		for _, l := range baseLayers {
-			if l.GGML != nil {
-				baseModel = l.GGML
-				break
-			}
-		}
-
-		if baseModel == nil {
-			return nil, fmt.Errorf("no base model specified for the adapter")
-		}
-
-		if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
-			return nil, err
-		}
-		layerType = "application/vnd.ollama.image.adapter"
-	case "model":
-		if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
-			return nil, err
-		}
-		layerType = "application/vnd.ollama.image.model"
+	fn(api.ProgressResponse{Status: "converting model"})
+	if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil {
+		return nil, err
 	}

 	if _, err := t.Seek(0, io.SeekStart); err != nil {
 		return nil, err
 	}

-	layer, err := NewLayer(t, layerType)
+	layer, err := NewLayer(t, "application/vnd.ollama.image.model")
 	if err != nil {
 		return nil, err
 	}
@@ -161,7 +139,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
 	return detectChatTemplate(layers)
 }

-func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 	sr := io.NewSectionReader(file, 0, 512)
 	contentType, err := detectContentType(sr)
 	if err != nil {
@@ -172,7 +150,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,
 	case "gguf", "ggla":
 		// noop
 	case "application/zip":
-		return parseFromZipFile(ctx, command, baseLayers, file, digest, fn)
+		return parseFromZipFile(ctx, file, digest, fn)
 	default:
 		return nil, fmt.Errorf("unsupported content type: %s", contentType)
 	}
@@ -192,26 +170,15 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,
 		}

 		mediatype := "application/vnd.ollama.image.model"
-		if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" {
+		if ggml.Name() == "ggla" {
 			mediatype = "application/vnd.ollama.image.adapter"
 		} else if ggml.KV().Architecture() == "clip" {
 			mediatype = "application/vnd.ollama.image.projector"
 		}

-		var layer Layer
-		if digest != "" && n == stat.Size() && offset == 0 {
-			layer, err = NewLayerFromLayer(digest, mediatype, file.Name())
-			if err != nil {
-				slog.Debug("could not create new layer from layer", "error", err)
-			}
-		}
-
-		// Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size())
-		if layer.Digest == "" {
-			layer, err = NewLayer(io.NewSectionReader(file, offset, n), mediatype)
-			if err != nil {
-				return nil, err
-			}
+		layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype)
+		if err != nil {
+			return nil, err
 		}

 		layers = append(layers, &layerGGML{layer, ggml})
--- a/server/model_test.go
+++ b/server/model_test.go
@@ -2,10 +2,8 @@ package server

 import (
 	"bytes"
-	"context"
 	"encoding/json"
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
 	"testing"
@@ -13,7 +11,6 @@ import (
 	"github.com/google/go-cmp/cmp"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/template"
 )

@@ -136,82 +133,3 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,
 		})
 	}
 }
-
-func TestParseFromFileFromLayer(t *testing.T) {
-	tempModels := t.TempDir()
-
-	file, err := os.CreateTemp(tempModels, "")
-	if err != nil {
-		t.Fatalf("failed to open file: %v", err)
-	}
-	defer file.Close()
-	if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
-		t.Fatalf("failed to write gguf: %v", err)
-	}
-
-	if _, err := file.Seek(0, io.SeekStart); err != nil {
-		t.Fatalf("failed to seek to start: %v", err)
-	}
-
-	layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {})
-	if err != nil {
-		t.Fatalf("failed to parse from file: %v", err)
-	}
-
-	if len(layers) != 1 {
-		t.Fatalf("got %d != want 1", len(layers))
-	}
-
-	if _, err := file.Seek(0, io.SeekStart); err != nil {
-		t.Fatalf("failed to seek to start: %v", err)
-	}
-
-	layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {})
-	if err != nil {
-		t.Fatalf("failed to parse from file: %v", err)
-	}
-	if len(layers2) != 1 {
-		t.Fatalf("got %d != want 1", len(layers2))
-	}
-
-	if layers[0].Digest != layers2[0].Digest {
-		t.Fatalf("got %s != want %s", layers[0].Digest, layers2[0].Digest)
-	}
-
-	if layers[0].Size != layers2[0].Size {
-		t.Fatalf("got %d != want %d", layers[0].Size, layers2[0].Size)
-	}
-
-	if layers[0].MediaType != layers2[0].MediaType {
-		t.Fatalf("got %v != want %v", layers[0].MediaType, layers2[0].MediaType)
-	}
-}
-
-func TestParseLayerFromCopy(t *testing.T) {
-	tempModels := t.TempDir()
-
-	file2, err := os.CreateTemp(tempModels, "")
-	if err != nil {
-		t.Fatalf("failed to open file: %v", err)
-	}
-	defer file2.Close()
-
-	for range 5 {
-		if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
-			t.Fatalf("failed to write gguf: %v", err)
-		}
-	}
-
-	if _, err := file2.Seek(0, io.SeekStart); err != nil {
-		t.Fatalf("failed to seek to start: %v", err)
-	}
-
-	layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {})
-	if err != nil {
-		t.Fatalf("failed to parse from file: %v", err)
-	}
-
-	if len(layers) != 5 {
-		t.Fatalf("got %d != want 5", len(layers))
-	}
-}
--- a/server/routes.go
+++ b/server/routes.go
@@ -23,7 +23,6 @@ import (

 	"github.com/gin-contrib/cors"
 	"github.com/gin-gonic/gin"
-	"golang.org/x/sync/errgroup"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
@@ -324,10 +323,13 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 			input = append(input, v.(string))
 		}
 	default:
-		if req.Input != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"})
-			return
-		}
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"})
+		return
+	}
+
+	if len(input) == 0 {
+		c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}})
+		return
 	}

 	r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive)
@@ -338,18 +340,12 @@ func (s *Server) EmbedHandler(c *gin.Context) {

 	checkpointLoaded := time.Now()

-	if len(input) == 0 {
-		c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}})
-		return
-	}
-
 	kvData, err := getKVData(m.ModelPath, false)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

-	var count int
 	for i, s := range input {
 		tokens, err := r.Tokenize(c.Request.Context(), s)
 		if err != nil {
@@ -372,36 +368,25 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 			}
 		}

-		count += len(tokens)
-
 		input[i] = s
 	}
-
-	var g errgroup.Group
-	embeddings := make([][]float32, len(input))
-	for i, text := range input {
-		g.Go(func() error {
-			embedding, err := r.Embedding(c.Request.Context(), text)
-			if err != nil {
-				return err
-			}
-			embeddings[i] = normalize(embedding)
-			return nil
-		})
+	embeddings, err := r.Embed(c.Request.Context(), input)
+	if err != nil {
+		slog.Error("embedding generation failed", "error", err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
+		return
 	}

-	if err := g.Wait(); err != nil {
-		slog.Error("embedding generation failed", "error", err)
-		c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embeddings: %v", err)})
-		return
+	for i, e := range embeddings.Embedding {
+		embeddings.Embedding[i] = normalize(e)
 	}

 	resp := api.EmbedResponse{
 		Model:           req.Model,
-		Embeddings:      embeddings,
+		Embeddings:      embeddings.Embedding,
 		TotalDuration:   time.Since(checkpointStart),
 		LoadDuration:    checkpointLoaded.Sub(checkpointStart),
-		PromptEvalCount: count,
+		PromptEvalCount: embeddings.PromptEvalCount,
 	}
 	c.JSON(http.StatusOK, resp)
 }
@@ -445,20 +430,21 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
+	embeddings, err := r.Embed(c.Request.Context(), []string{req.Prompt})
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
 		return
 	}

-	var e []float64
-	for _, v := range embedding {
-		e = append(e, float64(v))
+	embedding := make([]float64, len(embeddings.Embedding[0]))
+
+	for i, v := range embeddings.Embedding[0] {
+		embedding[i] = float64(v)
 	}

 	resp := api.EmbeddingResponse{
-		Embedding: e,
+		Embedding: embedding,
 	}
 	c.JSON(http.StatusOK, resp)
 }
@@ -717,6 +703,153 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, resp)
 }

+// manifestLayers summarizes manifest m as a JSON-serializable map: name, digest,
+// size and modified_at, plus one entry per layer keyed by the layer media type
+// without its "application/vnd.ollama.image." prefix. Layers whose key is in
+// exclude (or all layers, for "all") are skipped; "details" or "<key>.details"
+// omits the GGML key/value dump. The first open or decode error is returned.
+func manifestLayers(m *Manifest, exclude []string) (map[string]any, error) {
+	r := map[string]any{
+		"name":        m.name.DisplayShortest(),
+		"digest":      m.digest,
+		"size":        m.Size(),
+		"modified_at": m.fi.ModTime(),
+	}
+
+	excludeAll := slices.Contains(exclude, "all")
+	excludeDetails := slices.Contains(exclude, "details")
+	// sentinel: the layer was filtered out by exclude, which is not a failure
+	errExcludeKey := errors.New("exclude key")
+
+	for _, layer := range m.Layers {
+		key, content, err := func() (string, any, error) {
+			key := strings.TrimPrefix(layer.MediaType, "application/vnd.ollama.image.")
+			if slices.Contains(exclude, key) || excludeAll {
+				return "", nil, errExcludeKey
+			}
+
+			f, err := layer.Open()
+			if err != nil {
+				return "", nil, err
+			}
+			defer f.Close()
+
+			switch key {
+			case "model", "projector", "adapter":
+				ggml, _, err := llm.DecodeGGML(f, 0)
+				if err != nil {
+					return "", nil, err
+				}
+				kv := ggml.KV()
+				content := map[string]any{
+					"architecture":    kv.Architecture(),
+					"file_type":       kv.FileType().String(),
+					"parameter_count": kv.ParameterCount(),
+				}
+				if !slices.Contains(exclude, key+".details") && !excludeAll && !excludeDetails {
+					// exclude any extraneous or redundant fields
+					for _, k := range []string{
+						"general.basename", "general.description", "general.filename", "general.finetune",
+						"general.languages", "general.license", "general.license.link", "general.name",
+						"general.parameter_count", "general.size_label", "general.tags", "general.type",
+						"general.quantization_version", "tokenizer.chat_template",
+					} {
+						delete(kv, k)
+					}
+					content["details"] = kv
+				}
+
+				return key, content, nil
+			case "params", "messages":
+				var content any
+				if err := json.NewDecoder(f).Decode(&content); err != nil {
+					return "", nil, err
+				}
+				return key, content, nil
+			case "template", "system", "license":
+				bts, err := io.ReadAll(f)
+				if err != nil {
+					return "", nil, err
+				}
+				if key == "license" {
+					return key, []any{string(bts)}, nil
+				}
+
+				return key, string(bts), nil
+			}
+
+			return layer.MediaType, nil, nil
+		}()
+		if errors.Is(err, errExcludeKey) {
+			continue
+		} else if err != nil {
+			return nil, err
+		}
+
+		if s, ok := r[key].([]any); !ok {
+			r[key] = content
+		} else if c, ok := content.([]any); ok {
+			// flatten list content (e.g. a second license) instead of nesting it
+			r[key] = append(s, c...)
+		} else {
+			r[key] = append(s, content)
+		}
+	}
+
+	return r, nil
+}
+
+// GetModelsHandler responds with a JSON array of manifestLayers summaries for
+// every manifest from Manifests, newest first; any failure yields a 500.
+func (s *Server) GetModelsHandler(c *gin.Context) {
+	ms, err := Manifests()
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	// non-nil so that an empty result encodes as [] rather than null
+	rs := make([]map[string]any, 0, len(ms))
+	exclude := c.QueryArray("exclude")
+	for _, m := range ms {
+		r, err := manifestLayers(m, exclude)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
+		}
+		rs = append(rs, r)
+	}
+
+	slices.SortStableFunc(rs, func(i, j map[string]any) int {
+		// most recently modified first
+		return cmp.Compare(j["modified_at"].(time.Time).Unix(), i["modified_at"].(time.Time).Unix())
+	})
+
+	c.JSON(http.StatusOK, rs)
+}
+
+// GetModelHandler responds with the manifestLayers summary of the model named by
+// the "model" path parameter: 400 for an invalid name, 500 on any other failure.
+func (s *Server) GetModelHandler(c *gin.Context) {
+	name := model.ParseName(strings.TrimPrefix(c.Param("model"), "/"))
+	if !name.IsValid() {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid model name"})
+		return
+	}
+
+	var summary map[string]any
+	manifest, err := ParseNamedManifest(name)
+	if err == nil {
+		summary, err = manifestLayers(manifest, c.QueryArray("exclude"))
+	}
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	c.JSON(http.StatusOK, summary)
+}
+
 func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	m, err := GetModel(req.Model)
 	if err != nil {
@@ -838,20 +971,17 @@ func (s *Server) ListModelsHandler(c *gin.Context) {

 	models := []api.ListModelResponse{}
 	for n, m := range ms {
+		f, err := m.Config.Open()
+		if err != nil {
+			slog.Warn("bad manifest filepath", "name", n, "error", err)
+			continue
+		}
+		defer f.Close()
+
 		var cf ConfigV2
-
-		if m.Config.Digest != "" {
-			f, err := m.Config.Open()
-			if err != nil {
-				slog.Warn("bad manifest filepath", "name", n, "error", err)
-				continue
-			}
-			defer f.Close()
-
-			if err := json.NewDecoder(f).Decode(&cf); err != nil {
-				slog.Warn("bad manifest config", "name", n, "error", err)
-				continue
-			}
+		if err := json.NewDecoder(f).Decode(&cf); err != nil {
+			slog.Warn("bad manifest config", "name", n, "error", err)
+			continue
 		}

 		// tag should never be masked
@@ -1107,6 +1237,9 @@ func (s *Server) GenerateRoutes() http.Handler {
 			c.String(http.StatusOK, "Ollama is running")
 		})

+		r.Handle(method, "/api/models", s.GetModelsHandler)
+		r.Handle(method, "/api/models/*model", s.GetModelHandler)
+
 		r.Handle(method, "/api/tags", s.ListModelsHandler)
 		r.Handle(method, "/api/version", func(c *gin.Context) {
 			c.JSON(http.StatusOK, gin.H{"version": version.Version})
--- a/server/routes_delete_test.go
+++ b/server/routes_delete_test.go
@@ -98,7 +98,7 @@ func TestDeleteDuplicateLayers(t *testing.T) {
 	}

 	// create a manifest with duplicate layers
-	if err := WriteManifest(n, config, []Layer{config}); err != nil {
+	if err := WriteManifest(n, config, []*Layer{config}); err != nil {
 		t.Fatal(err)
 	}

--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -272,6 +272,76 @@ func Test_Routes(t *testing.T) {
 				assert.Equal(t, "library", retrieveResp.OwnedBy)
 			},
 		},
+		{
+			Name:   "Embed Handler Empty Input",
+			Method: http.MethodPost,
+			Path:   "/api/embed",
+			Setup: func(t *testing.T, req *http.Request) {
+				embedReq := api.EmbedRequest{
+					Model: "t-bone",
+					Input: "",
+				}
+				jsonData, err := json.Marshal(embedReq)
+				require.NoError(t, err)
+				req.Body = io.NopCloser(bytes.NewReader(jsonData))
+			},
+			Expected: func(t *testing.T, resp *http.Response) {
+				contentType := resp.Header.Get("Content-Type")
+				if contentType != "application/json; charset=utf-8" {
+					t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType)
+				}
+				body, err := io.ReadAll(resp.Body)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				var embedResp api.EmbedResponse
+				err = json.Unmarshal(body, &embedResp)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				if embedResp.Model != "t-bone" {
+					t.Fatalf("expected model t-bone, got %s", embedResp.Model)
+				}
+
+				if embedResp.Embeddings == nil {
+					t.Fatalf("expected embeddings to not be nil, got %v", embedResp.Embeddings)
+				}
+
+				if len(embedResp.Embeddings) != 0 {
+					t.Fatalf("expected embeddings to be empty, got %v", embedResp.Embeddings)
+				}
+			},
+		},
+		{
+			Name:   "Embed Handler Invalid Input",
+			Method: http.MethodPost,
+			Path:   "/api/embed",
+			Setup: func(t *testing.T, req *http.Request) {
+				embedReq := api.EmbedRequest{
+					Model: "t-bone",
+					Input: 2,
+				}
+				jsonData, err := json.Marshal(embedReq)
+				require.NoError(t, err)
+				req.Body = io.NopCloser(bytes.NewReader(jsonData))
+			},
+			Expected: func(t *testing.T, resp *http.Response) {
+				contentType := resp.Header.Get("Content-Type")
+				if contentType != "application/json; charset=utf-8" {
+					t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType)
+				}
+				_, err := io.ReadAll(resp.Body)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				if resp.StatusCode != http.StatusBadRequest {
+					t.Fatalf("expected status code 400, got %d", resp.StatusCode)
+				}
+			},
+		},
 	}

 	t.Setenv("OLLAMA_MODELS", t.TempDir())
--- a/server/sched.go
+++ b/server/sched.go
@@ -193,11 +193,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}

-					// Embedding models should always be loaded with parallel=1
-					if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
-						numParallel = 1
-					}
-
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
 						// simplifying assumption of defaultParallel when in CPU mode
@@ -423,7 +418,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
 		// check for model compatibility
-		if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
+		if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 		}
 		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
@@ -739,10 +734,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
-	if *numParallel <= 0 {
-		*numParallel = 1
-		req.opts.NumCtx = req.origNumCtx
-	}
+	*numParallel = 1
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {
 		return gpus
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -117,6 +117,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est

 	require.NoError(t, llm.WriteGGUF(f, llm.KV{
 		"general.architecture":          "llama",
+		"general.name":                  "name",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.block_count":             uint32(1),
@@ -707,8 +708,8 @@ type mockLlm struct {
 	pingResp           error
 	waitResp           error
 	completionResp     error
-	embeddingResp      []float32
-	embeddingRespErr   error
+	embedResp          *llm.EmbedResponse
+	embedRespErr       error
 	tokenizeResp       []int
 	tokenizeRespErr    error
 	detokenizeResp     string
@@ -726,8 +727,8 @@ func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn
 	return s.completionResp
 }

-func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
-	return s.embeddingResp, s.embeddingRespErr
+func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
+	return s.embedResp, s.embedRespErr
 }

 func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
--- a/server/sparse_common.go
+++ b/server/sparse_common.go
@@ -4,5 +4,6 @@ package server

 import "os"

-func setSparse(*os.File) {
+func setSparse(file *os.File) error {
+	return nil
 }
--- a/server/sparse_windows.go
+++ b/server/sparse_windows.go
@@ -6,9 +6,8 @@ import (
 	"golang.org/x/sys/windows"
 )

-func setSparse(file *os.File) {
-	// exFat (and other FS types) don't support sparse files, so ignore errors
-	windows.DeviceIoControl( //nolint:errcheck
+func setSparse(file *os.File) error {
+	return windows.DeviceIoControl(
 		windows.Handle(file.Fd()), windows.FSCTL_SET_SPARSE,
 		nil, 0,
 		nil, 0,
--- a/server/upload.go
+++ b/server/upload.go
@@ -26,7 +26,7 @@ import (
 var blobUploadManager sync.Map

 type blobUpload struct {
-	Layer
+	*Layer

 	Total     int64
 	Completed atomic.Int64
@@ -45,7 +45,7 @@ type blobUpload struct {
 }

 const (
-	numUploadParts          = 16
+	numUploadParts          = 64
 	minUploadPartSize int64 = 100 * format.MegaByte
 	maxUploadPartSize int64 = 1000 * format.MegaByte
 )
@@ -362,7 +362,7 @@ func (p *progressWriter) Rollback() {
 	p.written = 0
 }

-func uploadBlob(ctx context.Context, mp ModelPath, layer Layer, opts *registryOptions, fn func(api.ProgressResponse)) error {
+func uploadBlob(ctx context.Context, mp ModelPath, layer *Layer, opts *registryOptions, fn func(api.ProgressResponse)) error {
 	requestURL := mp.BaseURL()
 	requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs", layer.Digest)

--- a/types/model/name.go
+++ b/types/model/name.go
@@ -219,7 +219,7 @@ func (n Name) String() string {
 	return b.String()
 }

-// DisplayShortest returns a short string version of the name.
+// DisplayShortest returns a short string version of the name.
 func (n Name) DisplayShortest() string {
 	var sb strings.Builder