Compare commits
6 commits: main...brucemacd/

Commits:
c2b11611a8
90698c7d15
4b4a5a28bf
3c95c21ddf
8ab13e4d3e
144f63e2fb
.gitattributes (vendored): 9 additions

@@ -7,5 +7,14 @@ llama/**/*.cuh linguist-vendored
 llama/**/*.m linguist-vendored
 llama/**/*.metal linguist-vendored
+ml/backend/**/*.c linguist-vendored
+ml/backend/**/*.h linguist-vendored
+ml/backend/**/*.cpp linguist-vendored
+ml/backend/**/*.hpp linguist-vendored
+ml/backend/**/*.cu linguist-vendored
+ml/backend/**/*.cuh linguist-vendored
+ml/backend/**/*.m linguist-vendored
+ml/backend/**/*.metal linguist-vendored
+
 * text=auto
 *.go text eol=lf
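These patterns only tell GitHub's Linguist to treat the new ml/backend sources as vendored; to confirm a pattern matches, `git check-attr` can be queried directly. A quick sketch (the file path is illustrative):

```bash
# Ask git which attributes apply to a vendored backend source file.
git check-attr linguist-vendored -- ml/backend/ggml/ggml/src/ggml.c
# Expected output when the pattern matches:
#   ml/backend/ggml/ggml/src/ggml.c: linguist-vendored: set
```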
.github/workflows/release.yaml (vendored): 270 changes

@@ -478,243 +478,77 @@ jobs:
          dist/OllamaSetup.exe
          dist/ollama-windows-*.zip

Removed: the build-linux-amd64 and build-linux-arm64 jobs (container-based builds that ran ./scripts/build_linux.sh and uploaded dist-linux-* artifacts, including the manual Docker install on the arm64 runner), the build-container-image job (per-architecture images pushed by digest), the merge job (docker buildx imagetools create of the multi-arch manifest and inspection), and the build-container-image-rocm job (amd64-only image built with target: runtime-rocm).

Added in their place:

  build-linux:
    environment: release
    runs-on: linux
    strategy:
      matrix:
        include:
          - os: linux
            arch: amd64
            targets: [archive, rocm]
          - os: linux
            arch: arm64
            targets: [archive]
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-qemu-action@v3
      - uses: docker/setup-buildx-action@v3
      - run: |
          apt-get update && apt-get install pigz
          for TARGET in ${{ matrix.targets }}; do docker buildx build --platform $PLATFORM --target $TARGET --output type=local,dest=dist/$PLATFORM .; done
          tar c -C dist/$PLATFORM . | pigz -9cv >dist/ollama-${PLATFORM//\//-}.tar.gz
        env:
          PLATFORM: ${{ matrix.os }}/${{ matrix.arch }}
      - uses: actions/upload-artifact@v4
        with:
          name: dist-${{ matrix.os }}-${{ matrix.arch }}
          path: |
            dist/ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.gz

  build-docker:
    environment: release
    runs-on: linux
    strategy:
      matrix:
        include:
          - flavor: |
              latest=auto
            platforms: linux/amd64,linux/arm64
            build-args: [GOFLAGS]
          - flavor: |
              suffix=-rocm,onlatest=false
            platforms: linux/amd64
            build-args: [GOFLAGS, FLAVOR=rocm]
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-qemu-action@v2
      - uses: docker/setup-buildx-action@v2
      - uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_USER }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - id: metadata
        uses: docker/metadata-action@v4
        with:
          flavor: ${{ matrix.flavor }}
          images: |
            ollama/ollama
          tags: |
            type=semver,pattern={{version}}
      - uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.platforms }}
          build-args: ${{ matrix.build-args }}
          tags: ${{ steps.metadata.outputs.tags }}
          labels: ${{ steps.metadata.outputs.labels }}
          cache-from: type=registry,ref=ollama/ollama:latest
          cache-to: type=inline
          provenance: false
        env:
          GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ steps.metadata.outputs.version }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"

  # Aggregate all the assets and ship a release
  release:
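The new build-linux job is just a loop over Dockerfile targets, so the same archive can be reproduced outside CI; a minimal sketch, assuming Docker with buildx and pigz are available locally:

```bash
# Mirror the build-linux matrix entry for linux/amd64 with the "archive" target.
PLATFORM=linux/amd64
docker buildx build --platform $PLATFORM --target archive \
  --output type=local,dest=dist/$PLATFORM .
tar c -C dist/$PLATFORM . | pigz -9cv > dist/ollama-${PLATFORM//\//-}.tar.gz
```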
.github/workflows/test.yaml (vendored): 313 changes

@@ -1,11 +1,5 @@
 name: test
-
-env:
-  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
-  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
-  CUDA_12_WINDOWS_URL: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
-  CUDA_12_WINDOWS_VER: 12.4
-
 concurrency:
   # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
   # cancels running CI jobs and starts all new ones.

@@ -27,7 +21,7 @@ jobs:
   changes:
     runs-on: ubuntu-latest
     outputs:
-      RUNNERS: ${{ steps.changes.outputs.RUNNERS }}
+      changed: ${{ steps.changes.outputs.changed }}
     steps:
       - uses: actions/checkout@v4
         with:

@@ -35,309 +29,66 @@ jobs:
Removed: the runners-linux-cuda, runners-linux-rocm, runners-windows-rocm, runners-windows-cuda, runners-cpu, and lint jobs (make-based runner builds, ROCm/CUDA installer caching, MSYS2 setup, and the per-OS/arch matrices), along with the old RUNNERS-based change detection.

Added in their place:

      - id: changes
        run: |
          changed() {
            local BASE=${{ github.event.pull_request.base.sha }}
            local HEAD=${{ github.event.pull_request.head.sha }}
            local MERGE_BASE=$(git merge-base $BASE $HEAD)
            git diff-tree -r --no-commit-id --name-only "$MERGE_BASE" "$HEAD" \
              | xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
          }

          echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | tee -a $GITHUB_OUTPUT

  linux:
    needs: [changes]
    if: ${{ needs.changes.outputs.changed == 'True' }}
    strategy:
      matrix:
        include:
          - container: nvidia/cuda:11.8.0-devel-ubuntu22.04
            preset: CUDA
          - container: rocm/dev-ubuntu-22.04:6.1.2
            preset: ROCm
            extra-packages: rocm-libs
    runs-on: ubuntu-latest
    container: ${{ matrix.container }}
    steps:
      - uses: actions/checkout@v4
      - run: |
          apt-get update
          apt-get install -y cmake pkg-config ccache ${{ matrix.extra-packages }}
          ccache -o cache_dir=${{ github.workspace }}\.ccache
        env:
          DEBIAN_FRONTEND: noninteractive
      - uses: actions/cache@v4
        with:
          path: ${{ github.workspace }}\.ccache
          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
      - run: |
          cmake --preset ${{ matrix.preset }}
          cmake --build --preset ${{ matrix.preset }} --parallel

  test:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
    runs-on: ${{ matrix.os }}
    env:
      CGO_ENABLED: '1'
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
      - uses: golangci/golangci-lint-action@v6
        with:
          args: --timeout 10m0s -v
      - run: go test ./...

  patches:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Verify patches apply cleanly and do not change files
        run: |
          make -f Makefile2 clean checkout sync
          git diff --compact-summary --exit-code
CMakeLists.txt: new file (54 lines)

cmake_minimum_required(VERSION 3.21)

project(Ollama C CXX)

include(CheckLanguage)

find_package(Threads REQUIRED)

set(CMAKE_BUILD_TYPE Release)
set(BUILD_SHARED_LIBS ON)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

set(GGML_BUILD ON)
set(GGML_SHARED ON)
set(GGML_CCACHE ON)
set(GGML_BACKEND_DL ON)
set(GGML_BACKEND_SHARED ON)
set(GGML_SCHED_MAX_COPIES 4)

set(GGML_LLAMAFILE ON)
set(GGML_CPU_ALL_VARIANTS ON)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
set(GGML_CUDA_GRAPHS ON)

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)

set(GGML_CPU ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)

check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES)
        set(CMAKE_CUDA_ARCHITECTURES "native")
    endif()

    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
endif()

check_language(HIP)
if(CMAKE_HIP_COMPILER)
    set(HIP_PLATFORM "amd")

    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
endif()
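Because the CUDA and HIP subdirectories are added only when check_language() finds a working compiler, a plain configure on a machine without GPU toolkits still produces the CPU backend; a minimal sketch, assuming CMake 3.21 or newer:

```bash
# Configure and build from the repository root. GPU sections are skipped
# automatically when no CUDA or HIP compiler is detected.
cmake -B build
cmake --build build --parallel
ls build/lib/   # libggml-base.so and libggml-cpu-*.so, plus GPU backends when present
```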
CMakePresets.json: new file (109 lines)

{
  "version": 3,
  "configurePresets": [
    {
      "name": "Default",
      "binaryDir": "${sourceDir}/build",
      "cacheVariables": {
        "CMAKE_BUILD_TYPE": "Release"
      }
    },
    {
      "name": "CPU",
      "inherits": [ "Default" ]
    },
    {
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
    {
      "name": "CUDA 11",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;62;70;72;75;80;86"
      }
    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "60;61;62;70;72;75;80;86;87;89;90;90a"
      }
    },
    {
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "72;87"
      }
    },
    {
      "name": "JetPack 6",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "87"
      }
    },
    {
      "name": "ROCm",
      "inherits": [ "Default" ],
      "cacheVariables": {
        "CMAKE_HIP_PLATFORM": "amd"
      }
    },
    {
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "cacheVariables": {
        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
      }
    }
  ],
  "buildPresets": [
    {
      "name": "Default",
      "configurePreset": "Default",
      "configuration": "Release"
    },
    {
      "name": "CPU",
      "configurePreset": "Default",
      "targets": [ "ggml-cpu" ]
    },
    {
      "name": "CUDA",
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
    {
      "name": "CUDA 11",
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 11"
    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 12"
    },
    {
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],
      "configurePreset": "JetPack 5"
    },
    {
      "name": "JetPack 6",
      "inherits": [ "CUDA" ],
      "configurePreset": "JetPack 6"
    },
    {
      "name": "ROCm",
      "configurePreset": "ROCm",
      "targets": [ "ggml-hip" ]
    },
    {
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "configurePreset": "ROCm 6"
    }
  ]
}
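With the presets in place, each backend is configured and built by name, which is how the Dockerfile and CI invoke them; for example:

```bash
# Configure and build only the CUDA backend (assumes a CUDA 12 toolkit on PATH).
cmake --preset 'CUDA 12'
cmake --build --preset 'CUDA 12' --parallel
# Shared libraries land in build/lib/ per the output directories set in CMakeLists.txt.
```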
Dockerfile: 316 changes

@@ -1,201 +1,161 @@
Removed: the GOLANG_VERSION/CUDA_VERSION_11/CUDA_VERSION_12/ROCM_VERSION/JETPACK_5/JETPACK_6 args, the unified-builder-amd64 and unified-builder-arm64 stages, the make-based build-amd64/build-arm64 and runners-jetpack5/runners-jetpack6 stages, the dist-amd64/dist-arm64/dist and runners-cuda-amd64/runners-rocm-amd64 filter stages, and the runtime-amd64, runtime-arm64 and runtime-rocm images ending in FROM runtime-$TARGETARCH.

Added in their place (a CMake-preset-driven multi-stage build):

# vim: filetype=dockerfile

ARG FLAVOR=${TARGETARCH}

ARG ROCMVERSION=6.1.2
ARG JETPACK5VERSION=r35.4.1
ARG JETPACK6VERSION=r36.2.0
ARG CMAKEVERSION=3.31.2

FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCMVERSION}-complete AS base-amd64
RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
    && yum install -y yum-utils devtoolset-10-gcc devtoolset-10-gcc-c++ \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo \
    && curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:/opt/rh/devtoolset-11/root/usr/bin:$PATH

FROM --platform=linux/arm64 rockylinux:8 AS base-arm64
# install epel-release for ccache
RUN yum install -y yum-utils epel-release \
    && yum install -y clang ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
ENV CC=clang CXX=clang++

FROM base-${TARGETARCH} AS base
ARG CMAKEVERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
ENV LDFLAGS=-s

FROM base AS cpu
# amd64 uses gcc which requires devtoolset-11 for AVX extensions while arm64 uses clang
RUN if [ "$(uname -m)" = "x86_64" ]; then yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++; fi
ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CPU' && cmake --build --parallel --preset 'CPU'

FROM base AS cuda-11
ARG CUDA11VERSION=11.3
RUN yum install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 11' && cmake --build --parallel --preset 'CUDA 11'

FROM base AS cuda-12
ARG CUDA12VERSION=12.4
RUN yum install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 12' && cmake --build --parallel --preset 'CUDA 12'

FROM base AS rocm-6
RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'ROCm 6' && cmake --build --parallel --preset 'ROCm 6'

FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
ARG CMAKEVERSION
RUN apt-get update && apt-get install -y curl ccache \
    && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'JetPack 5' && cmake --build --parallel --preset 'JetPack 5'

FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
(same steps as jetpack-5, building the 'JetPack 6' preset)

FROM base AS build
ARG GOVERSION=1.23.4
RUN curl -fsSL https://golang.org/dl/go${GOVERSION}.linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
ENV PATH=/usr/local/go/bin:$PATH
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
ARG GOFLAGS="'-ldflags=-w -s'"
ENV CGO_ENABLED=1
RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-11 --chmod=644 build/lib/libggml-cuda.so /usr/local/cuda/lib64/libcublas.so.11 /usr/local/cuda/lib64/libcublasLt.so.11 /usr/local/cuda/lib64/libcudart.so.11.0 /lib/ollama/cuda_v11/
COPY --from=cuda-12 --chmod=644 build/lib/libggml-cuda.so /usr/local/cuda/lib64/libcublas.so.12 /usr/local/cuda/lib64/libcublasLt.so.12 /usr/local/cuda/lib64/libcudart.so.12 /lib/ollama/cuda_v12/

FROM --platform=linux/arm64 scratch AS arm64
(same cuda-11 and cuda-12 copies as the amd64 stage, plus)
COPY --from=jetpack-5 --chmod=644 build/lib/libggml-cuda.so /usr/local/cuda/lib64/libcublas.so.11 /usr/local/cuda/lib64/libcublasLt.so.11 /usr/local/cuda/lib64/libcudart.so.11.0 /lib/ollama/cuda_jetpack5/
COPY --from=jetpack-6 --chmod=644 build/lib/libggml-cuda.so /usr/local/cuda/lib64/libcublas.so.12 /usr/local/cuda/lib64/libcublasLt.so.12 /usr/local/cuda/lib64/libcudart.so.12 /lib/ollama/cuda_jetpack6/

FROM --platform=linux/arm64 scratch AS rocm
COPY --from=rocm-6 --chmod=644 build/lib/libggml-hip.so /opt/rocm/lib/libamdhip64.so.6 /opt/rocm/lib/libhipblas.so.2 /opt/rocm/lib/librocblas.so.4 /opt/rocm/lib/libamd_comgr.so.2 /opt/rocm/lib/libhsa-runtime64.so.1 /opt/rocm/lib/librocprofiler-register.so.0 /opt/amdgpu/lib64/libdrm_amdgpu.so.1 /opt/amdgpu/lib64/libdrm.so.2 /usr/lib64/libnuma.so.1 /lib/ollama/rocm/
COPY --from=rocm-6 /opt/rocm/lib/rocblas/ /lib/ollama/rocm/rocblas/

FROM ${FLAVOR} AS archive
COPY --from=cpu --chmod=644 build/lib/libggml-base.so build/lib/libggml-cpu-*.so /lib/ollama/
COPY --from=build /bin/ollama /bin/ollama

FROM ubuntu:20.04
RUN apt-get update \
    && apt-get install -y ca-certificates \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
COPY --from=archive /bin/ /usr/bin/
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
COPY --from=archive /lib/ollama/ /usr/lib/ollama/
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/ollama
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_VISIBLE_DEVICES=all
ENV OLLAMA_HOST=0.0.0.0:11434
EXPOSE 11434
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
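The FLAVOR build argument picks which scratch stage feeds the archive stage, so the ROCm variant of the image can be built locally the same way the build-docker job does; a sketch, assuming Docker with BuildKit enabled (the image tags are illustrative):

```bash
# Default image: GPU libraries for the host architecture (FLAVOR defaults to TARGETARCH).
docker build -t ollama/ollama:dev .

# ROCm flavor: bundle the rocm stage's libraries instead.
docker build --build-arg FLAVOR=rocm -t ollama/ollama:dev-rocm .
```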
103
Makefile
103
Makefile
@ -1,103 +0,0 @@
|
|||||||
# top level makefile for Ollama
|
|
||||||
include make/common-defs.make
|
|
||||||
|
|
||||||
|
|
||||||
# Determine which if any GPU runners we should build
|
|
||||||
include make/cuda-v11-defs.make
|
|
||||||
include make/cuda-v12-defs.make
|
|
||||||
include make/rocm-defs.make
|
|
||||||
|
|
||||||
ifeq ($(CUSTOM_CPU_FLAGS),)
|
|
||||||
ifeq ($(ARCH),amd64)
|
|
||||||
RUNNER_TARGETS=cpu
endif
# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
ifneq ($(CUDA_11_COMPILER),)
	RUNNER_TARGETS += cuda_v11
endif
ifneq ($(CUDA_12_COMPILER),)
	RUNNER_TARGETS += cuda_v12
endif
endif
else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected
ifneq ($(CUDA_12_COMPILER),)
	RUNNER_TARGETS += cuda_v12
else ifneq ($(CUDA_11_COMPILER),)
	RUNNER_TARGETS += cuda_v11
endif
endif

ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
ifneq ($(HIP_COMPILER),)
	RUNNER_TARGETS += rocm
endif
endif


all: runners exe

dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe

dist_%:
	@$(MAKE) --no-print-directory -f make/Makefile.$* dist

runners: $(RUNNER_TARGETS)

$(RUNNER_TARGETS):
	@$(MAKE) --no-print-directory -f make/Makefile.$@

exe dist_exe:
	@$(MAKE) --no-print-directory -f make/Makefile.ollama $@

help-sync apply-patches create-patches sync sync-clean:
	@$(MAKE) --no-print-directory -f make/Makefile.sync $@

test integration lint:
	@$(MAKE) --no-print-directory -f make/Makefile.test $@

clean:
	rm -rf $(BUILD_DIR) $(DIST_LIB_DIR) $(OLLAMA_EXE) $(DIST_OLLAMA_EXE)
	go clean -cache

help:
	@echo "The following make targets will help you build Ollama"
	@echo ""
	@echo "  make all          # (default target) Build Ollama llm subprocess runners, and the primary ollama executable"
	@echo "  make runners      # Build Ollama llm subprocess runners; afterwards you may use 'go build .' to build the primary ollama executable"
	@echo "  make <runner>     # Build specific runners. Enabled: '$(RUNNER_TARGETS)'"
	@echo "  make dist         # Build the runners and primary ollama executable for distribution"
	@echo "  make help-sync    # Help information on vendor update targets"
	@echo "  make help-runners # Help information on runner targets"
	@echo ""
	@echo "The following make targets will help you test Ollama"
	@echo ""
	@echo "  make test         # Run unit tests"
	@echo "  make integration  # Run integration tests. You must 'make all' first"
	@echo "  make lint         # Run lint and style tests"
	@echo ""
	@echo "For more information see 'docs/development.md'"
	@echo ""

help-runners:
	@echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'"
	@echo ""
	@echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)' (Override with CUSTOM_CPU_FLAGS)"
	@echo ""
	@echo "# CUDA_PATH sets the location where CUDA toolkits are present"
	@echo "CUDA_PATH=$(CUDA_PATH)"
	@echo "  CUDA_11_PATH=$(CUDA_11_PATH)"
	@echo "  CUDA_11_COMPILER=$(CUDA_11_COMPILER)"
	@echo "  CUDA_12_PATH=$(CUDA_12_PATH)"
	@echo "  CUDA_12_COMPILER=$(CUDA_12_COMPILER)"
	@echo ""
	@echo "# HIP_PATH sets the location where the ROCm toolkit is present"
	@echo "HIP_PATH=$(HIP_PATH)"
	@echo "  HIP_COMPILER=$(HIP_COMPILER)"

.PHONY: all exe dist help help-sync help-runners test integration lint runners clean $(RUNNER_TARGETS)

# Handy debugging for make variables
print-%:
	@echo '$*=$($*)'
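For reference, a typical invocation of the build targets described by the help text above might look like the following sketch; the runner names depend on which GPU toolkits the detection logic above discovers on the build machine:

```bash
# Build every detected runner plus the ollama executable
make -j 8 all

# Or build a single runner, then the CLI on its own
make cuda_v12
go build .
```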
46  Makefile2  Normal file
@@ -0,0 +1,46 @@
UPSTREAM=https://github.com/ggerganov/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c

all: sync

.PHONY: sync
sync: llama/llama.cpp ml/backend/ggml/ggml

.PHONY: llama/llama.cpp
llama/llama.cpp: llama/vendor/ apply_patches
	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

.PHONY: ml/backend/ggml/ggml apply_patches
ml/backend/ggml/ggml: llama/vendor/ggml/ apply_patches
	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

PATCHES=$(wildcard llama/patches/*.patch)

.PHONY: apply_patches
.NOTPARALLEL:
apply_patches: $(addsuffix ed, $(PATCHES))

%.patched: %.patch
	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi

.PHONY: checkout
checkout: $(WORKDIR)
	git -C $(WORKDIR) fetch
	git -C $(WORKDIR) checkout -f $(FETCH_HEAD)

$(WORKDIR):
	git clone $(UPSTREAM) $(WORKDIR)

.PHONY: format_patches
format_patches: llama/patches
	git -C $(WORKDIR) format-patch \
		--no-signature \
		--no-numbered \
		--zero-commit \
		-o $(realpath $<) \
		$(FETCH_HEAD)

.PHONY: clean
clean: checkout
	$(RM) $(addsuffix ed, $(PATCHES))
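A possible vendor-update workflow using the targets above (a sketch; it assumes the file is invoked under the Makefile2 name it carries in this diff):

```bash
# Clone or refresh the pinned llama.cpp checkout under llama/vendor
make -f Makefile2 checkout

# Re-apply the local patches and rsync the vendored sources into the tree
make -f Makefile2 sync
```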
25  benchmark/README.md  Normal file
@@ -0,0 +1,25 @@
# Benchmark

Performance benchmarking for Ollama.

## Prerequisites
- Ollama server running locally (`127.0.0.1:11434`)
- Desired models pre-downloaded (e.g., `llama3.2:1b`)

## Run Benchmark
```bash
# Run all tests
go test -bench=. -timeout 30m ./...
```

## New Runner Benchmark
```bash
go test -bench=Runner
```

or to test multiple models:
```bash
# run this from within the benchmark directory
# requires: llama3.2:1b, llama3.1:8b, llama3.3:70b
sh new_runner.sh
```
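The prerequisite models listed above can be fetched ahead of time with the standard Ollama CLI, for example:

```bash
ollama pull llama3.2:1b
ollama pull llama3.1:8b
```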
72  benchmark/new_runner.sh  Normal file
@@ -0,0 +1,72 @@
#!/bin/bash

kill_process_tree() {
    local pid=$1
    # Get all child processes using pgrep
    local children=$(pgrep -P $pid)

    # Kill children first
    for child in $children; do
        kill_process_tree $child
    done

    # Kill the parent process
    kill -9 $pid 2>/dev/null || true
}

# Function to run the runner and benchmark for a given model
run_benchmark() {
    local model=$1

    echo "Starting runner with model: $model"
    # Start the runner in background and save its PID
    go run ../cmd/runner/main.go --new-runner -model "$model" &
    runner_pid=$!

    # Wait for the runner to initialize (adjust sleep time as needed)
    sleep 5

    echo "Running benchmark..."
    # Run test and wait for it to complete
    go test -bench=Runner
    test_exit_code=$?

    echo "Stopping runner process..."
    # Kill the runner process and all its children
    kill_process_tree $runner_pid

    # Wait for the process to fully terminate
    wait $runner_pid 2>/dev/null || true

    # Make sure no processes are still listening on port 8080
    lsof -t -i:8080 | xargs kill -9 2>/dev/null || true

    # Additional sleep to ensure port is freed
    sleep 2

    # Check if test failed
    if [ $test_exit_code -ne 0 ]; then
        echo "Warning: Benchmark test failed with exit code $test_exit_code"
    fi

    echo "Benchmark complete for model: $model"
    echo "----------------------------------------"
}


HOME_DIR="$HOME"
# llama3.2:1b: ~/.ollama/models/blobs/sha256-74701a8c35f6c8d9a4b91f3f3497643001d63e0c7a84e085bed452548fa88d45
# llama3.1:8b: ~/.ollama/models/blobs/sha256-667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29
# llama3.3:70b: ~/.ollama/models/blobs/sha256-4824460d29f2058aaf6e1118a63a7a197a09bed509f0e7d4e2efb1ee273b447d
models=(
    "${HOME_DIR}/.ollama/models/blobs/sha256-74701a8c35f6c8d9a4b91f3f3497643001d63e0c7a84e085bed452548fa88d45"
    "${HOME_DIR}/.ollama/models/blobs/sha256-667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"
    # "${HOME_DIR}/.ollama/models/blobs/sha256-4824460d29f2058aaf6e1118a63a7a197a09bed509f0e7d4e2efb1ee273b447d"
)

# Run benchmarks for each model
for model in "${models[@]}"; do
    run_benchmark "$model"
done

echo "All benchmarks completed!"
175  benchmark/new_runner_benchmark_test.go  Normal file
@@ -0,0 +1,175 @@
package benchmark

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"testing"
	"time"
)

const (
	runnerURL     = "http://localhost:8080"
	warmupPrompts = 2  // Number of warm-up requests per test case
	warmupTokens  = 50 // Smaller token count for warm-up requests
)

var runnerMetrics []BenchmarkMetrics

// CompletionRequest represents the request body for the completion endpoint
type CompletionRequest struct {
	Prompt      string  `json:"prompt"`
	NumPredict  int     `json:"n_predict"`
	Temperature float32 `json:"temperature"`
}

// CompletionResponse represents a single response chunk from the streaming API
type CompletionResponse struct {
	Content string `json:"content"`
	Stop    bool   `json:"stop"`
	Timings struct {
		PredictedN  int `json:"predicted_n"`
		PredictedMs int `json:"predicted_ms"`
		PromptN     int `json:"prompt_n"`
		PromptMs    int `json:"prompt_ms"`
	} `json:"timings"`
}

// warmUp performs warm-up requests before the actual benchmark
func warmUp(b *testing.B, tt TestCase) {
	b.Logf("Warming up for test case %s", tt.name)
	warmupTest := TestCase{
		name:      tt.name + "_warmup",
		prompt:    tt.prompt,
		maxTokens: warmupTokens,
	}

	for i := 0; i < warmupPrompts; i++ {
		runCompletion(context.Background(), warmupTest, b)
		time.Sleep(100 * time.Millisecond) // Brief pause between warm-up requests
	}
	b.Logf("Warm-up complete")
}

func BenchmarkRunnerInference(b *testing.B) {
	b.Logf("Starting benchmark suite")

	// Verify server availability
	if _, err := http.Get(runnerURL + "/health"); err != nil {
		b.Fatalf("Runner unavailable: %v", err)
	}
	b.Log("Runner available")

	tests := []TestCase{
		{
			name:      "short_prompt",
			prompt:    formatPrompt("Write a long story"),
			maxTokens: 100,
		},
		{
			name:      "medium_prompt",
			prompt:    formatPrompt("Write a detailed economic analysis"),
			maxTokens: 500,
		},
		{
			name:      "long_prompt",
			prompt:    formatPrompt("Write a comprehensive AI research paper"),
			maxTokens: 1000,
		},
	}

	// Register cleanup handler for results reporting
	b.Cleanup(func() { reportMetrics(metrics) })

	// Main benchmark loop
	for _, tt := range tests {
		b.Run(tt.name, func(b *testing.B) {
			// Perform warm-up requests
			warmUp(b, tt)

			// Wait a bit after warm-up before starting the actual benchmark
			time.Sleep(500 * time.Millisecond)

			m := make([]BenchmarkMetrics, b.N)

			for i := 0; i < b.N; i++ {
				b.ResetTimer()
				m[i] = runCompletion(context.Background(), tt, b)
			}
			metrics = append(metrics, m...)
		})
	}
}

func formatPrompt(text string) string {
	return fmt.Sprintf("<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", text)
}

func runCompletion(ctx context.Context, tt TestCase, b *testing.B) BenchmarkMetrics {
	start := time.Now()
	var ttft time.Duration
	var tokens int
	lastToken := start

	// Create request body
	reqBody := CompletionRequest{
		Prompt:      tt.prompt,
		NumPredict:  tt.maxTokens,
		Temperature: 0.1,
	}
	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		b.Fatalf("Failed to marshal request: %v", err)
	}

	// Create HTTP request
	req, err := http.NewRequestWithContext(ctx, "POST", runnerURL+"/completion", bytes.NewBuffer(jsonData))
	if err != nil {
		b.Fatalf("Failed to create request: %v", err)
	}
	req.Header.Set("Content-Type", "application/json")

	// Execute request
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		b.Fatalf("Request failed: %v", err)
	}
	defer resp.Body.Close()

	// Process streaming response
	decoder := json.NewDecoder(resp.Body)
	for {
		var chunk CompletionResponse
		if err := decoder.Decode(&chunk); err != nil {
			if err == io.EOF {
				break
			}
			b.Fatalf("Failed to decode response: %v", err)
		}

		if ttft == 0 && chunk.Content != "" {
			ttft = time.Since(start)
		}

		if chunk.Content != "" {
			tokens++
			lastToken = time.Now()
		}

		if chunk.Stop {
			break
		}
	}

	totalTime := lastToken.Sub(start)
	return BenchmarkMetrics{
		testName:        tt.name,
		ttft:            ttft,
		totalTime:       totalTime,
		totalTokens:     tokens,
		tokensPerSecond: float64(tokens) / totalTime.Seconds(),
	}
}
293  benchmark/server_benchmark_test.go  Normal file
@@ -0,0 +1,293 @@
// Package benchmark provides tools for performance testing of Ollama inference server and supported models.
package benchmark

import (
	"context"
	"fmt"
	"net/http"
	"net/url"
	"os"
	"testing"
	"text/tabwriter"
	"time"

	"github.com/ollama/ollama/api"
)

// serverURL is the default Ollama server URL for benchmarking
const serverURL = "http://127.0.0.1:11434"

// metrics collects all benchmark results for final reporting
var metrics []BenchmarkMetrics

// models contains the list of model names to benchmark
var models = []string{
	"llama3.2:1b",
	// "qwen2.5:7b",
	// "llama3.3:70b",
}

// TestCase defines a benchmark test scenario with prompt characteristics
type TestCase struct {
	name      string // Human-readable test name
	prompt    string // Input prompt text
	maxTokens int    // Maximum tokens to generate
}

// BenchmarkMetrics contains performance measurements for a single test run
type BenchmarkMetrics struct {
	model           string        // Model being tested
	scenario        string        // cold_start or warm_start
	testName        string        // Name of the test case
	ttft            time.Duration // Time To First Token (TTFT)
	totalTime       time.Duration // Total time for complete response
	totalTokens     int           // Total generated tokens
	tokensPerSecond float64       // Calculated throughput
}

// ScenarioType defines the initialization state for benchmarking
type ScenarioType int

const (
	ColdStart ScenarioType = iota // Model is loaded from cold state
	WarmStart                     // Model is already loaded in memory
)

// String implements fmt.Stringer for ScenarioType
func (s ScenarioType) String() string {
	return [...]string{"cold_start", "warm_start"}[s]
}

// BenchmarkServerInference is the main entry point for benchmarking Ollama inference performance.
// It tests all configured models with different prompt lengths and start scenarios.
func BenchmarkServerInference(b *testing.B) {
	b.Logf("Starting benchmark suite with %d models", len(models))

	// Verify server availability
	if _, err := http.Get(serverURL + "/api/version"); err != nil {
		b.Fatalf("Server unavailable: %v", err)
	}
	b.Log("Server available")

	tests := []TestCase{
		{"short_prompt", "Write a long story", 100},
		{"medium_prompt", "Write a detailed economic analysis", 500},
		{"long_prompt", "Write a comprehensive AI research paper", 1000},
	}

	// Register cleanup handler for results reporting
	b.Cleanup(func() { reportMetrics(metrics) })

	// Main benchmark loop
	for _, model := range models {
		client := api.NewClient(mustParse(serverURL), http.DefaultClient)
		// Verify model availability
		if _, err := client.Show(context.Background(), &api.ShowRequest{Model: model}); err != nil {
			b.Fatalf("Model unavailable: %v", err)
		}

		for _, tt := range tests {
			testName := fmt.Sprintf("%s/%s/%s", model, ColdStart, tt.name)
			b.Run(testName, func(b *testing.B) {
				m := runBenchmark(b, tt, model, ColdStart, client)
				metrics = append(metrics, m...)
			})
		}

		for _, tt := range tests {
			testName := fmt.Sprintf("%s/%s/%s", model, WarmStart, tt.name)
			b.Run(testName, func(b *testing.B) {
				m := runBenchmark(b, tt, model, WarmStart, client)
				metrics = append(metrics, m...)
			})
		}
	}
}

// runBenchmark executes multiple iterations of a specific test case and scenario.
// Returns collected metrics for all iterations.
func runBenchmark(b *testing.B, tt TestCase, model string, scenario ScenarioType, client *api.Client) []BenchmarkMetrics {
	results := make([]BenchmarkMetrics, b.N)

	// Run benchmark iterations
	for i := 0; i < b.N; i++ {
		switch scenario {
		case WarmStart:
			// Pre-warm the model by generating some tokens
			for i := 0; i < 2; i++ {
				client.Generate(
					context.Background(),
					&api.GenerateRequest{
						Model:   model,
						Prompt:  tt.prompt,
						Options: map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1},
					},
					func(api.GenerateResponse) error { return nil },
				)
			}
		case ColdStart:
			unloadModel(client, model, b)
		}
		b.ResetTimer()

		results[i] = runSingleIteration(context.Background(), client, tt, model, b)
		results[i].scenario = scenario.String()
	}
	return results
}

// unloadModel forces model unloading by sending a request with a zero KeepAlive duration.
// Includes a short delay to ensure unloading completes before the next test.
func unloadModel(client *api.Client, model string, b *testing.B) {
	req := &api.GenerateRequest{
		Model:     model,
		KeepAlive: &api.Duration{Duration: 0},
	}
	if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
		b.Logf("Unload error: %v", err)
	}
	time.Sleep(100 * time.Millisecond)
}

// runSingleIteration measures performance metrics for a single inference request.
// Captures TTFT, total generation time, and calculates tokens/second.
func runSingleIteration(ctx context.Context, client *api.Client, tt TestCase, model string, b *testing.B) BenchmarkMetrics {
	start := time.Now()
	var ttft time.Duration
	var tokens int
	lastToken := start

	req := &api.GenerateRequest{
		Model:   model,
		Prompt:  tt.prompt,
		Options: map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1},
	}

	if b != nil {
		b.Logf("Prompt length: %d chars", len(tt.prompt))
	}

	// Execute generation request with metrics collection
	client.Generate(ctx, req, func(resp api.GenerateResponse) error {
		if ttft == 0 {
			ttft = time.Since(start)
		}
		if resp.Response != "" {
			tokens++
			lastToken = time.Now()
		}
		return nil
	})

	totalTime := lastToken.Sub(start)
	return BenchmarkMetrics{
		model:           model,
		testName:        tt.name,
		ttft:            ttft,
		totalTime:       totalTime,
		totalTokens:     tokens,
		tokensPerSecond: float64(tokens) / totalTime.Seconds(),
	}
}

// reportMetrics processes collected metrics and prints formatted results.
// Generates both human-readable tables and CSV output with averaged statistics.
func reportMetrics(results []BenchmarkMetrics) {
	if len(results) == 0 {
		return
	}

	// Aggregate results by test case
	type statsKey struct {
		model    string
		scenario string
		testName string
	}
	stats := make(map[statsKey]*struct {
		ttftSum      time.Duration
		totalTimeSum time.Duration
		tokensSum    int
		iterations   int
	})

	for _, m := range results {
		key := statsKey{m.model, m.scenario, m.testName}
		if _, exists := stats[key]; !exists {
			stats[key] = &struct {
				ttftSum      time.Duration
				totalTimeSum time.Duration
				tokensSum    int
				iterations   int
			}{}
		}

		stats[key].ttftSum += m.ttft
		stats[key].totalTimeSum += m.totalTime
		stats[key].tokensSum += m.totalTokens
		stats[key].iterations++
	}

	// Calculate averages
	var averaged []BenchmarkMetrics
	for key, data := range stats {
		count := data.iterations
		averaged = append(averaged, BenchmarkMetrics{
			model:           key.model,
			scenario:        key.scenario,
			testName:        key.testName,
			ttft:            data.ttftSum / time.Duration(count),
			totalTime:       data.totalTimeSum / time.Duration(count),
			totalTokens:     data.tokensSum / count,
			tokensPerSecond: float64(data.tokensSum) / data.totalTimeSum.Seconds(),
		})
	}

	// Print formatted results
	printTableResults(averaged)
	printCSVResults(averaged)
}

// printTableResults displays averaged metrics in a formatted table
func printTableResults(averaged []BenchmarkMetrics) {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	fmt.Fprintln(w, "\nAVERAGED BENCHMARK RESULTS")
	fmt.Fprintln(w, "Model\tScenario\tTest Name\tTTFT (ms)\tTotal Time (ms)\tTokens\tTokens/sec")
	for _, m := range averaged {
		fmt.Fprintf(w, "%s\t%s\t%s\t%.2f\t%.2f\t%d\t%.2f\n",
			m.model,
			m.scenario,
			m.testName,
			float64(m.ttft.Milliseconds()),
			float64(m.totalTime.Milliseconds()),
			m.totalTokens,
			m.tokensPerSecond,
		)
	}
	w.Flush()
}

// printCSVResults outputs averaged metrics in CSV format
func printCSVResults(averaged []BenchmarkMetrics) {
	fmt.Println("\nCSV OUTPUT")
	fmt.Println("model,scenario,test_name,ttft_ms,total_ms,tokens,tokens_per_sec")
	for _, m := range averaged {
		fmt.Printf("%s,%s,%s,%.2f,%.2f,%d,%.2f\n",
			m.model,
			m.scenario,
			m.testName,
			float64(m.ttft.Milliseconds()),
			float64(m.totalTime.Milliseconds()),
			m.totalTokens,
			m.tokensPerSecond,
		)
	}
}

// mustParse is a helper function to parse URLs with panic on error
func mustParse(rawURL string) *url.URL {
	u, err := url.Parse(rawURL)
	if err != nil {
		panic(err)
	}
	return u
}
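One way to exercise only the server benchmark above, assuming a running server and the models listed in `models` are already pulled (the flag values are illustrative):

```bash
cd benchmark
go test -bench=BenchmarkServerInference -benchtime=10x -timeout 30m
```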
420  cache/cache.go vendored  Normal file
@@ -0,0 +1,420 @@
package cache

import (
	"errors"
	"fmt"
	"log/slog"
	"math"
	"slices"

	"github.com/ollama/ollama/ml"
)

var ErrNotSupported = errors.New("model does not support operation")

type Cache interface {
	// ** used by model implementations **

	// Returns an instance of the cache for layer 'i'
	Sub(i int) Cache

	// Returns the history of key and value tensors plus a mask
	//
	// The tensors are of shape embed dim, kv heads, batch size
	// The mask is of shape history size, batch size
	Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor)

	// Stores a batch of key and value in the cache
	//
	// The tensors must be of shape embed dim, kv heads, batch size
	Put(ctx ml.Context, key, value ml.Tensor)

	// ** cache management **

	// Closes the cache and frees resources associated with it
	Close()

	// Called before the start of the model's forward pass. For each
	// token in the coming batch, there must be a corresponding entry
	// in positions and seqs.
	StartForward(ctx ml.Context, positions []int32, seqs []int) error

	// Copies tokens in the range [0, len) from srcSeq to dstSeq
	CopyPrefix(srcSeq, dstSeq int, len int32)

	// Removes tokens in the range [beginIndex, endIndex) from seq. Set
	// endIndex to math.MaxInt32 to remove everything starting at beginIndex
	Remove(seq int, beginIndex, endIndex int32) error
}

type Causal struct {
	DType    ml.DType
	Capacity int32

	// current forward pass
	curLayer     int
	curLoc       int
	curBatchSize int
	curMask      ml.Tensor
	curCellRange cellRange

	// metadata
	cells      []cacheCell
	cellRanges map[int]cellRange

	// cache data storage
	backend      ml.Backend
	cacheCtx     ml.Context
	keys, values []ml.Tensor
}

type seqCell struct {
	seq int
	pos int32
}

type cacheCell struct {
	sequences []seqCell
}

type cellRange struct {
	min int
	max int
}

func (cell cacheCell) findSeq(seq int) *seqCell {
	for i := range cell.sequences {
		if cell.sequences[i].seq == seq {
			return &cell.sequences[i]
		}
	}
	return nil
}

func NewCausalCache(backend ml.Backend, dtype ml.DType, capacity int32) Cache {
	return &Causal{
		Capacity:   capacity,
		DType:      dtype,
		cells:      make([]cacheCell, capacity),
		cellRanges: make(map[int]cellRange),
		backend:    backend,
		cacheCtx:   backend.NewContext(),
	}
}

func (c *Causal) Close() {
	c.cacheCtx.Close()
}

var ErrKvCacheFull = errors.New("could not find a kv cache slot")

func (c *Causal) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
	if len(positions) != len(seqs) {
		return fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(positions), len(seqs))
	}

	c.curBatchSize = len(positions)

	if c.curBatchSize < 1 {
		return errors.New("batch size cannot be less than 1")
	}

	var err error
	c.curLoc, err = c.findStartLoc()
	if errors.Is(err, ErrKvCacheFull) {
		c.defrag()
		c.curLoc, err = c.findStartLoc()
	}
	if err != nil {
		return err
	}

	c.curCellRange = newRange()
	for i, pos := range positions {
		seq := seqs[i]

		c.cells[c.curLoc+i] = cacheCell{sequences: []seqCell{{seq: seq, pos: pos}}}

		ranges, ok := c.cellRanges[seq]
		if !ok {
			ranges = newRange()
		}

		if c.curLoc+i > ranges.max {
			ranges.max = c.curLoc + i
		}
		if ranges.max > c.curCellRange.max {
			c.curCellRange.max = ranges.max
		}

		if c.curLoc+i < ranges.min {
			ranges.min = c.curLoc + i
		}
		if ranges.min < c.curCellRange.min {
			c.curCellRange.min = ranges.min
		}
		c.cellRanges[seq] = ranges
	}

	c.curMask, err = c.buildMask(ctx, positions, seqs)

	return err
}

func newRange() cellRange {
	return cellRange{
		min: math.MaxInt,
		max: 0,
	}
}

func (c *Causal) findStartLoc() (int, error) {
	var start, count int
	for i := range c.cells {
		if len(c.cells[i].sequences) == 0 {
			count++
			if count >= c.curBatchSize {
				return start, nil
			}
		} else {
			start = i + 1
			count = 0
		}
	}

	return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity)
}

func (c *Causal) buildMask(ctx ml.Context, positions []int32, seqs []int) (ml.Tensor, error) {
	// TODO(jessegross): This makes a number of simplifications such as no padding,
	// which could be an issue for CUDA graphs and/or flash attention
	len := c.curCellRange.max - c.curCellRange.min + 1
	mask := make([]float32, c.curBatchSize*len)

	for i := range c.curBatchSize {
		for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
			cellSeq := c.cells[j].findSeq(seqs[i])
			if cellSeq == nil || cellSeq.pos > positions[i] {
				mask[i*len+(j-c.curCellRange.min)] = float32(math.Inf(-1))
			}
		}
	}

	return ctx.FromFloatSlice(mask, len, c.curBatchSize)
}

func moveCell(ctx ml.Context, objs []ml.Tensor, src, dst, len int) {
	for _, obj := range objs {
		srcView := obj.View(ctx, int(obj.Stride(2))*src, int(obj.Dim(0)*obj.Dim(1))*len)
		dstView := obj.View(ctx, int(obj.Stride(2))*dst, int(obj.Dim(0)*obj.Dim(1))*len)

		ctx.Forward(srcView.Copy(ctx, dstView))
	}
}

func (c *Causal) defrag() {
	slog.Debug("defragmenting kv cache")

	// Defrag strategy:
	// - Search for empty holes at the beginning of the cache,
	//   filling them with active data starting at the end
	// - If there are contiguous elements that need to be moved,
	//   combine them into a single operation by holding new moves
	//   until we see the next one is non-contiguous
	// - Fill up the context with the maximum number of operations it
	//   can hold then compute that and continue with a new context
	//
	// We could try to optimize placement by grouping blocks from
	// the same sequences together but most likely the next forward
	// pass will disrupt this anyways, so the real world benefit
	// seems limited at this time.

	ctx := c.backend.NewContext()

	// For every move, 6 tensors are required per layer (2 views and a
	// copy for each of k and v). For efficiency, we try to group
	// multiple contiguous blocks into a single move. However, if we
	// exceed the maximum number of tensors then we need to compute
	// what we have and start a new batch.
	maxMoves := ctx.MaxTensors() / (6 * len(c.keys))
	moves := 0

	var pendingSrc, pendingDst, pendingLen int

	for dst := range c.cells {
		if len(c.cells[dst].sequences) == 0 {
			for src := len(c.cells) - 1; src > dst; src-- {
				if len(c.cells[src].sequences) != 0 {
					c.cells[dst] = c.cells[src]
					c.cells[src] = cacheCell{}

					if pendingLen > 0 {
						if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
							pendingSrc = src
							pendingLen++
							break
						} else {
							moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
							moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
							moves++
						}
					}

					pendingSrc = src
					pendingDst = dst
					pendingLen = 1

					break
				}
			}
		}

		if moves >= maxMoves {
			ctx.Compute(nil)
			ctx.Close()
			ctx = c.backend.NewContext()

			moves = 0
		}
	}

	if pendingLen > 0 {
		moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
		moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
		moves++
	}

	if moves > 0 {
		ctx.Compute(nil)
	}
	ctx.Close()

	for seq := range c.cellRanges {
		seqRange := newRange()

		for i, cell := range c.cells {
			if cell.findSeq(seq) != nil {
				if i < seqRange.min {
					seqRange.min = i
				}
				if i > seqRange.max {
					seqRange.max = i
				}
			}
		}

		c.cellRanges[seq] = seqRange
	}
}

func (c *Causal) Sub(i int) Cache {
	if i >= len(c.keys) {
		c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
		c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
	}

	c.curLayer = i

	return c
}

func (c *Causal) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
	key := c.keys[c.curLayer]
	value := c.values[c.curLayer]

	key = key.View(ctx, int(key.Stride(2))*c.curCellRange.min,
		int(key.Dim(0)), int(key.Stride(1)),
		int(key.Dim(1)), int(key.Stride(2)),
		int(c.curMask.Dim(0)),
	)

	value = value.View(ctx, int(key.Stride(2))*c.curCellRange.min,
		int(value.Dim(0)), int(value.Stride(1)),
		int(value.Dim(1)), int(value.Stride(2)),
		int(c.curMask.Dim(0)),
	)

	return key, value, c.curMask
}

func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
	if c.curBatchSize != int(key.Dim(2)) {
		panic(fmt.Errorf("inconsistent batch sizes (layer: %v, batch size: %v layer batch size: %v)", c.curLayer, c.curBatchSize, int(key.Dim(2))))
	}

	if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
		c.keys[c.curLayer] = c.cacheCtx.Zeros(c.DType, key.Dim(0), key.Dim(1), int64(c.Capacity))
		c.values[c.curLayer] = c.cacheCtx.Zeros(c.DType, value.Dim(0), value.Dim(1), int64(c.Capacity))
	}

	ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, int(key.Stride(2))*c.curLoc, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
	ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, int(value.Stride(2))*c.curLoc, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
}

func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
	seqRange := newRange()

	for i := range c.cells {
		srcCellSeq := c.cells[i].findSeq(srcSeq)
		dstCellSeq := c.cells[i].findSeq(dstSeq)

		if dstCellSeq != nil {
			c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s seqCell) bool { return s.seq == dstSeq })
		}

		if srcCellSeq != nil && srcCellSeq.pos < len {
			c.cells[i].sequences = append(c.cells[i].sequences, seqCell{seq: dstSeq, pos: srcCellSeq.pos})
			if i < seqRange.min {
				seqRange.min = i
			}
			if i > seqRange.max {
				seqRange.max = i
			}
		}
	}

	c.cellRanges[dstSeq] = seqRange
}

func (c *Causal) shift(seq int, beginIndex, offset int32) error {
	panic("Shift not yet implemented")
}

func (c *Causal) Remove(seq int, beginIndex, endIndex int32) error {
	var offset int32
	if endIndex != math.MaxInt32 {
		offset = beginIndex - endIndex
	}

	seqRange := newRange()

	for i := range c.cells {
		cellSeq := c.cells[i].findSeq(seq)
		if cellSeq != nil {
			if cellSeq.pos >= beginIndex && cellSeq.pos < endIndex {
				c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s seqCell) bool { return s.seq == seq })
			} else {
				if cellSeq.pos >= endIndex {
					cellSeq.pos += offset
				}
				if i < seqRange.min {
					seqRange.min = i
				}
				if i > seqRange.max {
					seqRange.max = i
				}
			}
		}
	}

	if endIndex != math.MaxInt32 {
		err := c.shift(seq, endIndex, offset)
		if err != nil {
			return err
		}
	}

	c.cellRanges[seq] = seqRange

	return nil
}
47  cache/tensor.go vendored  Normal file
@@ -0,0 +1,47 @@
package cache

import (
	"github.com/ollama/ollama/ml"
)

type TensorCache struct {
	curLayer int

	cacheCtx     ml.Context
	keys, values []ml.Tensor
}

func NewTensorCache(backend ml.Backend) *TensorCache {
	return &TensorCache{
		cacheCtx: backend.NewContext(),
	}
}

func (c *TensorCache) Close() {
	c.cacheCtx.Close()
}

func (c *TensorCache) Sub(i int) *TensorCache {
	if i >= len(c.keys) {
		c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
		c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
	}

	c.curLayer = i

	return c
}

func (c *TensorCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
	return c.keys[c.curLayer], c.values[c.curLayer], nil
}

func (c *TensorCache) Put(ctx ml.Context, key, value ml.Tensor) {
	if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
		c.keys[c.curLayer] = c.cacheCtx.Zeros(key.DType(), key.Shape()...)
		c.values[c.curLayer] = c.cacheCtx.Zeros(value.DType(), value.Shape()...)
	}

	ctx.Forward(key.Copy(ctx, c.keys[c.curLayer]))
	ctx.Forward(value.Copy(ctx, c.values[c.curLayer]))
}
@ -35,9 +35,9 @@ import (
|
|||||||
"github.com/ollama/ollama/envconfig"
|
"github.com/ollama/ollama/envconfig"
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
"github.com/ollama/ollama/llama"
|
"github.com/ollama/ollama/llama"
|
||||||
"github.com/ollama/ollama/llama/runner"
|
|
||||||
"github.com/ollama/ollama/parser"
|
"github.com/ollama/ollama/parser"
|
||||||
"github.com/ollama/ollama/progress"
|
"github.com/ollama/ollama/progress"
|
||||||
|
"github.com/ollama/ollama/runner"
|
||||||
"github.com/ollama/ollama/server"
|
"github.com/ollama/ollama/server"
|
||||||
"github.com/ollama/ollama/types/model"
|
"github.com/ollama/ollama/types/model"
|
||||||
"github.com/ollama/ollama/version"
|
"github.com/ollama/ollama/version"
|
||||||
@ -338,7 +338,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
opts.MultiModal = len(info.ProjectorInfo) != 0
|
// TODO(jessegross): We should either find another way to know if this is
|
||||||
|
// a vision model or remove the logic. Also consider that other modalities will
|
||||||
|
// need different behavior anyways.
|
||||||
|
opts.MultiModal = true
|
||||||
opts.ParentModel = info.Details.ParentModel
|
opts.ParentModel = info.Details.ParentModel
|
||||||
|
|
||||||
if interactive {
|
if interactive {
|
||||||
|
@ -4,7 +4,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/ollama/ollama/llama/runner"
|
"github.com/ollama/ollama/runner"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
@ -9,7 +9,7 @@ import (
|
|||||||
"log/slog"
|
"log/slog"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ModelParameters struct {
|
type ModelParameters struct {
|
||||||
@ -27,8 +27,8 @@ type AdapterParameters struct {
|
|||||||
} `json:"lora_parameters"`
|
} `json:"lora_parameters"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ModelParameters) KV(t *Tokenizer) llm.KV {
|
func (ModelParameters) KV(t *Tokenizer) ggml.KV {
|
||||||
kv := llm.KV{
|
kv := ggml.KV{
|
||||||
"general.file_type": uint32(1),
|
"general.file_type": uint32(1),
|
||||||
"general.quantization_version": uint32(2),
|
"general.quantization_version": uint32(2),
|
||||||
"tokenizer.ggml.pre": t.Pre,
|
"tokenizer.ggml.pre": t.Pre,
|
||||||
@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
|
|||||||
return kv
|
return kv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p AdapterParameters) KV() llm.KV {
|
func (p AdapterParameters) KV() ggml.KV {
|
||||||
var alpha float32
|
var alpha float32
|
||||||
if p.LoraParameters.Alpha == 0 {
|
if p.LoraParameters.Alpha == 0 {
|
||||||
alpha = float32(p.Alpha)
|
alpha = float32(p.Alpha)
|
||||||
@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
|
|||||||
alpha = p.LoraParameters.Alpha
|
alpha = p.LoraParameters.Alpha
|
||||||
}
|
}
|
||||||
|
|
||||||
kv := llm.KV{
|
kv := ggml.KV{
|
||||||
"adapter.lora.alpha": alpha,
|
"adapter.lora.alpha": alpha,
|
||||||
"adapter.type": "lora",
|
"adapter.type": "lora",
|
||||||
"general.file_type": uint32(1),
|
"general.file_type": uint32(1),
|
||||||
@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
|
func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
|
||||||
return llm.WriteGGUF(ws, kv, ts)
|
return ggml.WriteGGUF(ws, kv, ts)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
|
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
|
||||||
return llm.WriteGGUF(ws, kv, ts)
|
return ggml.WriteGGUF(ws, kv, ts)
|
||||||
}
|
}
|
||||||
|
|
||||||
type ModelConverter interface {
|
type ModelConverter interface {
|
||||||
// KV maps parameters to LLM key-values
|
// KV maps parameters to LLM key-values
|
||||||
KV(*Tokenizer) llm.KV
|
KV(*Tokenizer) ggml.KV
|
||||||
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
|
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
|
||||||
Tensors([]Tensor) []llm.Tensor
|
Tensors([]Tensor) []ggml.Tensor
|
||||||
// Replacements returns a list of string pairs to replace in tensor names.
|
// Replacements returns a list of string pairs to replace in tensor names.
|
||||||
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
||||||
Replacements() []string
|
Replacements() []string
|
||||||
@ -99,7 +99,7 @@ type ModelConverter interface {
|
|||||||
// specialTokenTypes returns any special token types the model uses
|
// specialTokenTypes returns any special token types the model uses
|
||||||
specialTokenTypes() []string
|
specialTokenTypes() []string
|
||||||
// writeFile writes the model to the provided io.WriteSeeker
|
// writeFile writes the model to the provided io.WriteSeeker
|
||||||
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
|
writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
|
||||||
}
|
}
|
||||||
|
|
||||||
type moreParser interface {
|
type moreParser interface {
|
||||||
@ -108,17 +108,17 @@ type moreParser interface {
|
|||||||
|
|
||||||
type AdapterConverter interface {
|
type AdapterConverter interface {
|
||||||
// KV maps parameters to LLM key-values
|
// KV maps parameters to LLM key-values
|
||||||
KV(llm.KV) llm.KV
|
KV(ggml.KV) ggml.KV
|
||||||
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
|
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
|
||||||
Tensors([]Tensor) []llm.Tensor
|
Tensors([]Tensor) []ggml.Tensor
|
||||||
// Replacements returns a list of string pairs to replace in tensor names.
|
// Replacements returns a list of string pairs to replace in tensor names.
|
||||||
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
||||||
Replacements() []string
|
Replacements() []string
|
||||||
|
|
||||||
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
|
writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
|
||||||
}
|
}
|
||||||
|
|
||||||
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
|
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
|
||||||
bts, err := fs.ReadFile(fsys, "adapter_config.json")
|
bts, err := fs.ReadFile(fsys, "adapter_config.json")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -8,7 +8,7 @@ import (
|
|||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
)
|
)
|
||||||
|
|
||||||
type bertModel struct {
|
type bertModel struct {
|
||||||
@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *bertModel) KV(t *Tokenizer) llm.KV {
|
func (p *bertModel) KV(t *Tokenizer) ggml.KV {
|
||||||
kv := p.ModelParameters.KV(t)
|
kv := p.ModelParameters.KV(t)
|
||||||
kv["general.architecture"] = "bert"
|
kv["general.architecture"] = "bert"
|
||||||
kv["bert.attention.causal"] = false
|
kv["bert.attention.causal"] = false
|
||||||
@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
|
|||||||
return kv
|
return kv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
|
func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||||
var out []llm.Tensor
|
var out []ggml.Tensor
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
if slices.Contains([]string{
|
if slices.Contains([]string{
|
||||||
"embeddings.position_ids",
|
"embeddings.position_ids",
|
||||||
@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
out = append(out, llm.Tensor{
|
out = append(out, ggml.Tensor{
|
||||||
Name: t.Name(),
|
Name: t.Name(),
|
||||||
Kind: t.Kind(),
|
Kind: t.Kind(),
|
||||||
Shape: t.Shape(),
|
Shape: t.Shape(),
|
||||||
|
@ -3,7 +3,7 @@ package convert
|
|||||||
import (
|
import (
|
||||||
"cmp"
|
"cmp"
|
||||||
|
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
)
|
)
|
||||||
|
|
||||||
type commandrModel struct {
|
type commandrModel struct {
|
||||||
@ -24,7 +24,7 @@ type commandrModel struct {
|
|||||||
|
|
||||||
var _ ModelConverter = (*commandrModel)(nil)
|
var _ ModelConverter = (*commandrModel)(nil)
|
||||||
|
|
||||||
func (p *commandrModel) KV(t *Tokenizer) llm.KV {
|
func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
|
||||||
kv := p.ModelParameters.KV(t)
|
kv := p.ModelParameters.KV(t)
|
||||||
kv["general.architecture"] = "command-r"
|
kv["general.architecture"] = "command-r"
|
||||||
kv["general.name"] = "command-r"
|
kv["general.name"] = "command-r"
|
||||||
@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) llm.KV {
|
|||||||
return kv
|
return kv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *commandrModel) Tensors(ts []Tensor) []llm.Tensor {
|
func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||||
var out []llm.Tensor
|
var out []ggml.Tensor
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
out = append(out, llm.Tensor{
|
out = append(out, ggml.Tensor{
|
||||||
Name: t.Name(),
|
Name: t.Name(),
|
||||||
Kind: t.Kind(),
|
Kind: t.Kind(),
|
||||||
Shape: t.Shape(),
|
Shape: t.Shape(),
|
||||||
|
@ -6,7 +6,7 @@ import (
|
|||||||
"github.com/pdevine/tensor"
|
"github.com/pdevine/tensor"
|
||||||
"github.com/pdevine/tensor/native"
|
"github.com/pdevine/tensor/native"
|
||||||
|
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
)
|
)
|
||||||
|
|
||||||
type gemmaModel struct {
|
type gemmaModel struct {
|
||||||
@ -23,7 +23,7 @@ type gemmaModel struct {
|
|||||||
|
|
||||||
var _ ModelConverter = (*gemmaModel)(nil)
|
var _ ModelConverter = (*gemmaModel)(nil)
|
||||||
|
|
||||||
func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
|
func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
|
||||||
kv := p.ModelParameters.KV(t)
|
kv := p.ModelParameters.KV(t)
|
||||||
kv["general.architecture"] = "gemma"
|
kv["general.architecture"] = "gemma"
|
||||||
kv["gemma.context_length"] = p.MaxPositionEmbeddings
|
kv["gemma.context_length"] = p.MaxPositionEmbeddings
|
||||||
@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
|
|||||||
return kv
|
return kv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
|
func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||||
var out []llm.Tensor
|
var out []ggml.Tensor
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
if strings.HasSuffix(t.Name(), "_norm.weight") {
|
if strings.HasSuffix(t.Name(), "_norm.weight") {
|
||||||
t.SetRepacker(p.addOne)
|
t.SetRepacker(p.addOne)
|
||||||
}
|
}
|
||||||
|
|
||||||
out = append(out, llm.Tensor{
|
out = append(out, ggml.Tensor{
|
||||||
Name: t.Name(),
|
Name: t.Name(),
|
||||||
Kind: t.Kind(),
|
Kind: t.Kind(),
|
||||||
Shape: t.Shape(),
|
Shape: t.Shape(),
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
package convert
|
package convert
|
||||||
|
|
||||||
import (
|
import "github.com/ollama/ollama/fs/ggml"
|
||||||
"github.com/ollama/ollama/llm"
|
|
||||||
)
|
|
||||||
|
|
||||||
type gemma2Model struct {
|
type gemma2Model struct {
|
||||||
gemmaModel
|
gemmaModel
|
||||||
@ -11,7 +9,7 @@ type gemma2Model struct {
|
|||||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
|
func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
|
||||||
kv := p.ModelParameters.KV(t)
|
kv := p.ModelParameters.KV(t)
|
||||||
kv["general.architecture"] = "gemma2"
|
kv["general.architecture"] = "gemma2"
|
||||||
kv["gemma2.context_length"] = p.MaxPositionEmbeddings
|
kv["gemma2.context_length"] = p.MaxPositionEmbeddings
|
||||||
|
@ -6,7 +6,7 @@ import (
|
|||||||
"github.com/pdevine/tensor"
|
"github.com/pdevine/tensor"
|
||||||
"github.com/pdevine/tensor/native"
|
"github.com/pdevine/tensor/native"
|
||||||
|
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
)
|
)
|
||||||
|
|
||||||
type gemma2Adapter struct {
|
type gemma2Adapter struct {
|
||||||
@ -15,14 +15,14 @@ type gemma2Adapter struct {
|
|||||||
|
|
||||||
var _ AdapterConverter = (*gemma2Adapter)(nil)
|
var _ AdapterConverter = (*gemma2Adapter)(nil)
|
||||||
|
|
||||||
func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
|
func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
|
||||||
kv := p.AdapterParameters.KV()
|
kv := p.AdapterParameters.KV()
|
||||||
kv["general.architecture"] = "gemma2"
|
kv["general.architecture"] = "gemma2"
|
||||||
return kv
|
return kv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
|
func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
|
||||||
var out []llm.Tensor
|
var out []ggml.Tensor
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
shape := t.Shape()
|
shape := t.Shape()
|
||||||
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
|
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
|
||||||
@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
|
|||||||
t.SetRepacker(p.repack)
|
t.SetRepacker(p.repack)
|
||||||
}
|
}
|
||||||
|
|
||||||
out = append(out, llm.Tensor{
|
out = append(out, ggml.Tensor{
|
||||||
Name: t.Name(),
|
Name: t.Name(),
|
||||||
Kind: t.Kind(),
|
Kind: t.Kind(),
|
||||||
Shape: t.Shape(),
|
Shape: t.Shape(),
|
||||||
|
@ -9,7 +9,7 @@ import (
|
|||||||
"github.com/pdevine/tensor"
|
"github.com/pdevine/tensor"
|
||||||
"github.com/pdevine/tensor/native"
|
"github.com/pdevine/tensor/native"
|
||||||
|
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
)
|
)
|
||||||
|
|
||||||
type llamaModel struct {
|
type llamaModel struct {
|
||||||
@ -46,7 +46,7 @@ type llamaModel struct {
|
|||||||
|
|
||||||
var _ ModelConverter = (*llamaModel)(nil)
|
var _ ModelConverter = (*llamaModel)(nil)
|
||||||
|
|
||||||
func (p *llamaModel) KV(t *Tokenizer) llm.KV {
|
func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
|
||||||
kv := p.ModelParameters.KV(t)
|
kv := p.ModelParameters.KV(t)
|
||||||
kv["general.architecture"] = "llama"
|
kv["general.architecture"] = "llama"
|
||||||
kv["llama.vocab_size"] = p.VocabSize
|
kv["llama.vocab_size"] = p.VocabSize
|
||||||
@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
|
|||||||
return kv
|
return kv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
|
func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||||
var out []llm.Tensor
|
var out []ggml.Tensor
|
||||||
|
|
||||||
if p.RopeScaling.factors != nil {
|
if p.RopeScaling.factors != nil {
|
||||||
out = append(out, llm.Tensor{
|
out = append(out, ggml.Tensor{
|
||||||
Name: "rope_freqs.weight",
|
Name: "rope_freqs.weight",
|
||||||
Kind: 0,
|
Kind: 0,
|
||||||
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
|
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
|
||||||
@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
|
|||||||
t.SetRepacker(p.repack)
|
t.SetRepacker(p.repack)
|
||||||
}
|
}
|
||||||
|
|
||||||
out = append(out, llm.Tensor{
|
out = append(out, ggml.Tensor{
|
||||||
Name: t.Name(),
|
Name: t.Name(),
|
||||||
Kind: t.Kind(),
|
Kind: t.Kind(),
|
||||||
Shape: t.Shape(),
|
Shape: t.Shape(),
|
||||||
|
@@ -7,7 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type llamaAdapter struct {
@@ -18,7 +18,7 @@ type llamaAdapter struct {
 
 var _ AdapterConverter = (*llamaAdapter)(nil)
 
-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "llama"
 	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 	return kv
 }
 
-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: shape,
@@ -6,7 +6,7 @@ import (
 	"slices"
 	"strings"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type mixtralModel struct {
@@ -15,7 +15,7 @@ type mixtralModel struct {
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }
 
-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.llamaModel.KV(t)
 
 	if p.NumLocalExperts > 0 {
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 		return true
 	})
 
-	var out []llm.Tensor
+	var out []ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:  n,
 			Kind:  e[0].Kind(),
 			Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
@@ -8,7 +8,7 @@ import (
 	"strings"
 	"sync"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type phi3Model struct {
@@ -37,7 +37,7 @@ type phi3Model struct {
 
 var _ ModelConverter = (*phi3Model)(nil)
 
-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 	var addRopeFactors sync.Once
 
-	out := make([]llm.Tensor, 0, len(ts)+2)
+	out := make([]ggml.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, llm.Tensor{
+				out = append(out, ggml.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, llm.Tensor{
+				}, ggml.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 			})
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -1,6 +1,7 @@
 package convert
 
-import "github.com/ollama/ollama/llm"
+import "github.com/ollama/ollama/fs/ggml"
 
 
 type qwen2Model struct {
 	ModelParameters
@@ -21,7 +22,7 @@ type qwen2Model struct {
 
 var _ ModelConverter = (*qwen2Model)(nil)
 
-func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
+func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen2"
 	kv["qwen2.block_count"] = q.HiddenLayers
@@ -45,10 +46,10 @@ func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (q *qwen2Model) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
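All of the converters above now hand back `ggml.KV` and `[]ggml.Tensor` instead of the old `llm` types. A minimal sketch of how that output reaches a GGUF file, assuming the converter, tokenizer, and source tensors are built elsewhere in the convert package; the helper name and wrapper package here are placeholders, not part of this change:

```go
package demo

import (
	"os"

	"github.com/ollama/ollama/convert"
	"github.com/ollama/ollama/fs/ggml"
)

// writeConverted is a hypothetical helper showing the data flow only.
func writeConverted(conv convert.ModelConverter, tok *convert.Tokenizer, src []convert.Tensor, path string) error {
	kv := conv.KV(tok)           // ggml.KV after this change (was llm.KV)
	tensors := conv.Tensors(src) // []ggml.Tensor after this change

	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()

	// WriteGGUF (seen further down in this diff) takes the same KV map and
	// tensor slice, now provided by the fs/ggml package.
	return ggml.WriteGGUF(f, kv, tensors)
}
```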
@@ -20,7 +20,7 @@ import (
 
 	"golang.org/x/exp/maps"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type tensorData struct {
@@ -29,7 +29,7 @@ type tensorData struct {
 	Shape []int `json:"shape"`
 }
 
-func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	t.Helper()
 
 	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })
 
-	m, _, err := llm.DecodeGGML(r, math.MaxInt)
+	m, _, err := ggml.Decode(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }
 
-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@@ -75,7 +75,7 @@ func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tenso
 		}
 	}
 
-	for _, tensor := range tensors.Items {
+	for _, tensor := range tensors.Items() {
 		sha256sum := sha256.New()
 		sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
 		if _, err := io.Copy(sha256sum, sr); err != nil {
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 	}
 	defer r.Close()
 
-	m, _, err := llm.DecodeGGML(r, math.MaxInt)
+	m, _, err := ggml.Decode(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -718,23 +718,18 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 func LibraryDirs() []string {
 	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
 	// This can be simplified once we no longer carry runners as payloads
-	paths := []string{}
-	appExe, err := os.Executable()
+	exe, err := os.Executable()
 	if err != nil {
 		slog.Warn("failed to lookup executable path", "error", err)
-	} else {
-		appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
-		if _, err := os.Stat(appRelative); err == nil {
-			paths = append(paths, appRelative)
-		}
+		return nil
 	}
-	rDir := runners.Locate()
-	if err != nil {
-		slog.Warn("unable to locate gpu dependency libraries", "error", err)
-	} else {
-		paths = append(paths, filepath.Dir(rDir))
+
+	lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
+	if _, err := os.Stat(lib); err != nil {
+		return nil
 	}
-	return paths
+
+	return []string{lib}
 }
 
 func GetSystemInfo() SystemInfo {
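LibraryDirs now returns either nil or the single `lib/ollama` directory next to the executable. A rough sketch of how a caller could fold that into a search path; the `discover` import path and the surrounding wiring are assumptions, not taken from this diff:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"github.com/ollama/ollama/discover" // assumed package path for LibraryDirs
)

func main() {
	// Prepend the discovered library directory (if any) to an existing
	// LD_LIBRARY_PATH-style value before launching a runner subprocess.
	parts := append(discover.LibraryDirs(), os.Getenv("LD_LIBRARY_PATH"))
	fmt.Println(strings.Join(parts, string(filepath.ListSeparator)))
}
```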
@@ -165,6 +165,8 @@ var (
 	IntelGPU = Bool("OLLAMA_INTEL_GPU")
 	// MultiUserCache optimizes prompt caching for multi-user scenarios
 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
+	// Enable the new Ollama engine
+	NewRunners = Bool("OLLAMA_NEW_RUNNERS")
 )
 
 func String(s string) func() string {
@@ -250,6 +252,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":         {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":    {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
+		"OLLAMA_NEW_RUNNERS":     {"OLLAMA_NEW_RUNNERS", NewRunners(), "Enable the new Ollama engine"},
 
 		// Informational
 		"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
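A small sketch of gating on the new variable; the branch contents are hypothetical stand-ins for whatever the scheduler actually calls:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/envconfig"
)

func main() {
	// NewRunners is declared with Bool("OLLAMA_NEW_RUNNERS") above, so calling
	// it reports whether the environment variable is set to a truthy value.
	if envconfig.NewRunners() {
		fmt.Println("using the new Ollama engine")
		// e.g. startNewEngine(...)   // hypothetical
	} else {
		fmt.Println("using the existing runners")
		// e.g. startLlamaRunner(...) // hypothetical
	}
}
```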
@@ -1,15 +1,15 @@
-package llm
+package ggml
 
 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"log/slog"
 	"slices"
 	"strings"
-	"sync"
 
-	"github.com/ollama/ollama/util/bufioutil"
+	"github.com/ollama/ollama/fs/util/bufioutil"
 )
 
 type GGML struct {
@@ -19,145 +19,168 @@ type GGML struct {
 
 type model interface {
 	KV() KV
-	Tensors() *Tensors
+	Tensors() Tensors
 }
 
 type KV map[string]any
 
-func (kv KV) u64(key string) uint64 {
-	switch v := kv[key].(type) {
-	case uint64:
-		return v
-	case uint32:
-		return uint64(v)
-	case float64:
-		return uint64(v)
-	default:
-		return 0
-	}
-}
-
 func (kv KV) Architecture() string {
-	if s, ok := kv["general.architecture"].(string); ok {
-		return s
-	}
-
-	return "unknown"
+	return kv.String("general.architecture", "unknown")
 }
 
 func (kv KV) Kind() string {
-	if s, ok := kv["general.type"].(string); ok {
-		return s
-	}
-
-	return "unknown"
+	return kv.String("general.type", "unknown")
 }
 
 func (kv KV) ParameterCount() uint64 {
-	return kv.u64("general.parameter_count")
+	return keyValue[uint64](kv, "general.parameter_count")
 }
 
 func (kv KV) FileType() fileType {
-	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return fileType(uint32(u64))
+	if t := kv.Uint("general.file_type"); t > 0 {
+		return fileType(t)
 	}
 
 	return fileTypeUnknown
 }
 
 func (kv KV) BlockCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
+	return uint64(kv.Uint("block_count"))
+}
+
+func (kv KV) EmbeddingLength() uint64 {
+	return uint64(kv.Uint("embedding_length"))
 }
 
 func (kv KV) HeadCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
+	return uint64(kv.Uint("attention.head_count"))
 }
 
 func (kv KV) HeadCountKV() uint64 {
-	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
-		return headCountKV
-	}
-
-	return 1
+	return uint64(kv.Uint("attention.head_count_kv", 1))
 }
 
 func (kv KV) EmbeddingHeadCount() uint64 {
 	if heads := kv.HeadCount(); heads > 0 {
-		return kv.EmbeddingLength() / kv.HeadCount()
+		return kv.EmbeddingLength() / heads
 	}
 
 	return 0
 }
 
 func (kv KV) EmbeddingHeadCountK() uint64 {
-	if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
-		return k
-	}
-
-	return kv.EmbeddingHeadCount()
+	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
 }
 
 func (kv KV) EmbeddingHeadCountV() uint64 {
-	if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
-		return v
-	}
-
-	return kv.EmbeddingHeadCount()
+	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
 }
 
 func (kv KV) GQA() uint64 {
 	return kv.HeadCount() / kv.HeadCountKV()
 }
 
-func (kv KV) EmbeddingLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
-}
-
 func (kv KV) ContextLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
+	return uint64(kv.Uint("context_length"))
 }
 
 func (kv KV) ChatTemplate() string {
-	s, _ := kv["tokenizer.chat_template"].(string)
+	return kv.String("tokenizer.chat_template")
+}
+
+func (kv KV) String(key string, defaultValue ...string) string {
+	return keyValue(kv, key, append(defaultValue, "")...)
+}
+
+func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
+	return keyValue(kv, key, append(defaultValue, 0)...)
+}
+
+func (kv KV) Float(key string, defaultValue ...float32) float32 {
+	return keyValue(kv, key, append(defaultValue, 0)...)
+}
+
+func (kv KV) Strings(key string, defaultValue ...[]string) []string {
+	r := keyValue(kv, key, &array{})
+	s := make([]string, r.size)
+	for i := range r.size {
+		s[i] = r.values[i].(string)
+	}
+
 	return s
 }
 
-type Tensors struct {
-	Items  []*Tensor
-	Offset uint64
+func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
+	r := keyValue(kv, key, &array{})
+	s := make([]uint32, r.size)
+	for i := range r.size {
+		s[i] = uint32(r.values[i].(int32))
+	}
 
-	layers     map[string]Layer
-	layersOnce sync.Once
+	return s
 }
 
-func (ts *Tensors) Layers() map[string]Layer {
-	ts.layersOnce.Do(func() {
-		ts.layers = make(map[string]Layer)
-		for _, t := range ts.Items {
-			parts := strings.Split(t.Name, ".")
-			if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
-				if len(parts) > index+2 {
-					// blk and mm should have a number after them, join it
-					parts = append(
-						[]string{strings.Join(parts[:index+2], ".")},
-						parts[index+2:]...)
-				}
-			}
-
-			if _, ok := ts.layers[parts[0]]; !ok {
-				ts.layers[parts[0]] = make(Layer)
-			}
-
-			ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
-		}
-	})
-
-	return ts.layers
+func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
+	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
+		key = kv.Architecture() + "." + key
+	}
+
+	if val, ok := kv[key]; ok {
+		return val.(T)
+	}
+
+	slog.Warn("key not found", "key", key, "default", defaultValue[0])
+	return defaultValue[0]
+}
+
+type Tensors struct {
+	items  []*Tensor
+	Offset uint64
+}
+
+func (s Tensors) Items(prefix ...string) []*Tensor {
+	if len(prefix) == 0 {
+		return s.items
+	}
+
+	var items []*Tensor
+	for _, t := range s.items {
+		if strings.HasPrefix(t.Name, prefix[0]) {
+			items = append(items, t)
+		}
+	}
+
+	return items
+}
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts.items {
+		parts := strings.Split(t.Name, ".")
+		if i := slices.Index(parts, "blk"); i > 0 {
+			parts = append([]string{
+				strings.Join(parts[:i], "."),
+				strings.Join(parts[i:i+2], "."),
+			}, parts[i+2:]...)
+		} else if i == 0 {
+			parts = append([]string{
+				strings.Join(parts[i:i+2], "."),
+			}, parts[i+2:]...)
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
 }
 
 type Layer map[string]*Tensor
 
-func (l Layer) size() (size uint64) {
+func (l Layer) Size() (size uint64) {
 	for _, t := range l {
 		size += t.Size()
 	}
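The new accessors replace the old per-architecture `fmt.Sprintf` lookups: `keyValue` prepends the architecture for any key outside `general.` and `tokenizer.`, and falls back to the supplied default (with a warning) when the key is missing. A small sketch against a hand-built metadata map:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	kv := ggml.KV{
		"general.architecture": "llama",
		"llama.block_count":    uint32(32),
		"llama.context_length": uint32(4096),
	}

	fmt.Println(kv.Architecture())  // "llama"
	fmt.Println(kv.BlockCount())    // 32, read from "llama.block_count"
	fmt.Println(kv.ContextLength()) // 4096, read from "llama.context_length"

	// Missing key: returns the default (1) and logs a "key not found" warning.
	fmt.Println(kv.Uint("attention.head_count_kv", 1))
}
```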
@@ -255,8 +278,6 @@ func (t Tensor) typeSize() uint64 {
 		return 8
 	case 29: // IQ1_M
 		return blockSize/8 + blockSize/16 + blockSize/32
-	case 30: // BF16
-		return 2
 	default:
 		return 0
 	}
@@ -295,7 +316,7 @@ const (
 
 var ErrUnsupportedFormat = errors.New("unsupported model format")
 
-func DetectGGMLType(b []byte) string {
+func DetectContentType(b []byte) string {
 	switch binary.LittleEndian.Uint32(b[:4]) {
 	case FILE_MAGIC_GGML:
 		return "ggml"
@@ -312,12 +333,12 @@ func DetectGGMLType(b []byte) string {
 	}
 }
 
-// DecodeGGML decodes a GGML model from the given reader.
+// Decode decodes a GGML model from the given reader.
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
-func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	if maxArraySize == 0 {
 		maxArraySize = 1024
 	}
@@ -331,10 +352,6 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 
 	var c container
 	switch magic {
-	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, 0, ErrUnsupportedFormat
-	case FILE_MAGIC_GGLA:
-		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
 		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
 	case FILE_MAGIC_GGUF_BE:
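The renamed entry point in use, as a minimal sketch; the model path is a placeholder and the `math.MaxInt` argument matches how the convert tests later in this diff call it:

```go
package main

import (
	"fmt"
	"math"
	"os"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	f, err := os.Open("model.gguf") // placeholder path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Decode replaces DecodeGGML; with the legacy container cases removed,
	// non-GGUF files now fail here.
	m, _, err := ggml.Decode(f, math.MaxInt)
	if err != nil {
		panic(err)
	}

	kv := m.KV()
	fmt.Println(kv.Architecture(), len(m.Tensors().Items()))
}
```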
@@ -530,21 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 }
 
 // SupportsKVCacheType checks if the requested cache type is supported
-func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
-	validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
-	return slices.Contains(validKVCacheTypes, cacheType)
+func (llm GGML) SupportsKVCacheType(cacheType string) bool {
+	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
-func (ggml GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
+func (llm GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}
 
 	// Check head counts match and are non-zero
-	headCountK := ggml.KV().EmbeddingHeadCountK()
-	headCountV := ggml.KV().EmbeddingHeadCountV()
+	headCountK := llm.KV().EmbeddingHeadCountK()
+	headCountV := llm.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }
 
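These two capability checks are typically consulted together before configuring a runner. A hypothetical helper combining them, not part of this change:

```go
package demo

import "github.com/ollama/ollama/fs/ggml"

// chooseKVCache keeps the requested quantized cache type only when the model
// supports flash attention and that cache type; otherwise it falls back to f16.
func chooseKVCache(m *ggml.GGML, requested string) string {
	if m.SupportsFlashAttention() && m.SupportsKVCacheType(requested) {
		return requested
	}
	return "f16"
}
```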
@@ -1,4 +1,4 @@
-package llm
+package ggml
 
 import (
 	"bytes"
@@ -8,10 +8,9 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"maps"
 	"slices"
 	"strings"
-
-	"golang.org/x/exp/maps"
 )
 
 type containerGGUF struct {
@@ -110,9 +109,9 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }
 
-func (llm *gguf) Tensors() *Tensors {
-	return &Tensors{
-		Items:  llm.tensors,
+func (llm *gguf) Tensors() Tensors {
+	return Tensors{
+		items:  llm.tensors,
 		Offset: llm.tensorOffset,
 	}
 }
@@ -523,7 +522,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 		return err
 	}
 
-	keys := maps.Keys(kv)
+	keys := slices.Collect(maps.Keys(kv))
 	slices.Sort(keys)
 
 	for _, key := range keys {
|
|||||||
package llm
|
package ggml
|
||||||
|
|
||||||
import "fmt"
|
import "fmt"
|
||||||
|
|
||||||
@ -32,10 +32,9 @@ const (
|
|||||||
fileTypeIQ1_S
|
fileTypeIQ1_S
|
||||||
fileTypeIQ4_NL
|
fileTypeIQ4_NL
|
||||||
fileTypeIQ3_S
|
fileTypeIQ3_S
|
||||||
fileTypeIQ3_M
|
|
||||||
fileTypeIQ2_S
|
fileTypeIQ2_S
|
||||||
fileTypeIQ2_M
|
|
||||||
fileTypeIQ4_XS
|
fileTypeIQ4_XS
|
||||||
|
fileTypeIQ2_M
|
||||||
fileTypeIQ1_M
|
fileTypeIQ1_M
|
||||||
fileTypeBF16
|
fileTypeBF16
|
||||||
|
|
||||||
@ -94,8 +93,6 @@ func ParseFileType(s string) (fileType, error) {
|
|||||||
return fileTypeIQ4_NL, nil
|
return fileTypeIQ4_NL, nil
|
||||||
case "IQ3_S":
|
case "IQ3_S":
|
||||||
return fileTypeIQ3_S, nil
|
return fileTypeIQ3_S, nil
|
||||||
case "IQ3_M":
|
|
||||||
return fileTypeIQ3_M, nil
|
|
||||||
case "IQ2_S":
|
case "IQ2_S":
|
||||||
return fileTypeIQ2_S, nil
|
return fileTypeIQ2_S, nil
|
||||||
case "IQ4_XS":
|
case "IQ4_XS":
|
||||||
@ -163,8 +160,6 @@ func (t fileType) String() string {
|
|||||||
return "IQ4_NL"
|
return "IQ4_NL"
|
||||||
case fileTypeIQ3_S:
|
case fileTypeIQ3_S:
|
||||||
return "IQ3_S"
|
return "IQ3_S"
|
||||||
case fileTypeIQ3_M:
|
|
||||||
return "IQ3_M"
|
|
||||||
case fileTypeIQ2_S:
|
case fileTypeIQ2_S:
|
||||||
return "IQ2_S"
|
return "IQ2_S"
|
||||||
case fileTypeIQ4_XS:
|
case fileTypeIQ4_XS:
|
3 go.mod
@@ -17,12 +17,14 @@ require (
 require (
 	github.com/agnivade/levenshtein v1.1.1
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
+	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
 	github.com/google/go-cmp v0.6.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	golang.org/x/image v0.22.0
+	gonum.org/v1/gonum v0.15.0
 )
 
 require (
@@ -42,7 +44,6 @@ require (
 	github.com/xtgo/set v1.0.0 // indirect
 	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
-	gonum.org/v1/gonum v0.15.0 // indirect
 	gorgonia.org/vecf32 v0.9.0 // indirect
 	gorgonia.org/vecf64 v0.9.0 // indirect
 )
2 go.sum
@@ -42,6 +42,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
+github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
+github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
 github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
 github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
|||||||
```shell
|
```shell
|
||||||
# go doesn't recognize `-mfma` as a valid compiler flag
|
# go doesn't recognize `-mfma` as a valid compiler flag
|
||||||
# see https://github.com/golang/go/issues/17895
|
# see https://github.com/golang/go/issues/17895
|
||||||
go env -w "CGO_CFLAGS_ALLOW=-mfma|-mf16c"
|
go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
|
||||||
go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c"
|
|
||||||
go build -tags=avx,avx2 .
|
go build -tags=avx,avx2 .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
34
llama/amx.h
vendored
34
llama/amx.h
vendored
@ -1,34 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
#include "ggml-cpu-impl.h"
|
|
||||||
|
|
||||||
// GGML internal header
|
|
||||||
|
|
||||||
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
|
||||||
#endif
|
|
51
llama/ggml-blas.h
vendored
51
llama/ggml-blas.h
vendored
@ -1,51 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// backend API
|
|
||||||
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
|
|
||||||
|
|
||||||
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
|
||||||
|
|
||||||
// number of threads used for conversion to float
|
|
||||||
// for openblas and blis, this will also set the number of threads used for blas operations
|
|
||||||
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
|
||||||
|
|
||||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
34
llama/ggml-cpu-aarch64.h
vendored
34
llama/ggml-cpu-aarch64.h
vendored
@ -1,34 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml-cpu-traits.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
// GGML internal header
|
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
|
|
64
llama/ggml-cpu-traits.h
vendored
64
llama/ggml-cpu-traits.h
vendored
@ -1,64 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#include "ggml-backend-impl.h"
|
|
||||||
#include "ggml-cpu-impl.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
# include <vector>
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// return true if op part of extra "accelerator"
|
|
||||||
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
|
|
||||||
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace ggml::cpu {
|
|
||||||
// register in tensor->extra
|
|
||||||
class tensor_traits {
|
|
||||||
public:
|
|
||||||
virtual ~tensor_traits();
|
|
||||||
virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
|
|
||||||
virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
class extra_buffer_type {
|
|
||||||
public:
|
|
||||||
virtual ~extra_buffer_type();
|
|
||||||
virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
|
|
||||||
virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
|
|
||||||
};
|
|
||||||
} // namespace ggml::cpu
|
|
||||||
|
|
||||||
// implemented in ggml-cpu.cpp.
|
|
||||||
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
|
|
||||||
|
|
||||||
#endif
|
|
31
llama/ggml-cuda/acc.cuh
vendored
31
llama/ggml-cuda/acc.cuh
vendored
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "common.cuh"
|
|
||||||
|
|
||||||
#define CUDA_ACC_BLOCK_SIZE 256
|
|
||||||
|
|
||||||
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
60
llama/ggml-cuda/arange.cu
vendored
60
llama/ggml-cuda/arange.cu
vendored
@ -1,60 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "arange.cuh"
|
|
||||||
|
|
||||||
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
|
|
||||||
// blockIDx.x: idx of ne0 / BLOCK_SIZE
|
|
||||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
|
||||||
if (nidx >= ne0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
dst[nidx] = start + step * nidx;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
|
|
||||||
int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
|
|
||||||
arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
||||||
float * dst_d = (float *)dst->data;
|
|
||||||
cudaStream_t stream = ctx.stream();
|
|
||||||
|
|
||||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
float start;
|
|
||||||
float stop;
|
|
||||||
float step;
|
|
||||||
memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
|
|
||||||
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
|
|
||||||
memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
|
|
||||||
|
|
||||||
int64_t steps = (int64_t)ceil((stop - start) / step);
|
|
||||||
GGML_ASSERT(ggml_nelements(dst) == steps);
|
|
||||||
|
|
||||||
arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
|
|
||||||
}
|
|
31
llama/ggml-cuda/arange.cuh
vendored
31
llama/ggml-cuda/arange.cuh
vendored
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "common.cuh"
|
|
||||||
|
|
||||||
#define CUDA_ARANGE_BLOCK_SIZE 256
|
|
||||||
|
|
||||||
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
29
llama/ggml-cuda/argmax.cuh
vendored
29
llama/ggml-cuda/argmax.cuh
vendored
@ -1,29 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "common.cuh"
|
|
||||||
|
|
||||||
void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
29
llama/ggml-cuda/argsort.cuh
vendored
29
llama/ggml-cuda/argsort.cuh
vendored
@ -1,29 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "common.cuh"
|
|
||||||
|
|
||||||
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
35
llama/ggml-cuda/binbcast.cuh
vendored
35
llama/ggml-cuda/binbcast.cuh
vendored
@ -1,35 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "common.cuh"
|
|
||||||
|
|
||||||
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
||||||
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
||||||
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
||||||
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
||||||
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
||||||
|
|
||||||
void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
60
llama/ggml-cuda/clamp.cu
vendored
60
llama/ggml-cuda/clamp.cu
vendored
@ -1,60 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "clamp.cuh"
|
|
||||||
|
|
||||||
static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
|
|
||||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
|
||||||
|
|
||||||
if (i >= k) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
|
|
||||||
const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
|
|
||||||
clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
||||||
const ggml_tensor * src0 = dst->src[0];
|
|
||||||
const float * src0_d = (const float *)src0->data;
|
|
||||||
float * dst_d = (float *)dst->data;
|
|
||||||
cudaStream_t stream = ctx.stream();
|
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
float min;
|
|
||||||
float max;
|
|
||||||
memcpy(&min, dst->op_params, sizeof(float));
|
|
||||||
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
|
||||||
|
|
||||||
clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
|
|
||||||
}
|
|
31 llama/ggml-cuda/clamp.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_CLAMP_BLOCK_SIZE 256

void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/concat.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_CONCAT_BLOCK_SIZE 256

void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/conv-transpose-1d.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256

void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
39 llama/ggml-cuda/convert.cuh vendored
@@ -1,39 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_DEQUANTIZE_BLOCK_SIZE 256

template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);

typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;

to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);

to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
31 llama/ggml-cuda/count-equal.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128

void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
35 llama/ggml-cuda/cpy.cuh vendored
@@ -1,35 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_CPY_BLOCK_SIZE 64

void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);

void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
33 llama/ggml-cuda/cross-entropy-loss.cuh vendored
@@ -1,33 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256

void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/diagmask.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32

void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
29 llama/ggml-cuda/fattn-tile-f16.cuh vendored
@@ -1,29 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
29 llama/ggml-cuda/fattn-tile-f32.cuh vendored
@@ -1,29 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
29 llama/ggml-cuda/fattn.cuh vendored
@@ -1,29 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/getrows.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_GET_ROWS_BLOCK_SIZE 256

void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/im2col.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_IM2COL_BLOCK_SIZE 256

void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
38 llama/ggml-cuda/mmv.cuh vendored
@@ -1,38 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512

void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

void ggml_cuda_op_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);
35 llama/ggml-cuda/mmvq.cuh vendored
@@ -1,35 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.

void ggml_cuda_op_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);
33 llama/ggml-cuda/norm.cuh vendored
@@ -1,33 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/opt-step-adamw.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256

void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
29 llama/ggml-cuda/out-prod.cuh vendored
@@ -1,29 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
32 llama/ggml-cuda/pad.cuh vendored
@@ -1,32 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_PAD_BLOCK_SIZE 256

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/pool2d.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_POOL2D_BLOCK_SIZE 256

void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
50 llama/ggml-cuda/quantize.cuh vendored
@@ -1,50 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#pragma once

#include "common.cuh"
#include "mmq.cuh"

#include <cstdint>

#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128

static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");

typedef void (*quantize_cuda_t)(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);

void quantize_row_q8_1_cuda(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);

void quantize_mmq_q8_1_cuda(
    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
    const ggml_type type_x, cudaStream_t stream);
31 llama/ggml-cuda/rope.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_ROPE_BLOCK_SIZE 256

void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
57 llama/ggml-cuda/scale.cu vendored
@@ -1,57 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "scale.cuh"

static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = scale * x[i];
}

static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
}

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float scale;
    memcpy(&scale, dst->op_params, sizeof(float));

    scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
}
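The scale factor above is not passed in from the graph as a kernel argument directly; its bytes are stored in the tensor's op_params and read back with memcpy, which avoids type-punning through pointer casts. A minimal standalone sketch of that round-trip, assuming ggml's usual int32-backed op_params array (names here are illustrative and not part of this diff):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // Stand-in for ggml_tensor::op_params: a small int32 scratch area on the tensor.
    int32_t op_params[16] = {0};

    // Producer side: store the float's bytes into the first slot.
    const float scale_in = 0.125f;
    memcpy(&op_params[0], &scale_in, sizeof(float));

    // Consumer side (as in ggml_cuda_op_scale): copy the bytes back out.
    float scale_out;
    memcpy(&scale_out, op_params, sizeof(float));

    printf("scale = %f\n", scale_out);  // prints 0.125000
    return 0;
}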
31 llama/ggml-cuda/scale.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_SCALE_BLOCK_SIZE 256

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/softmax.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

#define CUDA_SOFT_MAX_BLOCK_SIZE 1024

void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
31 llama/ggml-cuda/sum.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
65 llama/ggml-cuda/sumrows.cu vendored
@@ -1,65 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "sumrows.cuh"

static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    float sum = 0.0f;
    for (int i = col; i < ncols; i += blockDim.x) {
        sum += x[row * ncols + i];
    }

    sum = warp_reduce_sum(sum);

    if (col == 0) {
        dst[row] = sum;
    }
}

void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);
    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
}

void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
}
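In the removed kernel each block handles one row: the threads stride across the columns accumulating partial sums, then warp_reduce_sum (provided by common.cuh) folds the per-thread partials into lane 0 before the single write to dst. A self-contained sketch of the same pattern with an explicit shuffle-based warp reduction; this is illustrative only, the names are not from the diff, and it assumes a single 32-thread warp per row:

#include <cuda_runtime.h>
#include <cstdio>

#define WARP_SIZE 32

// Butterfly reduction across one warp using shuffles, analogous to warp_reduce_sum.
static __device__ float warp_sum(float v) {
    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
        v += __shfl_xor_sync(0xffffffff, v, offset);
    }
    return v;
}

// One block (a single warp here) per row; each thread strides over the columns.
static __global__ void row_sum(const float * x, float * dst, int ncols) {
    const int row = blockIdx.x;
    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        sum += x[row * ncols + col];
    }
    sum = warp_sum(sum);
    if (threadIdx.x == 0) {
        dst[row] = sum;
    }
}

int main() {
    const int nrows = 4, ncols = 1000;
    float *x, *dst;
    cudaMallocManaged(&x, nrows * ncols * sizeof(float));
    cudaMallocManaged(&dst, nrows * sizeof(float));
    for (int i = 0; i < nrows * ncols; ++i) x[i] = 1.0f;

    row_sum<<<nrows, WARP_SIZE>>>(x, dst, ncols);
    cudaDeviceSynchronize();
    printf("row 0 sum = %f (expected %d)\n", dst[0], ncols);

    cudaFree(x);
    cudaFree(dst);
    return 0;
}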
31 llama/ggml-cuda/sumrows.cuh vendored
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

#include "common.cuh"

void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);

void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
@@ -1,31 +0,0 @@
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 * (MIT License, Copyright (c) 2023-2024 The ggml authors)
 */

// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
|
|
@ -1,31 +0,0 @@
|
|||||||
/**
|
|
||||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
|
||||||
|
|
||||||
#include "../fattn-vec-f16.cuh"
|
|
||||||
|
|
||||||
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
|
|
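For context, each deleted file is a one-line explicit template instantiation generated by generate_cu_files.py: splitting the (K type, V type) combinations across files lets every instantiation compile in its own translation unit. A minimal, self-contained sketch of that pattern, with stand-in types and an assumed macro body (the real definitions live in fattn-vec-f16.cuh and may differ):

// Sketch only: stand-in declarations, not the vendored ggml/llama.cpp code.
struct ggml_backend_cuda_context;                       // opaque stand-in for the backend context
struct ggml_tensor;                                     // opaque stand-in for a ggml tensor
enum ggml_type { GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
                 GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0 };  // stand-in for ggml's type enum

// One function template covers every (head size D, K type, V type) combination.
template <int D, ggml_type type_K, ggml_type type_V>
void flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & /*ctx*/, ggml_tensor * /*dst*/) {
    // The vendored implementation launches the vectorized flash-attention kernel for
    // head size D with K/V caches stored as type_K/type_V; the body is omitted here.
}

// The DECL_* macro expands to one explicit template instantiation.
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \
    template void flash_attn_ext_vec_f16_case<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst)

// Each generated .cu file contains exactly one such line, so each type combination
// becomes its own translation unit and the costly instantiations build in parallel:
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);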
Some files were not shown because too many files have changed in this diff.