Compare commits

57 commits (main...jmorganca/)

Commits (SHA1):
5c76074f66, 18d52686de, 2d2eb5903d, 533f4c41bd, 31b2c06393, 4ae23deb50, 5d3da85a16,
8b64b456c1, 684f0d9291, 3308bff137, bf1929a3bc, 1a2c413225, 57279f89a2, 9ceee25d8b,
661bf04696, 2521a55ae6, 32948ec952, 9876c8453a, 919b3d6e21, 16b13e0cfc, 75441c56f3,
45f96e898d, 7c555d394c, 39ee6d2bd0, 47705b5168, 698a92aa4a, 150c499cae, f1257a7de4,
b68af0370f, ca981c8a49, b3da8a319e, 359e1d5b19, bde6b46ce9, ff1f74534b, 104f802df1,
eed0ac2948, fcfad744ff, fb3c16f2a2, ee869f35e4, ff5d1a3dc0, 88b231f903, 7e920c8d75,
dd8c619fba, 2af76d0e7a, 8d901825f0, 04936b719f, 0f0136d419, 80498f76de, f8b48aa784,
5ff0d538b0, eedc969c35, 963531215e, 3fe090f447, 1704072746, c1f9bcb4dd, 198b1e6db9,
51ad65f831
.github/workflows/release.yaml (vendored) — 6 changes

@@ -103,11 +103,6 @@ jobs:
   arch: [amd64]
   preset: ['CPU']
   include:
-  - os: windows
-    arch: amd64
-    preset: 'CUDA 11'
-    install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
-    cuda-version: '11.3'
   - os: windows
     arch: amd64
     preset: 'CUDA 12'

@@ -324,7 +319,6 @@ jobs:
   case "$COMPONENT" in
     bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
     lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-    lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
     lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
     lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
     lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
.github/workflows/test.yaml (vendored) — 6 changes

@@ -46,7 +46,7 @@ jobs:
   include:
   - preset: CPU
   - preset: CUDA
-    container: nvidia/cuda:11.8.0-devel-ubuntu22.04
+    container: nvidia/cuda:12.8.1-devel-ubuntu22.04
     flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
   - preset: ROCm
     container: rocm/dev-ubuntu-22.04:6.1.2

@@ -78,7 +78,7 @@ jobs:
   include:
   - preset: CPU
   - preset: CUDA
-    install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+    install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
     flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
   - preset: ROCm
     install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe

@@ -102,7 +102,7 @@ jobs:
   $ErrorActionPreference = "Stop"
   if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
     Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-    Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
+    Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
   }

   $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
CMake presets (file path not shown)

@@ -17,14 +17,6 @@
     "name": "CUDA",
     "inherits": [ "Default" ]
   },
-  {
-    "name": "CUDA 11",
-    "inherits": [ "CUDA" ],
-    "cacheVariables": {
-      "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
-      "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
-    }
-  },
   {
     "name": "CUDA 12",
     "inherits": [ "CUDA" ],

@@ -78,11 +70,6 @@
     "configurePreset": "CUDA",
     "targets": [ "ggml-cuda" ]
   },
-  {
-    "name": "CUDA 11",
-    "inherits": [ "CUDA" ],
-    "configurePreset": "CUDA 11"
-  },
   {
     "name": "CUDA 12",
     "inherits": [ "CUDA" ],
Dockerfile — 17 changes

@@ -7,14 +7,10 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2

-# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
-    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
-    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
-    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
+    && dnf install -y ccache \
     && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache

@@ -38,15 +34,6 @@ RUN --mount=type=cache,target=/root/.ccache \
     && cmake --build --parallel --preset 'CPU' \
     && cmake --install build --component CPU --strip --parallel 8

-FROM base AS cuda-11
-ARG CUDA11VERSION=11.3
-RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
-ENV PATH=/usr/local/cuda-11/bin:$PATH
-RUN --mount=type=cache,target=/root/.ccache \
-    cmake --preset 'CUDA 11' \
-    && cmake --build --parallel --preset 'CUDA 11' \
-    && cmake --install build --component CUDA --strip --parallel 8
-
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}

@@ -98,11 +85,9 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
     go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12

 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
Sync Makefile (file path not shown)

@@ -15,13 +15,11 @@ help:
     @echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"

 .PHONY: sync
-sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml

-llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
-    sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
-ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
-    go generate ./$(@D)
+.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in
+    sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@

 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/

@@ -32,13 +30,12 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
     rsync -arvzc -f "merge $@/.rsync-filter" $< $@

 PATCHES=$(wildcard llama/patches/*.patch)
-PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))

 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(PATCHED)
+apply-patches: $(addsuffix ed, $(PATCHES))

-llama/patches/.%.patched: llama/patches/%.patch
+%.patched: %.patch
     @if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi

 .PHONY: checkout

@@ -60,4 +57,4 @@ format-patches: llama/patches

 .PHONE: clean
 clean: checkout
-    $(RM) llama/patches/.*.patched
+    $(RM) $(addsuffix ed, $(PATCHES))
Interactive CLI image handling (Go; file path not shown)

@@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
     fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")

     if opts.MultiModal {
-        fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
+        fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
     }

     fmt.Fprintln(os.Stderr, "")

@@ -511,7 +511,7 @@ func extractFileNames(input string) []string {
     // Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
     // and followed by more characters and a file extension
     // This will capture non filename strings, but we'll check for file existence to remove mismatches
-    regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
+    regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
     re := regexp.MustCompile(regexPattern)

     return re.FindAllString(input, -1)

@@ -553,7 +553,7 @@ func getImageData(filePath string) ([]byte, error) {
     }

     contentType := http.DetectContentType(buf)
-    allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
+    allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
     if !slices.Contains(allowedTypes, contentType) {
         return nil, fmt.Errorf("invalid image type: %s", contentType)
     }
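The narrowed pattern above drops `webp` from both the path regex and the allowed MIME types. Below is a minimal, standalone sketch of how the new regex behaves; the `main` wrapper and the sample input are illustrative, not part of the diff.

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Pattern from the new version of extractFileNames: webp/WEBP no longer matches.
	pattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
	re := regexp.MustCompile(pattern)

	input := `look at ./photos/cat.png and /tmp/dog.webp please`
	fmt.Println(re.FindAllString(input, -1)) // [./photos/cat.png] — the .webp path is ignored
}
```

Paths ending in `.webp` simply stop matching, which is why the accompanying test expectations shrink from 7 and 13 matches to 5 and 10.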
Tests for extractFileNames (file path not shown)

@@ -12,17 +12,14 @@ func TestExtractFilenames(t *testing.T) {
     // Unix style paths
     input := ` some preamble
 ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
-/unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
     res := extractFileNames(input)
-    assert.Len(t, res, 7)
+    assert.Len(t, res, 5)
     assert.Contains(t, res[0], "one.png")
     assert.Contains(t, res[1], "two.jpg")
     assert.Contains(t, res[2], "three.jpeg")
     assert.Contains(t, res[3], "four.png")
     assert.Contains(t, res[4], "five.JPG")
-    assert.Contains(t, res[5], "six.webp")
-    assert.Contains(t, res[6], "seven.WEBP")
     assert.NotContains(t, res[4], '"')
     assert.NotContains(t, res, "inbetween1")
     assert.NotContains(t, res, "./1.svg")

@@ -33,12 +30,10 @@ func TestExtractFilenames(t *testing.T) {
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
 d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
-d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
-c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
-d:\path with\spaces\thirteen.WEBP some ending
+d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
 `
     res = extractFileNames(input)
-    assert.Len(t, res, 13)
+    assert.Len(t, res, 10)
     assert.NotContains(t, res, "inbetween2")
     assert.Contains(t, res[0], "one.png")
     assert.Contains(t, res[0], "c:")

@@ -56,12 +51,6 @@ d:\path with\spaces\thirteen.WEBP some ending
     assert.Contains(t, res[8], "d:")
     assert.Contains(t, res[9], "ten.PNG")
     assert.Contains(t, res[9], "E:")
-    assert.Contains(t, res[10], "eleven.webp")
-    assert.Contains(t, res[10], "c:")
-    assert.Contains(t, res[11], "twelve.WebP")
-    assert.Contains(t, res[11], "c:")
-    assert.Contains(t, res[12], "thirteen.WEBP")
-    assert.Contains(t, res[12], "d:")
 }

 // Ensure that file paths wrapped in single quotes are removed with the quotes.
Model converter (package convert; file path not shown)

@@ -1,7 +1,6 @@
 package convert

 import (
-    "cmp"
     "encoding/json"
     "errors"
     "fmt"

@@ -15,12 +14,13 @@ import (
 )

 type ModelParameters struct {
     Architectures []string `json:"architectures"`
     VocabSize     uint32   `json:"vocab_size"`
+    TextModel     TextParameters `json:"text_config"`
+}

-    TextModel struct {
+type TextParameters struct {
     VocabSize uint32 `json:"vocab_size"`
-    } `json:"text_config"`
 }

 type AdapterParameters struct {

@@ -173,8 +173,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
     switch p.Architectures[0] {
     case "LlamaForCausalLM":
         conv = &llamaModel{}
-    case "MllamaForConditionalGeneration":
-        conv = &mllamaModel{}
     case "Llama4ForConditionalGeneration":
         conv = &llama4Model{}
     case "Mistral3ForConditionalGeneration":

@@ -216,22 +214,24 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
         return err
     }

-    vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
+    vocabSize := int(p.VocabSize)
+    if vocabSize == 0 {
+        tVocabSize := int(p.TextModel.VocabSize)
+        vocabSize = tVocabSize
+    }

     switch {
     case vocabSize == 0:
-        slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+        slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
     case vocabSize > len(t.Vocabulary.Tokens):
-        slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+        slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
         for i := range vocabSize - len(t.Vocabulary.Tokens) {
             t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
             t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
             t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
         }
     case vocabSize < len(t.Vocabulary.Tokens):
-        slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
-        p.VocabSize = uint32(len(t.Vocabulary.Tokens))
-        p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
+        return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
     default:
         slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
     }
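The vocabulary-size fallback is the same on both sides of the hunk above; one side spells it with the standard library's `cmp.Or`, the other with an explicit zero check. A small standalone sketch of that equivalence (the variable names here are illustrative):

```go
package main

import (
	"cmp"
	"fmt"
)

func main() {
	var topLevel, textConfig uint32 = 0, 32000

	// cmp.Or returns the first argument that is not the zero value, so a
	// missing top-level vocab_size falls back to text_config.vocab_size.
	fmt.Println(cmp.Or(topLevel, textConfig)) // 32000

	// The expanded form on the other side of the diff behaves the same way.
	vocabSize := int(topLevel)
	if vocabSize == 0 {
		vocabSize = int(textConfig)
	}
	fmt.Println(vocabSize) // 32000
}
```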
Deleted file: mllama converter in package convert (file path not shown)

@@ -1,160 +0,0 @@ (entire file removed)
package convert

import (
    "strings"

    "github.com/ollama/ollama/fs/ggml"
    "github.com/pdevine/tensor"
    "github.com/pdevine/tensor/native"
)

type mllamaModel struct {
    ModelParameters
    TextModel struct {
        llamaModel

        CrossAttentionLayers []int32 `json:"cross_attention_layers"`
    } `json:"text_config"`
    VisionModel struct {
        NumHiddenLayers uint32 `json:"num_hidden_layers"`
        NumGlobalLayers uint32 `json:"num_global_layers"`
        IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`

        HiddenSize uint32 `json:"hidden_size"`
        IntermediateSize uint32 `json:"intermediate_size"`

        AttentionHeads uint32 `json:"attention_heads"`

        ImageSize uint32 `json:"image_size"`
        PatchSize uint32 `json:"patch_size"`
        NumChannels uint32 `json:"num_channels"`
        MaxNumTiles uint32 `json:"max_num_tiles"`
        NormEpsilon float32 `json:"norm_eps"`
        RopeTheta float32 `json:"rope.freq_base"`
    } `json:"vision_config"`
}

func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
    kv := m.ModelParameters.KV(t)
    kv["general.architecture"] = "mllama"

    for k, v := range m.TextModel.KV(t) {
        if strings.HasPrefix(k, "llama.") {
            kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
        }
    }

    kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers

    kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
    kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
    kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices

    kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
    kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize

    kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
    kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon

    kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
    kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize
    kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
    kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels

    return kv
}

func (m *mllamaModel) Replacements() []string {
    return append(
        m.TextModel.Replacements(),
        "language_model.", "",
        "gate_attn", "attn_gate",
        "gate_ffn", "ffn_gate",
        "cross_attn.", "cross_attn_",
        "vision_model", "v",
        "class_embedding", "class_embd",
        "patch_embedding", "patch_embd",
        "gated_positional_embedding.tile_embedding", "tile_position_embd",
        "gated_positional_embedding.embedding", "position_embd.weight",
        "gated_positional_embedding", "position_embd",
        "embedding.weight", "weight",
        "pre_tile_positional_embedding", "pre_tile_position_embd",
        "post_tile_positional_embedding", "post_tile_position_embd",
        "layernorm_pre", "pre_ln",
        "layernorm_post", "post_ln",
        "global_transformer.layers", "global.blk",
        "transformer.layers", "blk",
        "mlp.fc1", "ffn_up",
        "mlp.fc2", "ffn_down",
        "multi_modal_projector", "mm.0",
    )
}

func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
    var out []*ggml.Tensor
    var text []Tensor
    for _, t := range ts {
        if t.Name() == "v.position_embd.gate" {
            for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
                tt := t.Clone()
                tt.SetRepacker(m.repack(name))
                out = append(out, &ggml.Tensor{
                    Name: name,
                    Kind: t.Kind(),
                    Shape: t.Shape(),
                    WriterTo: tt,
                })
            }
        } else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
            t.SetRepacker(m.repack(t.Name()))
            out = append(out, &ggml.Tensor{
                Name: t.Name(),
                Kind: t.Kind(),
                Shape: t.Shape(),
                WriterTo: t,
            })
        } else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
            out = append(out, &ggml.Tensor{
                Name: t.Name(),
                Kind: t.Kind(),
                Shape: t.Shape(),
                WriterTo: t,
            })
        } else {
            text = append(text, t)
        }
    }

    return append(out, m.TextModel.Tensors(text)...)
}

func (m *mllamaModel) repack(name string) Repacker {
    return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
        dims := make([]int, len(shape))
        for i, dim := range shape {
            dims[i] = int(dim)
        }

        var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

        t, err = tensor.Tanh(t)
        if err != nil {
            return nil, err
        }

        if name == "v.position_embd.gate" {
            t, err = tensor.Sub(float32(1), t)
            if err != nil {
                return nil, err
            }
        }

        t = tensor.Materialize(t)
        // flatten tensor so it can be return as a vector
        if err := t.Reshape(t.Shape().TotalSize()); err != nil {
            return nil, err
        }

        return native.VectorF32(t.(*tensor.Dense))
    }
}
Tensor kind selection (file path not shown)

@@ -38,10 +38,7 @@ const (
 func (t tensorBase) Kind() uint32 {
     if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
         t.name == "token_types.weight" ||
-        t.name == "v.positional_embedding_vlm" ||
-        t.name == "v.tile_position_embd.weight" ||
-        t.name == "v.pre_tile_position_embd.weight" ||
-        t.name == "v.post_tile_position_embd.weight" {
+        t.name == "v.positional_embedding_vlm" {
         // these tensors are always F32
         return 0
     }
GPU discovery (package discover; file path not shown)

@@ -3,6 +3,7 @@
 package discover

 import (
+    "fmt"
     "log/slog"
     "os"
     "regexp"

@@ -59,6 +60,8 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {

     // driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
     if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
+        // The detected driver is older than Feb 2023
+        slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
         return "v11"
     }
     return "v12"
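The added warning fires on the same condition that already routed old drivers to the v11 runtime. A self-contained sketch of that gate follows; the function name and `main` harness are illustrative, and the real code reads the version fields from `CudaGPUInfo`.

```go
package main

import (
	"fmt"
	"log/slog"
)

// pickCudaVariant is an illustrative stand-in for discover.cudaVariant's driver
// check: drivers older than 12.1 (i.e. < 12, or exactly 12.0) fall back to v11.
func pickCudaVariant(driverMajor, driverMinor int) string {
	if driverMajor < 12 || (driverMajor == 12 && driverMinor == 0) {
		slog.Warn("old CUDA driver detected - please upgrade to a newer driver",
			"version", fmt.Sprintf("%d.%d", driverMajor, driverMinor))
		return "v11"
	}
	return "v12"
}

func main() {
	fmt.Println(pickCudaVariant(11, 4)) // v11
	fmt.Println(pickCudaVariant(12, 0)) // v11
	fmt.Println(pickCudaVariant(12, 4)) // v12
}
```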
Library path comment (file path not shown)

@@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v11', 'cuda_v12', 'rocm', etc.
+// 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
     exe, err := os.Executable()
     if err != nil {
docs/api.md — 67 changes

@@ -19,7 +19,7 @@
 ### Model names

-Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations

@@ -952,8 +952,19 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo

 | Type | Recommended |
 | --- | :-: |
+| q2_K | |
+| q3_K_L | |
+| q3_K_M | |
+| q3_K_S | |
+| q4_0 | |
+| q4_1 | |
 | q4_K_M | * |
 | q4_K_S | |
+| q5_0 | |
+| q5_1 | |
+| q5_K_M | |
+| q5_K_S | |
+| q6_K | |
 | q8_0 | * |

 ### Examples

@@ -998,8 +1009,8 @@ Quantize a non-quantized model.

 ```shell
 curl http://localhost:11434/api/create -d '{
-  "model": "llama3.2:quantized",
-  "from": "llama3.2:3b-instruct-fp16",
+  "model": "llama3.1:quantized",
+  "from": "llama3.1:8b-instruct-fp16",
   "quantize": "q4_K_M"
 }'
 ```

@@ -1009,14 +1020,12 @@ curl http://localhost:11434/api/create -d '{
 A stream of JSON objects is returned:

 ```json
-{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
-{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
-{"status":"verifying conversion"}
-{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
-{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
-{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
-{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
+{"status":"quantizing F16 model to Q4_K_M"}
+{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
+{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
+{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
 {"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
+{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
 {"status":"writing manifest"}
 {"status":"success"}
 ```

@@ -1154,37 +1163,29 @@ A single JSON object will be returned.
 {
   "models": [
     {
-      "name": "deepseek-r1:latest",
-      "model": "deepseek-r1:latest",
-      "modified_at": "2025-05-10T08:06:48.639712648-07:00",
-      "size": 4683075271,
-      "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
+      "name": "codellama:13b",
+      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
+      "size": 7365960935,
+      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
       "details": {
-        "parent_model": "",
         "format": "gguf",
-        "family": "qwen2",
-        "families": [
-          "qwen2"
-        ],
-        "parameter_size": "7.6B",
-        "quantization_level": "Q4_K_M"
+        "family": "llama",
+        "families": null,
+        "parameter_size": "13B",
+        "quantization_level": "Q4_0"
       }
     },
     {
-      "name": "llama3.2:latest",
-      "model": "llama3.2:latest",
-      "modified_at": "2025-05-04T17:37:44.706015396-07:00",
-      "size": 2019393189,
-      "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
+      "name": "llama3:latest",
+      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
+      "size": 3825819519,
+      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
       "details": {
-        "parent_model": "",
         "format": "gguf",
         "family": "llama",
-        "families": [
-          "llama"
-        ],
-        "parameter_size": "3.2B",
-        "quantization_level": "Q4_K_M"
+        "families": null,
+        "parameter_size": "7B",
+        "quantization_level": "Q4_0"
      }
    }
  ]
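For reference, the documented quantize request can also be issued from Go. This is a rough sketch equivalent to the curl example above, using only the endpoint and fields shown in the docs; the model names are the ones from the example, and the plain `net/http` call is an assumption rather than an official client.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Same payload as the documented curl example for POST /api/create.
	body, _ := json.Marshal(map[string]string{
		"model":    "llama3.1:quantized",
		"from":     "llama3.1:8b-instruct-fp16",
		"quantize": "q4_K_M",
	})
	resp, err := http.Post("http://localhost:11434/api/create", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The server streams JSON status objects, one per line, as shown above.
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```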
GPU documentation (file path not shown)

@@ -1,6 +1,6 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+.
+Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.

 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)

@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
 In the server log, you will see a message that looks something like this (varies from release to release):

 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
 ```

 **Experimental LLM Library Override**
|
@ -6,7 +6,6 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"math"
|
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -126,7 +125,6 @@ func (kv KV) OllamaEngineRequired() bool {
|
|||||||
"gemma3",
|
"gemma3",
|
||||||
"mistral3",
|
"mistral3",
|
||||||
"llama4",
|
"llama4",
|
||||||
"mllama",
|
|
||||||
"qwen25vl",
|
"qwen25vl",
|
||||||
}, kv.Architecture())
|
}, kv.Architecture())
|
||||||
}
|
}
|
||||||
@ -651,29 +649,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
|||||||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||||
embeddingLength*patchSize +
|
embeddingLength*patchSize +
|
||||||
numPatches*numPatches*headCount)
|
numPatches*numPatches*headCount)
|
||||||
case "qwen25vl":
|
|
||||||
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
|
||||||
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
|
|
||||||
temporalPatchSize := uint64(2)
|
|
||||||
|
|
||||||
// Calculate max possible patches based on max_pixels
|
|
||||||
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
|
|
||||||
maxWidth := maxPixels / maxHeight
|
|
||||||
maxGridHeight := maxHeight / patchSize
|
|
||||||
maxGridWidth := maxWidth / patchSize
|
|
||||||
// Account for merged patches (2x2 grid)
|
|
||||||
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
|
|
||||||
|
|
||||||
// Calculate graph size based on typical operations in ProcessImage and createPatches
|
|
||||||
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
|
||||||
// Normalized pixels
|
|
||||||
maxPixels*numChannels +
|
|
||||||
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
|
|
||||||
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
|
|
||||||
// Self-attention calculations (similar to other architectures)
|
|
||||||
numPatches*numPatches*headCount +
|
|
||||||
// Additional buffer for processing
|
|
||||||
embeddingLength*numPatches)
|
|
||||||
case "llama4":
|
case "llama4":
|
||||||
// vision graph is computed independently in the same schedule
|
// vision graph is computed independently in the same schedule
|
||||||
// and is negligible compared to the worst case text graph
|
// and is negligible compared to the worst case text graph
|
||||||
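With `mllama` dropped from the list, the architecture routing reduces to a plain membership test. A sketch of that check follows, under the assumption that the entries visible in the hunk are the whole list; any entries outside the hunk are not reproduced here.

```go
package main

import (
	"fmt"
	"slices"
)

// ollamaEngineRequired mirrors the membership test in KV.OllamaEngineRequired
// after the change; only the architectures visible in the hunk are listed.
func ollamaEngineRequired(architecture string) bool {
	return slices.Contains([]string{
		"gemma3",
		"mistral3",
		"llama4",
		"qwen25vl",
	}, architecture)
}

func main() {
	fmt.Println(ollamaEngineRequired("llama4")) // true
	fmt.Println(ollamaEngineRequired("mllama")) // false after the removal
}
```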
|
125
fs/ggml/type.go
125
fs/ggml/type.go
@ -12,42 +12,42 @@ type FileType uint32
|
|||||||
const (
|
const (
|
||||||
FileTypeF32 FileType = iota
|
FileTypeF32 FileType = iota
|
||||||
FileTypeF16
|
FileTypeF16
|
||||||
fileTypeQ4_0
|
FileTypeQ4_0
|
||||||
fileTypeQ4_1
|
FileTypeQ4_1
|
||||||
fileTypeQ4_1_F16 // unused by GGML
|
fileTypeQ4_1_F16 // unused by GGML
|
||||||
fileTypeQ4_2 // unused by GGML
|
fileTypeQ4_2 // unused by GGML
|
||||||
fileTypeQ4_3 // unused by GGML
|
fileTypeQ4_3 // unused by GGML
|
||||||
FileTypeQ8_0
|
FileTypeQ8_0
|
||||||
fileTypeQ5_0
|
FileTypeQ5_0
|
||||||
fileTypeQ5_1
|
FileTypeQ5_1
|
||||||
fileTypeQ2_K
|
FileTypeQ2_K
|
||||||
fileTypeQ3_K_S
|
FileTypeQ3_K_S
|
||||||
fileTypeQ3_K_M
|
FileTypeQ3_K_M
|
||||||
fileTypeQ3_K_L
|
FileTypeQ3_K_L
|
||||||
FileTypeQ4_K_S
|
FileTypeQ4_K_S
|
||||||
FileTypeQ4_K_M
|
FileTypeQ4_K_M
|
||||||
fileTypeQ5_K_S
|
FileTypeQ5_K_S
|
||||||
fileTypeQ5_K_M
|
FileTypeQ5_K_M
|
||||||
fileTypeQ6_K
|
FileTypeQ6_K
|
||||||
fileTypeIQ2_XXS
|
fileTypeIQ2_XXS // not supported by ollama
|
||||||
fileTypeIQ2_XS
|
fileTypeIQ2_XS // not supported by ollama
|
||||||
fileTypeQ2_K_S
|
FileTypeQ2_K_S
|
||||||
fileTypeIQ3_XS
|
fileTypeIQ3_XS // not supported by ollama
|
||||||
fileTypeIQ3_XXS
|
fileTypeIQ3_XXS // not supported by ollama
|
||||||
fileTypeIQ1_S
|
fileTypeIQ1_S // not supported by ollama
|
||||||
fileTypeIQ4_NL
|
fileTypeIQ4_NL // not supported by ollama
|
||||||
fileTypeIQ3_S
|
fileTypeIQ3_S // not supported by ollama
|
||||||
fileTypeIQ3_M
|
fileTypeIQ3_M // not supported by ollama
|
||||||
fileTypeIQ2_S
|
fileTypeIQ2_S // not supported by ollama
|
||||||
fileTypeIQ2_M
|
fileTypeIQ2_M // not supported by ollama
|
||||||
fileTypeIQ4_XS
|
fileTypeIQ4_XS // not supported by ollama
|
||||||
fileTypeIQ1_M
|
fileTypeIQ1_M // not supported by ollama
|
||||||
FileTypeBF16
|
FileTypeBF16
|
||||||
fileTypeQ4_0_4_4 // unused by GGML
|
fileTypeQ4_0_4_4 // unused by GGML
|
||||||
fileTypeQ4_0_4_8 // unused by GGML
|
fileTypeQ4_0_4_8 // unused by GGML
|
||||||
fileTypeQ4_0_8_8 // unused by GGML
|
fileTypeQ4_0_8_8 // unused by GGML
|
||||||
fileTypeTQ1_0
|
fileTypeTQ1_0 // not supported by ollama
|
||||||
fileTypeTQ2_0
|
fileTypeTQ2_0 // not supported by ollama
|
||||||
|
|
||||||
FileTypeUnknown = 1024
|
FileTypeUnknown = 1024
|
||||||
)
|
)
|
||||||
@ -60,12 +60,36 @@ func ParseFileType(s string) (FileType, error) {
|
|||||||
return FileTypeF32, nil
|
return FileTypeF32, nil
|
||||||
case "F16":
|
case "F16":
|
||||||
return FileTypeF16, nil
|
return FileTypeF16, nil
|
||||||
|
case "Q4_0":
|
||||||
|
return FileTypeQ4_0, nil
|
||||||
|
case "Q4_1":
|
||||||
|
return FileTypeQ4_1, nil
|
||||||
case "Q8_0":
|
case "Q8_0":
|
||||||
return FileTypeQ8_0, nil
|
return FileTypeQ8_0, nil
|
||||||
|
case "Q5_0":
|
||||||
|
return FileTypeQ5_0, nil
|
||||||
|
case "Q5_1":
|
||||||
|
return FileTypeQ5_1, nil
|
||||||
|
case "Q2_K":
|
||||||
|
return FileTypeQ2_K, nil
|
||||||
|
case "Q3_K_S":
|
||||||
|
return FileTypeQ3_K_S, nil
|
||||||
|
case "Q3_K_M":
|
||||||
|
return FileTypeQ3_K_M, nil
|
||||||
|
case "Q3_K_L":
|
||||||
|
return FileTypeQ3_K_L, nil
|
||||||
case "Q4_K_S":
|
case "Q4_K_S":
|
||||||
return FileTypeQ4_K_S, nil
|
return FileTypeQ4_K_S, nil
|
||||||
case "Q4_K_M", "Q4_K":
|
case "Q4_K_M", "Q4_K":
|
||||||
return FileTypeQ4_K_M, nil
|
return FileTypeQ4_K_M, nil
|
||||||
|
case "Q5_K_S":
|
||||||
|
return FileTypeQ5_K_S, nil
|
||||||
|
case "Q5_K_M", "Q5_K":
|
||||||
|
return FileTypeQ5_K_M, nil
|
||||||
|
case "Q6_K":
|
||||||
|
return FileTypeQ6_K, nil
|
||||||
|
case "Q2_K_S":
|
||||||
|
return FileTypeQ2_K_S, nil
|
||||||
case "BF16":
|
case "BF16":
|
||||||
return FileTypeBF16, nil
|
return FileTypeBF16, nil
|
||||||
default:
|
default:
|
||||||
@ -87,41 +111,40 @@ func ParseFileType(s string) (FileType, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (t FileType) String() string {
|
func (t FileType) String() string {
|
||||||
// Note: this routine will return a broader set of file types for existing models
|
|
||||||
switch t {
|
switch t {
|
||||||
case FileTypeF32:
|
case FileTypeF32:
|
||||||
return "F32"
|
return "F32"
|
||||||
case FileTypeF16:
|
case FileTypeF16:
|
||||||
return "F16"
|
return "F16"
|
||||||
case fileTypeQ4_0:
|
case FileTypeQ4_0:
|
||||||
return "Q4_0"
|
return "Q4_0"
|
||||||
case fileTypeQ4_1:
|
case FileTypeQ4_1:
|
||||||
return "Q4_1"
|
return "Q4_1"
|
||||||
case FileTypeQ8_0:
|
case FileTypeQ8_0:
|
||||||
return "Q8_0"
|
return "Q8_0"
|
||||||
case fileTypeQ5_0:
|
case FileTypeQ5_0:
|
||||||
return "Q5_0"
|
return "Q5_0"
|
||||||
case fileTypeQ5_1:
|
case FileTypeQ5_1:
|
||||||
return "Q5_1"
|
return "Q5_1"
|
||||||
case fileTypeQ2_K:
|
case FileTypeQ2_K:
|
||||||
return "Q2_K"
|
return "Q2_K"
|
||||||
case fileTypeQ3_K_S:
|
case FileTypeQ3_K_S:
|
||||||
return "Q3_K_S"
|
return "Q3_K_S"
|
||||||
case fileTypeQ3_K_M:
|
case FileTypeQ3_K_M:
|
||||||
return "Q3_K_M"
|
return "Q3_K_M"
|
||||||
case fileTypeQ3_K_L:
|
case FileTypeQ3_K_L:
|
||||||
return "Q3_K_L"
|
return "Q3_K_L"
|
||||||
case FileTypeQ4_K_S:
|
case FileTypeQ4_K_S:
|
||||||
return "Q4_K_S"
|
return "Q4_K_S"
|
||||||
case FileTypeQ4_K_M:
|
case FileTypeQ4_K_M:
|
||||||
return "Q4_K_M"
|
return "Q4_K_M"
|
||||||
case fileTypeQ5_K_S:
|
case FileTypeQ5_K_S:
|
||||||
return "Q5_K_S"
|
return "Q5_K_S"
|
||||||
case fileTypeQ5_K_M:
|
case FileTypeQ5_K_M:
|
||||||
return "Q5_K_M"
|
return "Q5_K_M"
|
||||||
case fileTypeQ6_K:
|
case FileTypeQ6_K:
|
||||||
return "Q6_K"
|
return "Q6_K"
|
||||||
case fileTypeQ2_K_S:
|
case FileTypeQ2_K_S:
|
||||||
return "Q2_K_S"
|
return "Q2_K_S"
|
||||||
case FileTypeBF16:
|
case FileTypeBF16:
|
||||||
return "BF16"
|
return "BF16"
|
||||||
@ -140,35 +163,35 @@ func (ftype FileType) ToTensorType() TensorType {
|
|||||||
return TensorTypeF32
|
return TensorTypeF32
|
||||||
case FileTypeF16:
|
case FileTypeF16:
|
||||||
return TensorTypeF16
|
return TensorTypeF16
|
||||||
case fileTypeQ4_0:
|
case FileTypeQ4_0:
|
||||||
return TensorTypeQ4_0
|
return TensorTypeQ4_0
|
||||||
case fileTypeQ4_1:
|
case FileTypeQ4_1:
|
||||||
return TensorTypeQ4_1
|
return TensorTypeQ4_1
|
||||||
case FileTypeQ8_0:
|
case FileTypeQ8_0:
|
||||||
return TensorTypeQ8_0
|
return TensorTypeQ8_0
|
||||||
case fileTypeQ5_0:
|
case FileTypeQ5_0:
|
||||||
return TensorTypeQ5_0
|
return TensorTypeQ5_0
|
||||||
case fileTypeQ5_1:
|
case FileTypeQ5_1:
|
||||||
return TensorTypeQ5_1
|
return TensorTypeQ5_1
|
||||||
case fileTypeQ2_K:
|
case FileTypeQ2_K:
|
||||||
return TensorTypeQ2_K
|
return TensorTypeQ2_K
|
||||||
case fileTypeQ3_K_S:
|
case FileTypeQ3_K_S:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case fileTypeQ3_K_M:
|
case FileTypeQ3_K_M:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case fileTypeQ3_K_L:
|
case FileTypeQ3_K_L:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case FileTypeQ4_K_S:
|
case FileTypeQ4_K_S:
|
||||||
return TensorTypeQ4_K
|
return TensorTypeQ4_K
|
||||||
case FileTypeQ4_K_M:
|
case FileTypeQ4_K_M:
|
||||||
return TensorTypeQ4_K
|
return TensorTypeQ4_K
|
||||||
case fileTypeQ5_K_S:
|
case FileTypeQ5_K_S:
|
||||||
return TensorTypeQ5_K
|
return TensorTypeQ5_K
|
||||||
case fileTypeQ5_K_M:
|
case FileTypeQ5_K_M:
|
||||||
return TensorTypeQ5_K
|
return TensorTypeQ5_K
|
||||||
case fileTypeQ6_K:
|
case FileTypeQ6_K:
|
||||||
return TensorTypeQ6_K
|
return TensorTypeQ6_K
|
||||||
case fileTypeQ2_K_S:
|
case FileTypeQ2_K_S:
|
||||||
return TensorTypeQ2_K
|
return TensorTypeQ2_K
|
||||||
case FileTypeBF16:
|
case FileTypeBF16:
|
||||||
return TensorTypeBF16
|
return TensorTypeBF16
|
||||||
|
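With the constants exported, the new `ParseFileType` cases round-trip through `String` and `ToTensorType`. A usage sketch, assuming the package is imported as `github.com/ollama/ollama/fs/ggml` (that import path appears elsewhere in this comparison); the loop and printed format are illustrative.

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	// Quantization names newly accepted by ParseFileType after this change.
	for _, name := range []string{"Q4_0", "Q5_K_M", "Q6_K", "Q2_K_S"} {
		ft, err := ggml.ParseFileType(name)
		if err != nil {
			fmt.Println(name, "not accepted:", err)
			continue
		}
		fmt.Printf("%s -> %v (tensor type %v)\n", name, ft, ft.ToTensorType())
	}
}
```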
llama/llama.cpp/include/llama.h (vendored) — 6 changes

@@ -258,6 +258,7 @@ extern "C" {
     llama_token * token;
     float * embd;
+    int32_t n_embd;
     llama_pos * pos;
     int32_t * n_seq_id;
     llama_seq_id ** seq_id;

@@ -365,6 +366,7 @@ extern "C" {
     bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
     bool no_perf; // whether to measure performance timings
     bool op_offload; // whether to offload host tensor operations to device
+    bool cross_attn; // whether to use cross attention
 };

 // model quantization parameters

@@ -464,6 +466,10 @@ extern "C" {
     struct llama_context_params params),
     "use llama_init_from_model instead");

+    // TODO (jmorganca): this should most likely be passed in as part of a batch
+    // and not set on the context for all batches.
+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
llama/llama.cpp/src/llama-arch.cpp (vendored) — 44 changes

@@ -6,6 +6,7 @@
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_MLLAMA, "mllama" },
     { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI, "deci" },
     { LLM_ARCH_FALCON, "falcon" },

@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
     },
 },
+{
+    LLM_ARCH_MLLAMA,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+        { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+        { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" },
+        { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" },
+        { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" },
+        { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" },
+        { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" },
+        { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" },
+        { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
+        { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" },
+    },
+},
 {
     LLM_ARCH_DECI,
     {

@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
     {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
llama/llama.cpp/src/llama-arch.h (vendored) — 10 changes

@@ -11,6 +11,7 @@
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
+    LLM_ARCH_MLLAMA,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,

@@ -148,6 +149,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

@@ -349,6 +351,14 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_BSKCN_TV,
+    LLM_TENSOR_CROSS_ATTN_K_NORM,
+    LLM_TENSOR_CROSS_ATTN_K_PROJ,
+    LLM_TENSOR_CROSS_ATTN_O_PROJ,
+    LLM_TENSOR_CROSS_ATTN_Q_NORM,
+    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+    LLM_TENSOR_CROSS_ATTN_V_PROJ,
+    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
     LLM_TENSOR_CONV1D,
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
llama/llama.cpp/src/llama-batch.cpp (vendored, 3 changes)
@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one(
         /*n_tokens =*/ n_tokens,
         /*tokens   =*/ tokens,
         /*embd     =*/ nullptr,
+        /*n_embd   =*/ 0,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id   =*/ nullptr,
@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_tokens =*/ 0,
         /*tokens   =*/ nullptr,
         /*embd     =*/ nullptr,
+        /*n_embd   =*/ 0,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id   =*/ nullptr,
@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+        batch.n_embd = embd;
     } else {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }
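[Note] The new n_embd field records the width of an embedding-only batch so downstream code no longer has to reach back into the model hparams. A minimal sketch of how a caller might fill such a batch against this branch; the helper name make_embd_batch is hypothetical:

    // Sketch only; assumes the llama_batch from this branch, which carries n_embd.
    #include "llama.h"
    #include <algorithm>
    #include <vector>

    // Hypothetical helper: wrap n_tokens embeddings of width n_embd in a batch.
    static llama_batch make_embd_batch(const std::vector<float> & embd, int32_t n_tokens, int32_t n_embd) {
        llama_batch batch = llama_batch_init(n_tokens, n_embd, /*n_seq_max=*/ 1);
        // llama_batch_init() now records batch.n_embd = embd when embd != 0;
        // the caller only copies the data and fills positions/seq ids as usual.
        std::copy(embd.begin(), embd.end(), batch.embd);
        batch.n_tokens = n_tokens;
        return batch;
    }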
llama/llama.cpp/src/llama-context.cpp (vendored, 41 changes)
@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
             throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
         }

-        return logits + j*model.vocab.n_tokens();
+        return logits + j*model.hparams.n_vocab;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) {
     cparams.warmup = value;
 }

+void llama_context::set_cross_attn(bool value) {
+    cparams.cross_attn = value;
+}
+
 void llama_context::set_adapter_lora(
             llama_adapter_lora * adapter,
             float scale) {
@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) {

     const int64_t n_embd = hparams.n_embd;

-    llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+    llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);

     const llama_ubatch ubatch = sbatch.split_simple(n_tokens);

@@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) {

     const llama_batch & batch = batch_allocr.batch;

-    const auto & vocab   = model.vocab;
     const auto & hparams = model.hparams;

-    const int32_t n_vocab = vocab.n_tokens();
+    const int32_t n_vocab = hparams.n_vocab;

     const int64_t n_tokens_all = batch.n_tokens;
     const int64_t n_embd       = hparams.n_embd;
@@ -947,12 +950,9 @@ int llama_context::decode(llama_batch & inp_batch) {

         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            kv_self->defrag_sched(-1.0f);
-            kv_self->update(*this);
-            if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-                return 1;
-            }
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+
+            return 1;
         }

         ggml_backend_sched_reset(sched.get());
@@ -1090,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     // make the outputs have the same order they had in the user-provided batch
     // note: this is mostly relevant for recurrent models atm
     if (!sorted_output) {
-        const uint32_t n_vocab = model.vocab.n_tokens();
+        const uint32_t n_vocab = model.hparams.n_vocab;
         const uint32_t n_embd  = model.hparams.n_embd;

         GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -1145,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) {

 int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto & hparams = model.hparams;
-    const auto & vocab   = model.vocab;

     const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());

     const auto n_batch = cparams.n_batch;
-    const auto n_vocab = vocab.n_tokens();
+    const auto n_vocab = hparams.n_vocab;
     const auto n_embd  = hparams.n_embd;

     // TODO: use a per-batch flag for logits presence instead
@@ -1685,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
     {
         LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);

-        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
+        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);

         io.write(&logits_size, sizeof(logits_size));

@@ -1968,12 +1967,9 @@ void llama_context::opt_epoch_iter(

         // TODO: not sure if this is needed
         if (!kv_self->find_slot(ubatch)) {
-            kv_self->defrag_sched(-1.0f);
-            kv_self->update(*this);
-            if (!kv_self->find_slot(ubatch)) {
-                GGML_ABORT("TODO: handle this error");
-            }
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+
+            GGML_ABORT("TODO: handle this error");
         }

         auto * gf = graph_init();
@@ -2097,6 +2093,7 @@ llama_context_params llama_context_default_params() {
         /*.flash_attn   =*/ false,
         /*.no_perf      =*/ true,
         /*.op_offload   =*/ true,
+        /*.cross_attn   =*/ false,
     };

     return result;
@@ -2222,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
     ctx->set_warmup(warmup);
 }

+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+    ctx->set_cross_attn(cross_attention);
+}
+
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }
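[Note] Taken together, set_cross_attn and llama_set_cross_attention expose a per-context switch for the new cross-attention path. A rough usage sketch, assuming the public entry point added above and batches prepared elsewhere:

    // Sketch: decode an image-embedding batch with cross attention enabled,
    // then continue with text. img_batch/txt_batch are assumed to exist.
    llama_set_cross_attention(ctx, true);
    llama_decode(ctx, img_batch);   // fills the cross-attention K/V tensors
    llama_decode(ctx, txt_batch);   // text tokens may now attend to the image
    llama_set_cross_attention(ctx, false);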
llama/llama.cpp/src/llama-context.h (vendored, 1 change)
@@ -72,6 +72,7 @@ struct llama_context {
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
     void set_warmup(bool value);
+    void set_cross_attn(bool value);

     void set_adapter_lora(
             llama_adapter_lora * adapter,
llama/llama.cpp/src/llama-cparams.h (vendored, 1 change)
@@ -31,6 +31,7 @@ struct llama_cparams {
     bool no_perf;
     bool warmup;
     bool op_offload;
+    bool cross_attn;

     enum llama_pooling_type pooling_type;

llama/llama.cpp/src/llama-graph.cpp (vendored, 25 changes)
@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
         }
     }
 }

+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
+    if (ubatch->embd) {
+        ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
+    }
+}
+
 //
 // llm_graph_context
 //
@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }

+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
+    const int64_t n_embd = hparams.n_embd;
+
+    auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
+
+    ggml_tensor * cur = nullptr;
+
+    inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
+    ggml_set_input(inp->cross_attn_state);
+
+    cur = inp->cross_attn_state;
+
+    cb(cur, "inp_cross_attn_state", -1);
+
+    res->add_input(std::move(inp));
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_cross * inp,
         ggml_cgraph * gf,
llama/llama.cpp/src/llama-graph.h (vendored, 12 changes)
@@ -87,6 +87,7 @@ public:

     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
 };

 class llm_graph_input_pos : public llm_graph_input_i {
@@ -284,6 +285,16 @@ public:
     const llama_cross * cross = nullptr;
 };

+class llm_graph_input_cross_attn_state : public llm_graph_input_i {
+public:
+    llm_graph_input_cross_attn_state()          = default;
+    virtual ~llm_graph_input_cross_attn_state() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
+};
+
 //
 // llm_graph_result
 //
@@ -495,6 +506,7 @@ struct llm_graph_context {
     ggml_tensor * build_inp_cls() const;
     ggml_tensor * build_inp_s_copy() const;
     ggml_tensor * build_inp_s_mask() const;
+    ggml_tensor * build_inp_cross_attn_state() const;

     ggml_tensor * build_inp_cross_embd() const;
     ggml_tensor * build_inp_pos_bucket_enc() const;
llama/llama.cpp/src/llama-hparams.cpp (vendored, 4 changes)
@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {

     GGML_ABORT("fatal error");
 }
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
llama/llama.cpp/src/llama-hparams.h (vendored, 7 changes)
@@ -2,6 +2,8 @@

 #include "llama.h"

+#include <algorithm>
+
 #include <array>

 // bump if necessary
@@ -42,6 +44,7 @@ struct llama_hparams {
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
+    uint32_t n_vocab = 0;

     // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
     uint32_t n_embd_head_k_mla = 0;
@@ -56,6 +59,7 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

     std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+    std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;

     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
@@ -159,6 +163,9 @@ struct llama_hparams {
     // Block skip connection
     bool n_bskcn(uint32_t n, uint32_t il) const;

+    // cross attention layers
+    bool cross_attention_layers(uint32_t il) const;
+
     bool is_swa(uint32_t il) const;
 };

llama/llama.cpp/src/llama-kv-cache.cpp (vendored, 14 changes)
@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified(
             throw std::runtime_error("failed to create ggml context for kv cache");
         }

-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_tensor * k, *v;
+
+        // for cross attention layers
+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+            k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+            v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+        } else {
+            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        }
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         k_l.push_back(k);
@@ -451,7 +459,7 @@ void llama_kv_cache_unified::set_full() {
 llama_sbatch llama_kv_cache_unified::sbatch_init(
         const llama_batch & batch,
         bool logits_all) {
-    return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+    return llama_sbatch(batch, batch.n_embd, true, logits_all);
 }

 llama_ubatch llama_kv_cache_unified::ubatch_next(
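[Note] The fixed cross-attention K/V length of 6404 appears to correspond to the cross-attention state built in llama-graph.cpp: 1601 positions per tile x 4 tiles = 6404 rows of keys and values per cross-attention layer.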
llama/llama.cpp/src/llama-model-loader.cpp (vendored, 2 changes)
@@ -315,6 +315,8 @@ namespace GGUFMeta {
         return true;
     }

+    template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
+
     template<typename T, size_t N_MAX>
     bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
         const int kid = gguf_find_key(meta.get(), key.c_str());
llama/llama.cpp/src/llama-model.cpp (vendored, 309 changes)
@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE,        hparams.n_vocab,       false);

     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
+    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

-    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
         if (hparams.n_rot != hparams.n_embd_head_k) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
         }
@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.use_kq_norm = false;
                 }
             } break;
+        case LLM_ARCH_MLLAMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_11B; break;
+                    case 100: type = LLM_TYPE_90B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DECI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_ff          = hparams.n_ff();
     const int64_t n_embd_gqa    = n_embd_v_gqa;
-    const int64_t n_vocab       = vocab.n_tokens();
+    const int64_t n_vocab       = hparams.n_vocab;
     const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 }
             } break;
+        case LLM_ARCH_MLLAMA:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+                // output
+                {
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    if (hparams.cross_attention_layers(i)) {
+                        layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
+                        layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
+                        layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
+                        layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
+                        layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
+                        layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
+                        layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
+                        layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    } else {
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+                }
+            } break;
         case LLM_ARCH_DECI:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context {
     }
 };

+struct llm_build_mllama: public llm_graph_context {
+    llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+        ggml_tensor * inpCAS;
+
+        inpL = build_inp_embd(model.tok_embd);
+        inpCAS = build_inp_cross_attn_state();
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            if (hparams.cross_attention_layers(il)) {
+                if (!ubatch.embd && !cparams.cross_attn) {
+                    continue;
+                }
+
+                // cross attention layer
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
+                cb(Qcur, "Qcur", il);
+
+                Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur, * Vcur;
+                if (ubatch.embd) {
+                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
+                    cb(Kcur, "Kcur", il);
+
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
+
+                    Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
+                    cb(Vcur, "Vcur", il);
+
+                    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
+                    cb(Vcur, "Vcur", il);
+
+                    Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
+                    cb(Vcur, "Vcur", il);
+
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
+                } else {
+                    Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
+                    cb(Kcur, "Kcur (view)", il);
+
+                    Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
+                    cb(Vcur, "Vcur (view)", il);
+                }
+
+                struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
+                cb(kq, "kq", il);
+
+                // TODO: apply causal masks
+                struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+                cb(kq_soft_max, "kq_soft_max", il);
+
+                Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
+                cb(Vcur, "Vcur", il);
+
+                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
+                cb(kqv, "kqv", il);
+
+                struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
+
+                cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
+
+                cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
+                cb(cur, "cur", il);
+
+                // TODO: do this in place once?
+                cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+
+                // feed-forward network
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+                // TODO: do this inplace once?
+                cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
+                cb(cur, "ffn_out", il);
+
+                cur = build_cvec(cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            } else {
+                // self attention layer
+
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+                if (il == n_layer - 1) {
+                    // skip computing output for unused tokens
+                    struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                    n_tokens = n_outputs;
+                    cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                    inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                }
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+
+                // feed-forward network
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_inp);
+                cb(cur, "ffn_out", il);
+
+                cur = build_cvec(cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            }
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_deci : public llm_graph_context {
     llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph(
         {
             llm = std::make_unique<llm_build_llama>(*this, params, gf);
         } break;
+    case LLM_ARCH_MLLAMA:
+        {
+            llm = std::make_unique<llm_build_mllama>(*this, params, gf);
+        } break;
     case LLM_ARCH_DECI:
         {
             llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
+        case LLM_ARCH_MLLAMA:
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
llama/llama.cpp/src/llama-model.h (vendored, 12 changes)
@@ -11,6 +11,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <stdexcept>

 struct llama_cparams;
 struct llama_ubatch;
@@ -74,6 +75,7 @@ enum llm_type {
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
+    LLM_TYPE_90B,
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
@@ -318,6 +320,16 @@ struct llama_layer {

     struct ggml_tensor * bskcn_tv = nullptr;

+    // cross attention
+    struct ggml_tensor * cross_attn_k_norm = nullptr;
+    struct ggml_tensor * cross_attn_k_proj = nullptr;
+    struct ggml_tensor * cross_attn_o_proj = nullptr;
+    struct ggml_tensor * cross_attn_q_norm = nullptr;
+    struct ggml_tensor * cross_attn_q_proj = nullptr;
+    struct ggml_tensor * cross_attn_v_proj = nullptr;
+    struct ggml_tensor * cross_attn_attn_gate = nullptr;
+    struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
     struct llama_layer_posnet posnet;

     struct llama_layer_convnext convnext;
llama/llama.cpp/src/llama-quant.cpp (vendored, 4 changes)
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (llama_model_has_encoder(&model)) {
             n_attn_layer *= 3;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        if (qs.n_attention_wv != n_attn_layer) {
+            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+        }
     }

     size_t total_size_org = 0;
llama/llama.cpp/src/llama-vocab.cpp (vendored, 2 changes)
@@ -1469,6 +1469,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
     if (precompiled_charsmap_keyidx != -1) {
         const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+        GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
         const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
         const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
         precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
llama/llama.cpp/tools/mtmd/llava.cpp (vendored, 5 changes)
@@ -462,7 +462,7 @@ struct llava_embd_batch {
     std::vector<llama_seq_id *> seq_ids;
     std::vector<int8_t>         logits;
     llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
         pos     .resize(n_tokens);
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
@@ -474,6 +474,7 @@ struct llava_embd_batch {
             /*n_tokens =*/ n_tokens,
             /*tokens   =*/ nullptr,
             /*embd     =*/ embd,
+            /*n_embd   =*/ n_embd,
             /*pos      =*/ pos.data(),
             /*n_seq_id =*/ n_seq_id.data(),
             /*seq_id   =*/ seq_ids.data(),
@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
             n_eval = n_batch;
         }
         float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
         if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
@@ -17,6 +17,7 @@ package llama
 #include "llava.h"
 #include "gguf.h"

+#include "mllama.h"
 #include "sampling_ext.h"

 extern bool llamaProgressCallback(float progress, void *user_data);
@@ -509,6 +510,63 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
     return embed, nil
 }

+type MllamaContext struct {
+    c *C.struct_mllama_ctx
+}
+
+func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
+    mp := C.CString(modelPath)
+    defer C.free(unsafe.Pointer(mp))
+    c := C.mllama_model_load(mp, 1)
+    if c == nil {
+        return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
+    }
+
+    projEmbedSize := int(C.mllama_n_embd(c))
+    modelEmbedSize := llamaContext.Model().NEmbd()
+    if projEmbedSize != modelEmbedSize {
+        return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
+    }
+
+    return &MllamaContext{c: c}, nil
+}
+
+func (m *MllamaContext) Free() {
+    C.mllama_free(m.c)
+}
+
+func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
+    img := C.mllama_image_init()
+    defer C.mllama_image_free(img)
+
+    ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
+    if !ok {
+        return nil, errors.New("unable to load mllama image data")
+    }
+
+    rows := make([]float32, m.EmbedSize(llamaContext))
+    ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
+    if !ok {
+        return nil, errors.New("unable to make mllama embedding from image")
+    }
+
+    embed := make([][]float32, 1)
+    embed[0] = rows
+
+    return embed, nil
+}
+
+func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
+    numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
+    numEmbed := llamaContext.Model().NEmbd()
+
+    return numTokens * numEmbed
+}
+
+func (c *Context) SetCrossAttention(state bool) {
+    C.llama_set_cross_attention(c.c, C.bool(state))
+}
+
 func (c *Context) Synchronize() {
     C.llama_synchronize(c.c)
 }
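[Note] The Go bindings above are thin wrappers over the C API declared in the new mllama.h. A hedged C++ sketch of the same call sequence; signatures are approximated from the cgo calls above, and the file path, buffer names, and thread count are assumptions:

    // Sketch only; mirrors the calls made by the Go bindings in this branch.
    mllama_ctx * mctx = mllama_model_load("mmproj.gguf", /*verbosity=*/ 1);   // example path

    mllama_image * img = mllama_image_init();
    // data, len, and aspect_ratio_id are assumed to come from the caller
    mllama_image_load_from_data(data, len, 560, 560, 3, 4, aspect_ratio_id, img);

    // one embedding row per (position, tile) pair, each of width mllama_n_embd(mctx)
    std::vector<float> embd(mllama_n_positions(mctx) * mllama_n_tiles(mctx) * mllama_n_embd(mctx));
    mllama_image_encode(mctx, /*n_threads=*/ 4, img, embd.data());

    mllama_image_free(img);
    mllama_free(mctx);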
887
llama/mllama.cpp
vendored
Normal file
887
llama/mllama.cpp
vendored
Normal file
@ -0,0 +1,887 @@
|
|||||||
|
// NOTE: This is modified from clip.cpp for Mllama only
|
||||||
|
#include "mllama.h"
|
||||||
|
|
||||||
|
#include "ggml-alloc.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CANN
|
||||||
|
#include "ggml-cann.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdarg>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#define REQUIRE(x) \
|
||||||
|
do { \
|
||||||
|
if (!(x)) { \
|
||||||
|
throw std::runtime_error("REQUIRE failed: " #x); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
#define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#if __GLIBCXX__
|
||||||
|
#include <cstdio>
|
||||||
|
#include <ext/stdio_filebuf.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct mllama_image {
|
||||||
|
int width;
|
||||||
|
int height;
|
||||||
|
|
||||||
|
int num_channels = 3;
|
||||||
|
int num_tiles = 4;
|
||||||
|
|
||||||
|
int aspect_ratio_id;
|
||||||
|
|
||||||
|
std::vector<float> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::string format(const char *fmt, ...) {
|
||||||
|
va_list args;
|
||||||
|
va_start(args, fmt);
|
||||||
|
std::vector<char> b(128);
|
||||||
|
int n = vsnprintf(b.data(), b.size(), fmt, args);
|
||||||
|
REQUIRE(n >= 0 && n < b.size());
|
||||||
|
va_end(args);
|
||||||
|
return std::string(b.data(), b.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// utilities to get data from a gguf file
|
||||||
|
//
|
||||||
|
|
||||||
|
static int get_key_index(const gguf_context *ctx, const char *key) {
|
||||||
|
int key_index = gguf_find_key(ctx, key);
|
||||||
|
REQUIRE(key_index != -1);
|
||||||
|
return key_index;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
|
||||||
|
const int i = get_key_index(ctx, key.c_str());
|
||||||
|
const int n = gguf_get_arr_n(ctx, i);
|
||||||
|
const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i);
|
||||||
|
|
||||||
|
std::vector<uint32_t> s(n);
|
||||||
|
for (size_t j = 0; j < s.size(); j++) {
|
||||||
|
s[j] = data[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
|
||||||
|
return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
|
||||||
|
}
|
||||||
|
|
||||||
|
static float get_f32(const gguf_context *ctx, const std::string &key) {
|
||||||
|
return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string get_ftype(int ftype) {
|
||||||
|
return ggml_type_name(static_cast<ggml_type>(ftype));
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// mllama layers
|
||||||
|
//
|
||||||
|
|
||||||
|
struct mllama_hparams {
|
||||||
|
uint32_t image_size;
|
||||||
|
uint32_t patch_size;
|
||||||
|
uint32_t hidden_size;
|
||||||
|
uint32_t n_intermediate;
|
||||||
|
uint32_t projection_dim;
|
||||||
|
uint32_t n_head;
|
||||||
|
uint32_t n_layer;
|
||||||
|
uint32_t n_global_layer;
|
||||||
|
uint32_t n_tiles;
|
||||||
|
|
||||||
|
float eps;
|
||||||
|
|
||||||
|
std::vector<bool> intermediate_layers;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mllama_layer {
|
||||||
|
// attention
|
||||||
|
struct ggml_tensor *k_w;
|
||||||
|
struct ggml_tensor *k_b;
|
||||||
|
struct ggml_tensor *q_w;
|
||||||
|
struct ggml_tensor *q_b;
|
||||||
|
struct ggml_tensor *v_w;
|
||||||
|
struct ggml_tensor *v_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *o_w;
|
||||||
|
struct ggml_tensor *o_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *attn_gate;
|
||||||
|
|
||||||
|
// layernorm 1
|
||||||
|
struct ggml_tensor *ln_1_w;
|
||||||
|
struct ggml_tensor *ln_1_b;
|
||||||
|
|
||||||
|
// ff
|
||||||
|
struct ggml_tensor *ff_i_w;
|
||||||
|
struct ggml_tensor *ff_i_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *ff_o_w;
|
||||||
|
struct ggml_tensor *ff_o_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *ff_gate;
|
||||||
|
|
||||||
|
// layernorm 2
|
||||||
|
struct ggml_tensor *ln_2_w;
|
||||||
|
struct ggml_tensor *ln_2_b;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mllama_vision_model {
|
||||||
|
struct mllama_hparams hparams;
|
||||||
|
|
||||||
|
// embeddings
|
||||||
|
struct ggml_tensor *class_embedding;
|
||||||
|
struct ggml_tensor *patch_embeddings;
|
||||||
|
struct ggml_tensor *position_embeddings;
|
||||||
|
struct ggml_tensor *position_embeddings_gate;
|
||||||
|
struct ggml_tensor *tile_position_embeddings;
|
||||||
|
struct ggml_tensor *tile_position_embeddings_gate;
|
||||||
|
struct ggml_tensor *pre_tile_position_embeddings;
|
||||||
|
struct ggml_tensor *pre_tile_position_embeddings_gate;
|
||||||
|
struct ggml_tensor *post_tile_position_embeddings;
|
||||||
|
struct ggml_tensor *post_tile_position_embeddings_gate;
|
||||||
|
|
||||||
|
struct ggml_tensor *pre_ln_w;
|
||||||
|
struct ggml_tensor *pre_ln_b;
|
||||||
|
|
||||||
|
std::vector<mllama_layer> layers;
|
||||||
|
std::vector<mllama_layer> global_layers;
|
||||||
|
|
||||||
|
struct ggml_tensor *post_ln_w;
|
||||||
|
struct ggml_tensor *post_ln_b;
|
||||||
|
|
||||||
|
struct ggml_tensor *mm_0_w;
|
||||||
|
struct ggml_tensor *mm_0_b;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mllama_ctx {
|
||||||
|
struct mllama_vision_model vision_model;
|
||||||
|
|
||||||
|
uint32_t ftype = 1;
|
||||||
|
|
||||||
|
struct gguf_context *ctx_gguf;
|
||||||
|
struct ggml_context *ctx_data;
|
||||||
|
|
||||||
|
std::vector<uint8_t> buf_compute_meta;
|
||||||
|
|
||||||
|
// memory buffers to evaluate the model
|
||||||
|
ggml_backend_buffer_t params_buffer = nullptr;
|
||||||
|
|
||||||
|
ggml_backend_t backend = nullptr;
|
||||||
|
ggml_gallocr_t compute_alloc = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
static ggml_tensor *mllama_image_build_encoder_layer(
|
||||||
|
struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
|
||||||
|
const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
|
||||||
|
struct ggml_tensor *cur = embeddings;
|
||||||
|
|
||||||
|
{
|
||||||
|
// layernorm1
|
||||||
|
cur = ggml_norm(ctx0, cur, eps);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
|
||||||
|
ggml_set_name(cur, format("%d pre layernorm", il).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// self-attention
|
||||||
|
struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||||
|
if (layer.q_b != nullptr) {
|
||||||
|
Q = ggml_add(ctx0, Q, layer.q_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
|
||||||
|
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
||||||
|
ggml_set_name(Q, format("%d query", il).c_str());
|
||||||
|
|
||||||
|
struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||||
|
if (layer.k_b != nullptr) {
|
||||||
|
K = ggml_add(ctx0, K, layer.k_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
|
||||||
|
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||||
|
ggml_set_name(K, format("%d key", il).c_str());
|
||||||
|
|
||||||
|
struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||||
|
if (layer.v_b != nullptr) {
|
||||||
|
V = ggml_add(ctx0, V, layer.v_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
|
||||||
|
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||||
|
ggml_set_name(V, format("%d value", il).c_str());
|
||||||
|
|
||||||
|
struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
|
||||||
|
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
||||||
|
ggml_set_name(KQ, format("%d KQ", il).c_str());
|
||||||
|
|
||||||
|
struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
|
||||||
|
KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
|
||||||
|
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
|
||||||
|
ggml_set_name(KQV, format("%d KQV", il).c_str());
|
||||||
|
|
||||||
|
cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
|
||||||
|
if (layer.o_b != nullptr) {
|
||||||
|
cur = ggml_add(ctx0, cur, layer.o_b);
|
||||||
|
}
|
||||||
|
ggml_set_name(cur, format("%d self attention", il).c_str());
|
||||||
|
|
||||||
|
if (layer.attn_gate != nullptr) {
|
||||||
|
cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
|
||||||
|
ggml_set_name(cur, format("%d self attention gate", il).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur, embeddings);
|
||||||
|
ggml_set_name(cur, format("%d residual", il).c_str());
|
||||||
|
|
||||||
|
embeddings = cur;
|
||||||
|
|
||||||
|
{
|
||||||
|
// layernorm2
|
||||||
|
cur = ggml_norm(ctx0, cur, eps);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
|
||||||
|
ggml_set_name(cur, format("%d post layernorm", il).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// feed forward
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
|
||||||
|
cur = ggml_gelu_inplace(ctx0, cur);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
|
||||||
|
ggml_set_name(cur, format("%d feed forward", il).c_str());
|
||||||
|
|
||||||
|
if (layer.ff_gate != nullptr) {
|
||||||
|
cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
|
||||||
|
ggml_set_name(cur, format("%d feed forward gate", il).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// residual 2
|
||||||
|
cur = ggml_add(ctx0, cur, embeddings);
|
||||||
|
ggml_set_name(cur, format("%d residual", il).c_str());
|
||||||
|
|
||||||
|
embeddings = cur;
|
||||||
|
|
||||||
|
return embeddings;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
    const auto &model = ctx->vision_model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    const int image_size_width = image_size;
    const int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
    const int hidden_size = hparams.hidden_size;
    const int n_head = hparams.n_head;
    const int d_head = hidden_size / n_head;

    const int batch_size = imgs->size;
    REQUIRE(batch_size == 1);

    int num_tiles = 4;
    int num_channels = 3;
    if (imgs->data != nullptr) {
        num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
        num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
    }

    struct ggml_init_params params = {
        ctx->buf_compute_meta.size(), // mem_size
        ctx->buf_compute_meta.data(), // mem_buffer
        true,                         // no_alloc
    };

    struct ggml_context *ctx0 = ggml_init(params);
    struct ggml_cgraph *gf = ggml_new_graph(ctx0);

    struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);

    struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

    struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
    ggml_set_name(aspect_ratios, "aspect_ratios");
    ggml_set_input(aspect_ratios);

    if (model.pre_tile_position_embeddings != nullptr) {
        struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
        ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");

        pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
        if (model.pre_tile_position_embeddings_gate != nullptr) {
            pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
        }

        inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
    }

    struct ggml_tensor *embeddings = inp;

    if (model.class_embedding != nullptr) {
        // concat class_embeddings and patch_embeddings
        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
        ggml_set_name(embeddings, "embeddings");
        ggml_set_input(embeddings);
        for (int i = 0; i < num_tiles; ++i) {
            // repeat class embeddings for each tile
            embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
        }

        embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
    }

    struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
    if (model.position_embeddings_gate != nullptr) {
        position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
    }

    embeddings = ggml_add(ctx0, embeddings, position_embd);

    if (model.tile_position_embeddings != nullptr) {
        struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
        ggml_set_name(tile_position_embeddings, "tile_position_embeddings");

        tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
        if (model.tile_position_embeddings_gate != nullptr) {
            tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
    }

    // pre-layernorm
    if (model.pre_ln_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
        if (model.pre_ln_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
        }

        ggml_set_name(embeddings, "pre layernorm");
    }

    const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8;

    embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);

    std::vector<struct ggml_tensor *> intermediate_embeddings;

    // encoder
    for (size_t il = 0; il < model.layers.size(); il++) {
        if (hparams.intermediate_layers[il]) {
            intermediate_embeddings.push_back(embeddings);
        }

        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
    }

    // post-layernorm
    if (model.post_ln_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
        if (model.post_ln_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
        }

        ggml_set_name(embeddings, "post layernorm");
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);

    if (model.post_tile_position_embeddings != nullptr) {
        struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
        ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");

        post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
        if (model.post_tile_position_embeddings_gate != nullptr) {
            post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);

    // global encoder
    for (size_t il = 0; il < model.global_layers.size(); il++) {
        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.global_layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
    }

    struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
    for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
        stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
    }

    stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
    stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
    embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);

    // mllama projector
    embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
    ggml_set_name(embeddings, "multi modal projector");

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    ggml_free(ctx0);

    return gf;
}

static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
    struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
    REQUIRE(cur != nullptr || optional);
    return cur;
}

static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
    std::vector<struct mllama_layer> layers(n);
    for (size_t i = 0; i < layers.size(); i++) {
        auto &layer = layers[i];
        layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false);
        layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false);
        layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false);
        layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false);

        layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false);
        layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true);
        layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false);
        layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true);
        layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false);
        layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true);
        layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false);
        layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true);

        layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false);
        layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false);
        layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false);
        layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false);

        layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true);
        layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true);
    }

    return layers;
}

// read and create ggml_context containing the tensors and their data
struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
    struct ggml_context *meta = nullptr;

    struct gguf_init_params params = {
        true,  // no_alloc
        &meta, // ctx
    };

    struct gguf_context *ctx = gguf_init_from_file(fname, params);
    REQUIRE(ctx != nullptr);

    if (verbosity >= 1) {
        const int n_tensors = gguf_get_n_tensors(ctx);
        const int n_kv = gguf_get_n_kv(ctx);
        const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
        const int idx_desc = get_key_index(ctx, "general.description");
        const std::string description = gguf_get_val_str(ctx, idx_desc);
        const int idx_name = gguf_find_key(ctx, "general.name");
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
            LOG("model name: %s", name.c_str());
        }
        LOG("description: %s", description.c_str());
        LOG("GGUF version: %d", gguf_get_version(ctx));
        LOG("alignment: %zu", gguf_get_alignment(ctx));
        LOG("n_tensors: %d", n_tensors);
        LOG("n_kv: %d", n_kv);
        LOG("ftype: %s", ftype.c_str());
        LOG("");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);

    mllama_ctx *new_mllama = new mllama_ctx{};

    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == nullptr) {
        LOG("%s: failed to initialize backend\n", __func__);
        mllama_free(new_mllama);
        gguf_free(ctx);
        return nullptr;
    }
    LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
    new_mllama->backend = backend;

    // load tensors
    {
        std::vector<uint8_t> read_buf;
        struct ggml_init_params params = {
            (n_tensors + 1) * ggml_tensor_overhead(), // mem_size
            nullptr,                                  // mem_buffer
            true,                                     // no_alloc
        };

        new_mllama->ctx_data = ggml_init(params);
        if (!new_mllama->ctx_data) {
            LOG("ggml_init() failed");
            mllama_free(new_mllama);
            gguf_free(ctx);
            return nullptr;
        }

#ifdef _WIN32
        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
        if (!wlen) {
            return NULL;
        }
        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
        if (!wlen) {
            free(wbuf);
            return NULL;
        }
#if __GLIBCXX__
        int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
        __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
        std::istream fin(&buffer);
#else // MSVC
        // unused in our current build
        auto fin = std::ifstream(wbuf, std::ios::binary);
#endif
        free(wbuf);
#else
        auto fin = std::ifstream(fname, std::ios::binary);
#endif
        if (!fin) {
            LOG("cannot open model file for loading tensors\n");
            mllama_free(new_mllama);
            gguf_free(ctx);
            return nullptr;
        }

        // add tensors to context
        for (int i = 0; i < n_tensors; ++i) {
            const char *name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor *t = ggml_get_tensor(meta, name);
            struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
            ggml_set_name(cur, name);
        }

        // alloc memory and offload data
        new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
        for (int i = 0; i < n_tensors; ++i) {
            const char *name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
                LOG("failed to seek for tensor %s\n", name);
                mllama_free(new_mllama);
                gguf_free(ctx);
                return nullptr;
            }
            int num_bytes = ggml_nbytes(cur);
            if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
                // for the CPU and Metal backend, we can read directly into the tensor
                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
            } else {
                // read into a temporary buffer first, then copy to device memory
                read_buf.resize(num_bytes);
                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
            }
        }

#if defined(_WIN32) && defined(__GLIBCXX__)
        close(fd);
#else
        fin.close();
#endif
    }

    // vision model
    // load vision model
    auto &vision_model = new_mllama->vision_model;
    auto &hparams = vision_model.hparams;
    hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
    hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
    hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
    hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
    hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
    hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
    hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
    hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
    hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
    hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");

    std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
    hparams.intermediate_layers.resize(hparams.n_layer);
    for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
        hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
    }

    if (verbosity >= 2) {
        LOG("");
        LOG("vision model hparams");
        LOG("image_size %d", hparams.image_size);
        LOG("patch_size %d", hparams.patch_size);
        LOG("v_hidden_size %d", hparams.hidden_size);
        LOG("v_n_intermediate %d", hparams.n_intermediate);
        LOG("v_projection_dim %d", hparams.projection_dim);
        LOG("v_n_head %d", hparams.n_head);
        LOG("v_n_layer %d", hparams.n_layer);
        LOG("v_n_global_layer %d", hparams.n_global_layer);
        LOG("v_eps %f", hparams.eps);
    }

    vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
    vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);

    vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
    vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);

    vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
    vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
    vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
    vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);

    vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
    vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);

    vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
    vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);

    vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
    vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);

    vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
    vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);

    vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
    vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);

    ggml_free(meta);

    new_mllama->ctx_gguf = ctx;

    {
        // measure mem requirement and allocate
        new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
        new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
        struct mllama_image_batch batch;
        batch.size = 1;
        ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
        ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
        LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
    }

    return new_mllama;
}

struct mllama_image *mllama_image_init() {
    return new mllama_image();
}

void mllama_image_free(struct mllama_image *img) { delete img; }
void mllama_image_batch_free(struct mllama_image_batch *batch) {
    if (batch->size > 0) {
        delete[] batch->data;
        batch->size = 0;
    }
}

bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
    img->width = width;
    img->height = height;
    img->num_channels = num_channels;
    img->num_tiles = num_tiles;
    img->aspect_ratio_id = aspect_ratio_id;
    img->data.resize(n);

    memcpy(img->data.data(), data, n);
    return true;
}

inline int mllama(int x, int lower, int upper) {
    return std::max(lower, std::min(x, upper));
}

void mllama_free(mllama_ctx *ctx) {
    ggml_free(ctx->ctx_data);
    gguf_free(ctx->ctx_gguf);

    ggml_backend_buffer_free(ctx->params_buffer);
    ggml_backend_free(ctx->backend);
    ggml_gallocr_free(ctx->compute_alloc);
    delete ctx;
}

bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
    mllama_image_batch imgs{};
    imgs.size = 1;
    imgs.data = img;
    return mllama_image_batch_encode(ctx, n_threads, &imgs, vec);
}

bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
    int batch_size = imgs->size;
    REQUIRE(batch_size == 1);

    // build the inference graph
    ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);

    // set inputs
    const auto &model = ctx->vision_model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    int image_size_width = image_size;
    int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);

    {
        struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
        ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
    }

    {
        struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
        if (embeddings != nullptr) {
            void *zeros = malloc(ggml_nbytes(embeddings));
            memset(zeros, 0, ggml_nbytes(embeddings));
            ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
            free(zeros);
        }
    }

    {
        struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
        if (positions != nullptr) {
            int *positions_data = (int *)malloc(ggml_nbytes(positions));
            for (int i = 0; i < num_positions; i++) {
                positions_data[i] = i;
            }
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
        }
    }

    {
        struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
        if (aspect_ratios != nullptr) {
            int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
            aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
            ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
            free(aspect_ratios_data);
        }
    }

    if (ggml_backend_is_cpu(ctx->backend)) {
        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
    }

    ggml_backend_graph_compute(ctx->backend, gf);

    // the last node is the embedding tensor
    struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);

    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

    return true;
}

int32_t mllama_image_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.image_size;
}

int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.patch_size;
}

int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.hidden_size;
}

int mllama_n_patches(const struct mllama_ctx *ctx) {
    const auto &hparams = ctx->vision_model.hparams;
    return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
}

int mllama_n_positions(const struct mllama_ctx *ctx) {
    return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1);
}

int mllama_n_tiles(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.n_tiles;
}

int mllama_n_embd(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.projection_dim;
}

size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
    return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float);
}
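
For orientation, the following is a minimal sketch of how these entry points fit together: load a model, wrap preprocessed pixel data in an mllama_image, and encode it into a buffer sized with mllama_n_embd_bytes(). The GGUF path, thread count, tile count, and aspect ratio id below are placeholders rather than values taken from this change, and the pixel buffer is assumed to already be tiled and normalized by the caller.

#include <vector>
#include "mllama.h"

// Hypothetical usage sketch; the file name and image parameters are assumptions,
// and the pixel buffer is expected to already hold [width, height, channels, tiles]
// float data produced by an external preprocessing step.
int main() {
    struct mllama_ctx *ctx = mllama_model_load("mllama-vision.gguf", 1);
    if (ctx == nullptr) {
        return 1;
    }

    const int image_size   = mllama_image_size(ctx);
    const int num_channels = 3;
    const int num_tiles    = 4;

    std::vector<float> pixels(image_size * image_size * num_channels * num_tiles, 0.0f);

    struct mllama_image *img = mllama_image_init();
    mllama_image_load_from_data(pixels.data(), pixels.size() * sizeof(float),
                                image_size, image_size, num_channels, num_tiles,
                                /*aspect_ratio_id=*/1, img);

    // positions * projection_dim * tiles floats, as reported by mllama_n_embd_bytes()
    std::vector<float> embd(mllama_n_embd_bytes(ctx) / sizeof(float));
    mllama_image_encode(ctx, /*n_threads=*/4, img, embd.data());

    mllama_image_free(img);
    mllama_free(ctx);
    return 0;
}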
61
llama/mllama.h
vendored
Normal file
@ -0,0 +1,61 @@
#ifndef MLLAMA_H
#define MLLAMA_H

#include <stddef.h>
#include <stdint.h>

#ifdef LLAMA_SHARED
#if defined(_WIN32) && !defined(__MINGW32__)
#ifdef LLAMA_BUILD
#define MLLAMA_API __declspec(dllexport)
#else
#define MLLAMA_API __declspec(dllimport)
#endif
#else
#define MLLAMA_API __attribute__((visibility("default")))
#endif
#else
#define MLLAMA_API
#endif

#ifdef __cplusplus
extern "C" {
#endif

struct mllama_ctx;

struct mllama_image_batch {
    struct mllama_image *data;
    size_t size;
};

MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity);
MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity);

MLLAMA_API void mllama_free(struct mllama_ctx *ctx);

MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx);

MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx);
MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx);

MLLAMA_API struct mllama_image *mllama_image_init();

MLLAMA_API void mllama_image_free(struct mllama_image *img);
MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch);

MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img);

MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec);
MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec);

#ifdef __cplusplus
}
#endif

#endif // MLLAMA_H
@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
1027
llama/patches/0006-add-mllama-support.patch
Normal file
File diff suppressed because it is too large
419
llama/patches/0007-add-unpad-operator.patch
Normal file
@ -0,0 +1,419 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: jmorganca <jmorganca@gmail.com>
|
||||||
|
Date: Sun, 13 Apr 2025 22:10:06 -0400
|
||||||
|
Subject: [PATCH] add unpad operator
|
||||||
|
|
||||||
|
adds the unpad operator to GGML
|
||||||
|
---
|
||||||
|
ggml/include/ggml.h | 10 +++++
|
||||||
|
ggml/src/ggml-cpu/ggml-cpu.c | 5 +++
|
||||||
|
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++
|
||||||
|
ggml/src/ggml-cpu/ops.h | 1 +
|
||||||
|
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
|
||||||
|
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++
|
||||||
|
ggml/src/ggml-cuda/pad.cuh | 1 +
|
||||||
|
ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++
|
||||||
|
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
|
||||||
|
ggml/src/ggml.c | 25 ++++++++++++-
|
||||||
|
10 files changed, 223 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||||
|
index e91dedf1..8dc107ba 100644
|
||||||
|
--- a/ggml/include/ggml.h
|
||||||
|
+++ b/ggml/include/ggml.h
|
||||||
|
@@ -489,6 +489,7 @@ extern "C" {
|
||||||
|
GGML_OP_UPSCALE, // nearest interpolate
|
||||||
|
GGML_OP_PAD,
|
||||||
|
GGML_OP_PAD_REFLECT_1D,
|
||||||
|
+ GGML_OP_UNPAD,
|
||||||
|
GGML_OP_ARANGE,
|
||||||
|
GGML_OP_TIMESTEP_EMBEDDING,
|
||||||
|
GGML_OP_ARGSORT,
|
||||||
|
@@ -1781,6 +1782,15 @@ extern "C" {
|
||||||
|
int p0,
|
||||||
|
int p1);
|
||||||
|
|
||||||
|
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
|
||||||
|
+ GGML_API struct ggml_tensor * ggml_unpad(
|
||||||
|
+ struct ggml_context * ctx,
|
||||||
|
+ struct ggml_tensor * a,
|
||||||
|
+ int p0,
|
||||||
|
+ int p1,
|
||||||
|
+ int p2,
|
||||||
|
+ int p3);
|
||||||
|
+
|
||||||
|
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
||||||
|
// timesteps: [N,]
|
||||||
|
// return: [N, dim]
|
||||||
|
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
|
index a30e67f2..835e6495 100644
|
||||||
|
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
|
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
|
@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
|
{
|
||||||
|
ggml_compute_forward_pad_reflect_1d(params, tensor);
|
||||||
|
} break;
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
+ {
|
||||||
|
+ ggml_compute_forward_unpad(params, tensor);
|
||||||
|
+ } break;
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_arange(params, tensor);
|
||||||
|
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
|
case GGML_OP_UPSCALE:
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
case GGML_OP_PAD_REFLECT_1D:
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_ARGSORT:
|
||||||
|
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||||
|
index 955fec59..1868a10c 100644
|
||||||
|
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||||
|
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||||
|
@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
+// ggml_compute_forward_unpad
|
||||||
|
+
|
||||||
|
+static void ggml_compute_forward_unpad_f32(
|
||||||
|
+ const struct ggml_compute_params *params,
|
||||||
|
+ struct ggml_tensor *dst) {
|
||||||
|
+
|
||||||
|
+ const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
+
|
||||||
|
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||||
|
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
|
||||||
|
+
|
||||||
|
+ const int ith = params->ith;
|
||||||
|
+ const int nth = params->nth;
|
||||||
|
+
|
||||||
|
+ GGML_TENSOR_UNARY_OP_LOCALS
|
||||||
|
+
|
||||||
|
+ float * dst_ptr = (float *) dst->data;
|
||||||
|
+
|
||||||
|
+ // TODO: optimize
|
||||||
|
+
|
||||||
|
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
||||||
|
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
|
||||||
|
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
||||||
|
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
||||||
|
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
|
||||||
|
+
|
||||||
|
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
|
+
|
||||||
|
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
+ dst_ptr[dst_idx] = *src_ptr;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void ggml_compute_forward_unpad(
|
||||||
|
+ const struct ggml_compute_params * params,
|
||||||
|
+ struct ggml_tensor * dst) {
|
||||||
|
+
|
||||||
|
+ const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
+
|
||||||
|
+ switch (src0->type) {
|
||||||
|
+ case GGML_TYPE_F32:
|
||||||
|
+ {
|
||||||
|
+ ggml_compute_forward_unpad_f32(params, dst);
|
||||||
|
+ } break;
|
||||||
|
+ default:
|
||||||
|
+ {
|
||||||
|
+ GGML_ABORT("fatal error");
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
// ggml_compute_forward_arange
|
||||||
|
|
||||||
|
static void ggml_compute_forward_arange_f32(
|
||||||
|
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
|
||||||
|
index dc081b9e..a7125555 100644
|
||||||
|
--- a/ggml/src/ggml-cpu/ops.h
|
||||||
|
+++ b/ggml/src/ggml-cpu/ops.h
|
||||||
|
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
|
||||||
|
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||||
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
|
index cb0d8528..6fe86674 100644
|
||||||
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
|
@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
ggml_cuda_op_pad(ctx, dst);
|
||||||
|
break;
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
+ ggml_cuda_op_unpad(ctx, dst);
|
||||||
|
+ break;
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
ggml_cuda_op_arange(ctx, dst);
|
||||||
|
break;
|
||||||
|
@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||||
|
case GGML_OP_UPSCALE:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_LEAKY_RELU:
|
||||||
|
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
|
||||||
|
index 77432b04..7d45a7e1 100644
|
||||||
|
--- a/ggml/src/ggml-cuda/pad.cu
|
||||||
|
+++ b/ggml/src/ggml-cuda/pad.cu
|
||||||
|
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||||
|
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
|
||||||
|
+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
|
||||||
|
+ // blockIdx.y: idx of ne1
|
||||||
|
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
|
||||||
|
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
|
+ if (nidx >= ne0) {
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // operation
|
||||||
|
+ int offset_dst =
|
||||||
|
+ nidx +
|
||||||
|
+ blockIdx.y * ne0 +
|
||||||
|
+ blockIdx.z * ne0 * gridDim.y;
|
||||||
|
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
|
||||||
|
+ int offset_src =
|
||||||
|
+ nidx +
|
||||||
|
+ blockIdx.y * ne00 +
|
||||||
|
+ blockIdx.z * ne00 * ne01;
|
||||||
|
+ dst[offset_dst] = x[offset_src];
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static void unpad_f32_cuda(const float * x, float * dst,
|
||||||
|
+ const int ne00, const int ne01, const int ne02, const int ne03,
|
||||||
|
+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
|
||||||
|
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
|
||||||
|
+ dim3 gridDim(num_blocks, ne1, ne2*ne3);
|
||||||
|
+ unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
+ const ggml_tensor * src0 = dst->src[0];
|
||||||
|
+ const float * src0_d = (const float *)src0->data;
|
||||||
|
+ float * dst_d = (float *)dst->data;
|
||||||
|
+ cudaStream_t stream = ctx.stream();
|
||||||
|
+
|
||||||
|
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||||
|
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||||
|
+
|
||||||
|
+ unpad_f32_cuda(src0_d, dst_d,
|
||||||
|
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||||
|
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
|
||||||
|
+}
|
||||||
|
\ No newline at end of file
|
||||||
|
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
|
||||||
|
index 8fd386b0..e2ededc3 100644
|
||||||
|
--- a/ggml/src/ggml-cuda/pad.cuh
|
||||||
|
+++ b/ggml/src/ggml-cuda/pad.cuh
|
||||||
|
@@ -3,3 +3,4 @@
|
||||||
|
#define CUDA_PAD_BLOCK_SIZE 256
|
||||||
|
|
||||||
|
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||||
|
index 1b56f858..7641247e 100644
|
||||||
|
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||||
|
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||||
|
@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||||
|
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
|
||||||
|
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||||
|
@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
||||||
|
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||||
|
@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
case GGML_OP_PAD_REFLECT_1D:
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_ARGSORT:
|
||||||
|
case GGML_OP_LEAKY_RELU:
|
||||||
|
@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
|
||||||
|
|
||||||
|
const int nth = MIN(1024, ne0);
|
||||||
|
|
||||||
|
+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
|
+ } break;
|
||||||
|
+ case GGML_OP_UNPAD:
|
||||||
|
+ {
|
||||||
|
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
+
|
||||||
|
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
|
||||||
|
+
|
||||||
|
+ [encoder setComputePipelineState:pipeline];
|
||||||
|
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
|
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||||
|
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||||
|
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||||
|
+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||||
|
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||||
|
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||||
|
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||||
|
+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||||
|
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||||
|
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||||
|
+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||||
|
+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||||
|
+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||||
|
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||||
|
+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||||
|
+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||||
|
+
|
||||||
|
+ const int nth = MIN(1024, ne0);
|
||||||
|
+
|
||||||
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
|
} break;
|
||||||
|
case GGML_OP_ARANGE:
|
||||||
|
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
index 9cfddf45..080a943b 100644
|
||||||
|
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
+kernel void kernel_unpad_f32(
|
||||||
|
+ device const char * src0,
|
||||||
|
+ device char * dst,
|
||||||
|
+ constant int64_t & ne00,
|
||||||
|
+ constant int64_t & ne01,
|
||||||
|
+ constant int64_t & ne02,
|
||||||
|
+ constant int64_t & ne03,
|
||||||
|
+ constant uint64_t & nb00,
|
||||||
|
+ constant uint64_t & nb01,
|
||||||
|
+ constant uint64_t & nb02,
|
||||||
|
+ constant uint64_t & nb03,
|
||||||
|
+ constant int64_t & ne0,
|
||||||
|
+ constant int64_t & ne1,
|
||||||
|
+ constant int64_t & ne2,
|
||||||
|
+ constant int64_t & ne3,
|
||||||
|
+ constant uint64_t & nb0,
|
||||||
|
+ constant uint64_t & nb1,
|
||||||
|
+ constant uint64_t & nb2,
|
||||||
|
+ constant uint64_t & nb3,
|
||||||
|
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
+ uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
+ uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
+
|
||||||
|
+ const int64_t i3 = tgpig.z;
|
||||||
|
+ const int64_t i2 = tgpig.y;
|
||||||
|
+ const int64_t i1 = tgpig.x;
|
||||||
|
+
|
||||||
|
+ const int64_t i03 = i3;
|
||||||
|
+ const int64_t i02 = i2;
|
||||||
|
+ const int64_t i01 = i1;
|
||||||
|
+
|
||||||
|
+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||||
|
+
|
||||||
|
+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||||
|
+ if (i0 < ne00) {
|
||||||
|
+ dst_ptr[i0] = src0_ptr[i0];
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
kernel void kernel_arange_f32(
|
||||||
|
device char * dst,
|
||||||
|
constant ggml_metal_kargs_arange & args,
|
||||||
|
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||||
|
index 8a654624..6b034d35 100644
|
||||||
|
--- a/ggml/src/ggml.c
|
||||||
|
+++ b/ggml/src/ggml.c
|
||||||
|
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
|
"UPSCALE",
|
||||||
|
"PAD",
|
||||||
|
"PAD_REFLECT_1D",
|
||||||
|
+ "UNPAD",
|
||||||
|
"ARANGE",
|
||||||
|
"TIMESTEP_EMBEDDING",
|
||||||
|
"ARGSORT",
|
||||||
|
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
|
"OPT_STEP_ADAMW",
|
||||||
|
};
|
||||||
|
|
||||||
|
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||||
|
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||||
|
|
||||||
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
|
"none",
|
||||||
|
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
|
"upscale(x)",
|
||||||
|
"pad(x)",
|
||||||
|
"pad_reflect_1d(x)",
|
||||||
|
+ "unpad(x)",
|
||||||
|
"arange(start, stop, step)",
|
||||||
|
"timestep_embedding(timesteps, dim, max_period)",
|
||||||
|
"argsort(x)",
|
||||||
|
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
|
"adamw(x)",
|
||||||
|
};
|
||||||
|
|
||||||
|
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||||
|
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||||
|
|
||||||
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||||
|
|
||||||
|
@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
+// ggml_unpad
|
||||||
|
+
|
||||||
|
+struct ggml_tensor * ggml_unpad(
|
||||||
|
+ struct ggml_context * ctx,
|
||||||
|
+ struct ggml_tensor * a,
|
||||||
|
+ int p0, int p1, int p2, int p3) {
|
||||||
|
+
|
||||||
|
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
|
||||||
|
+ a->ne[0] - p0,
|
||||||
|
+ a->ne[1] - p1,
|
||||||
|
+ a->ne[2] - p2,
|
||||||
|
+ a->ne[3] - p3);
|
||||||
|
+
|
||||||
|
+ result->op = GGML_OP_UNPAD;
|
||||||
|
+ result->src[0] = a;
|
||||||
|
+
|
||||||
|
+ return result;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
// ggml_arange
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_arange(
|
@ -15,50 +15,13 @@ but this can leave a cache that still does not have adequate space
|
|||||||
even after defragmentation is triggered. Instead, we should do
|
even after defragmentation is triggered. Instead, we should do
|
||||||
multiple batches of processing until everything is complete.
|
multiple batches of processing until everything is complete.
|
||||||
---
|
---
|
||||||
src/llama-context.cpp | 18 ++++---
|
|
||||||
src/llama-context.h | 1 +
|
src/llama-context.h | 1 +
|
||||||
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
|
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
|
||||||
src/llama-kv-cache.h | 12 ++++-
|
src/llama-kv-cache.h | 12 ++++-
|
||||||
4 files changed, 59 insertions(+), 79 deletions(-)
|
3 files changed, 47 insertions(+), 73 deletions(-)
|
||||||
|
|
||||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
|
||||||
index c22687e4..c5948e8f 100644
|
|
||||||
--- a/src/llama-context.cpp
|
|
||||||
+++ b/src/llama-context.cpp
|
|
||||||
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|
||||||
|
|
||||||
// find KV slot
|
|
||||||
if (!kv_self->find_slot(ubatch)) {
|
|
||||||
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
|
||||||
-
|
|
||||||
- return 1;
|
|
||||||
+ kv_self->defrag_sched(-1.0f);
|
|
||||||
+ kv_self->update(*this);
|
|
||||||
+ if (!kv_self->find_slot(ubatch)) {
|
|
||||||
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_sched_reset(sched.get());
|
|
||||||
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
|
|
||||||
|
|
||||||
// TODO: not sure if this is needed
|
|
||||||
if (!kv_self->find_slot(ubatch)) {
|
|
||||||
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
|
||||||
-
|
|
||||||
- GGML_ABORT("TODO: handle this error");
|
|
||||||
+ kv_self->defrag_sched(-1.0f);
|
|
||||||
+ kv_self->update(*this);
|
|
||||||
+ if (!kv_self->find_slot(ubatch)) {
|
|
||||||
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
|
||||||
+ GGML_ABORT("TODO: handle this error");
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
|
|
||||||
auto * gf = graph_init();
|
|
||||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||||
index c0ceacb1..0264e937 100644
|
index c4ab242a..9970dfc6 100644
|
||||||
--- a/src/llama-context.h
|
--- a/src/llama-context.h
|
||||||
+++ b/src/llama-context.h
|
+++ b/src/llama-context.h
|
||||||
@@ -5,6 +5,7 @@
|
@@ -5,6 +5,7 @@
|
||||||
@ -70,10 +33,10 @@ index c0ceacb1..0264e937 100644
|
|||||||
#include "ggml-cpp.h"
|
#include "ggml-cpp.h"
|
||||||
#include "ggml-opt.h"
|
#include "ggml-opt.h"
|
||||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 3dcad65b..60e67b03 100644
index a7b0a7eb..1a50c034 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() {
@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() {
}

bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -82,7 +45,7 @@ index 3dcad65b..60e67b03 100644
auto * sched = lctx.get_sched();

if (has_shift) {
@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
res->set_inputs(nullptr);

lctx.graph_compute(gf, false);
@@ -91,7 +54,7 @@ index 3dcad65b..60e67b03 100644
}

{
@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {

if (do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@@ -133,7 +96,7 @@ index 3dcad65b..60e67b03 100644
}

void llama_kv_cache_unified::defrag_sched(float thold) {
@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
@@ -147,7 +110,7 @@ index 3dcad65b..60e67b03 100644
#if 0
// CPU defrag
//
@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
@@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
@@ -185,7 +148,7 @@ index 3dcad65b..60e67b03 100644

ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx, v_l[il],
@@ -225,7 +188,7 @@ index 3dcad65b..60e67b03 100644
}

//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {

assert(n_used <= n_kv);

@@ -244,7 +207,7 @@ index 3dcad65b..60e67b03 100644

// determine which KV cells to move where
//
@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
@@ -256,7 +219,7 @@ index 3dcad65b..60e67b03 100644

for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;

@@ -276,7 +239,7 @@ index 3dcad65b..60e67b03 100644
cont = false;
continue;
}
@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;

if (!cont) {
@@ -288,7 +251,7 @@ index 3dcad65b..60e67b03 100644
}

nf++;
@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
}
}
@@ -11,7 +11,7 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..4e67d243 100644
index 82ae1b5b..1487f322 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {
@@ -9,8 +9,8 @@ such as vocab fields
---
ggml/include/gguf.h | 1 +
ggml/src/gguf.cpp | 7 +++++--
src/llama-vocab.cpp | 4 +---
src/llama-vocab.cpp | 2 +-
3 files changed, 7 insertions(+), 5 deletions(-)
3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 79ee2020..3efb22f0 100644
@@ -53,15 +53,13 @@ index 381a9c7d..e45b453d 100644
}

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 10f34d33..9f5fd57b 100644
index 10f34d33..b098bb25 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
- GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
-
- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a30e67f2..2462d2b8 100644
index 835e6495..3902894b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index a30e67f2..2462d2b8 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

ggml_compute_forward(&params, node);
3 llama/sampling_ext.cpp vendored
@@ -114,9 +114,6 @@ void grammar_free(struct llama_grammar *g) {
if (g->vocab != nullptr) {
delete g->vocab;
}
- if (g->o_vocab != nullptr) {
- delete g->o_vocab;
- }
llama_grammar_free_impl(g);
}
}
@@ -1,12 +1,9 @@
package llm

import (
-"cmp"
"fmt"
"log/slog"
-"maps"
"os"
-"slices"
"strconv"
"strings"

@@ -111,8 +108,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

for _, projector := range projectors {
-weight := projectorMemoryRequirements(projector)
+weight, graph := projectorMemoryRequirements(projector)
projectorWeights += weight
+projectorGraph += graph

// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
@@ -122,10 +120,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}

layers := f.Tensors().GroupLayers()
-// add one layer (chosing the max layer) worth of memory as a buffer
+// add one layer worth of memory as a buffer
-layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
+if blk0, ok := layers["blk.0"]; ok {
-return cmp.Compare(a.Size(), b.Size())
+layerSize = blk0.Size()
-}).Size()
+} else {
+slog.Warn("model missing blk.0 layer size")
+}

var kvct string
if envconfig.FlashAttention() &&
@@ -219,7 +219,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}

// For all the layers, find where they can fit on the GPU(s)
-for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
+for i := range int(f.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.Size()
@@ -229,7 +229,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin

if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
// Stop allocating on GPU(s) once we hit the users target NumGPU
-overflow += layerSize
continue
}

@@ -246,13 +245,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
}
}

-if len(gpusWithSpace) == 0 {
-overflow += layerSize
-}
}
if layerCount >= int(f.KV().BlockCount()) {
fullyLoaded = true
+} else {
+for i := layerCount; i < int(f.KV().BlockCount()); i++ {
+overflow += layerSize
+}
}

// Determine if we need to consider output then find where it fits
@@ -408,21 +407,51 @@ func (m MemoryEstimate) LogValue() slog.Value {
return slog.GroupValue(attrs...)
}

-func projectorMemoryRequirements(filename string) (weights uint64) {
+func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
file, err := os.Open(filename)
if err != nil {
-return 0
+return 0, 0
}
defer file.Close()

ggml, _, err := ggml.Decode(file, 1024)
if err != nil {
-return 0
+return 0, 0
}

for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size()
}

-return weights
+switch arch := ggml.KV().Architecture(); arch {
+case "mllama":
+kv := func(n string) uint64 {
+if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
+return uint64(v)
+}

+return 0
+}

+imageSize := kv("image_size")

+maxNumTiles := kv("max_num_tiles")
+embeddingLength := kv("embedding_length")
+headCount := kv("attention.head_count")

+numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
+if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
+numPatches++
+}

+numPaddedPatches := numPatches + 8 - (numPatches%8)%8

+graphSize = 4 * (8 +
+imageSize*imageSize*kv("num_channels")*maxNumTiles +
+embeddingLength*numPatches*maxNumTiles +
+9*embeddingLength*numPaddedPatches*maxNumTiles +
+numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
+}

+return weights, graphSize
}
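The mllama branch above is essentially a closed-form size estimate for the vision projector's compute graph. A minimal standalone sketch of the same arithmetic, with plain integer inputs instead of GGUF metadata lookups; the example values in main are assumptions, not taken from this branch:

```go
package main

import "fmt"

// mllamaVisionGraphSize mirrors the graph-size arithmetic in the hunk above.
// All parameters are passed in directly rather than read from the GGUF KV store.
func mllamaVisionGraphSize(imageSize, patchSize, numChannels, maxNumTiles, embeddingLength, headCount uint64, hasClassEmbd bool) uint64 {
	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
	if hasClassEmbd {
		numPatches++
	}
	// padding expression copied as written in the patch
	numPaddedPatches := numPatches + 8 - (numPatches%8)%8

	return 4 * (8 +
		imageSize*imageSize*numChannels*maxNumTiles +
		embeddingLength*numPatches*maxNumTiles +
		9*embeddingLength*numPaddedPatches*maxNumTiles +
		numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
}

func main() {
	// hypothetical values: 560x560 image, 14x14 patches, 3 channels, 4 tiles, 1280-dim embeddings, 16 heads
	fmt.Println(mllamaVisionGraphSize(560, 14, 3, 4, 1280, 16, true))
}
```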
|
@ -311,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
|||||||
params = append(params, "--mmproj", projectors[0])
|
params = append(params, "--mmproj", projectors[0])
|
||||||
}
|
}
|
||||||
|
|
||||||
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
|
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
|
||||||
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
|
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
|
||||||
// without any LD_LIBRARY_PATH flags
|
// without any LD_LIBRARY_PATH flags
|
||||||
for {
|
for {
|
||||||
@@ -679,8 +679,9 @@ ws ::= ([ \t\n] ws)?
const maxBufferSize = 512 * format.KiloByte

type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
+AspectRatioID int `json:"aspect_ratio_id"`
}

type CompletionRequest struct {
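The new AspectRatioID field carries the tile-layout index chosen during image preprocessing. A hypothetical helper, not part of this change, showing how a caller in the same package might fill it in (the aspect-ratio value is assumed to come from the mllama preprocessor):

```go
// newImageData is illustrative only: it wraps raw image bytes plus a
// preprocessor-supplied aspect-ratio index into an ImageData value.
func newImageData(id int, data []byte, aspectRatioIndex int) ImageData {
	return ImageData{
		ID:            id,
		Data:          data,
		AspectRatioID: aspectRatioIndex,
	}
}
```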
@@ -119,18 +119,22 @@ type Context interface {
Layer(int) Context
}

-// RopeOptions contains optional parameters for RoPE function
+// RopeOpts contains optional parameters for RoPE function
-type RopeOptions struct {
+type RopeOpts struct {
-OriginalContextLen uint32
+DefaultContextLen uint32
+YarnExtFactor float32
+YarnAttnFactor float32
+YarnBetaFast float32
+YarnBetaSlow float32
}

// RopeOption defines a function that modifies RopeOpts
-type RopeOption func(*RopeOptions)
+type RopeOption func(*RopeOpts)

// WithContextLen sets a custom context length
func WithContextLen(len uint32) RopeOption {
-return func(opts *RopeOptions) {
+return func(opts *RopeOpts) {
-opts.OriginalContextLen = len
+opts.DefaultContextLen = len
}
}

@@ -160,6 +164,7 @@ type Tensor interface {
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
+RoPEMulti(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, sections [4]int32, ropeType uint32, base, scale float32) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

Sin(ctx Context) Tensor
@@ -176,6 +181,7 @@ type Tensor interface {
Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor

Pad(ctx Context, shape ...int) Tensor
+Unpad(ctx Context, shape ...int) Tensor

Stack(ctx Context, dim int, s ...Tensor) Tensor
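RoPE keeps its positional arguments but now takes variadic RopeOption values, so per-model overrides stay out of the common signature. A rough usage sketch against the interface above; the base, scale, and context-length numbers are placeholders, not defaults taken from this branch:

```go
package example

import "github.com/ollama/ollama/ml"

// applyRoPE is a sketch only: it rotates q with an overridden context length
// via the functional-options pattern; omit the option to keep the defaults.
func applyRoPE(ctx ml.Context, q, positions, ropeFactors ml.Tensor, dim, ropeType uint32) ml.Tensor {
	return q.RoPE(ctx, positions, ropeFactors, dim, ropeType, 10000.0, 1.0,
		ml.WithContextLen(32768), // hypothetical override of DefaultContextLen
	)
}
```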
@@ -1017,6 +1017,17 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
}
}

+func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
+if len(shape) != 4 {
+panic("expected 4 dimensions")
+}

+return &Tensor{
+b: t.b,
+t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
+}
+}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
switch len(shape) {
case 1:
@@ -1062,8 +1073,12 @@ const (

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
// Default options
-opts := &ml.RopeOptions{
+opts := &ml.RopeOpts{
-OriginalContextLen: 131072,
+DefaultContextLen: 131072,
+YarnExtFactor: 0.0,
+YarnAttnFactor: 1.0,
+YarnBetaFast: 32.0,
+YarnBetaSlow: 1.0,
}

// Apply any provided options
@@ -1089,13 +1104,44 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
ropeFactors.(*Tensor).t,
C.int(ropeDim),
C.int(ropeType),
-C.int(opts.OriginalContextLen),
+C.int(128000),
C.float(ropeBase),
C.float(ropeScale),
-C.float(0.0),
+C.float(opts.YarnExtFactor),
-C.float(1.0),
+C.float(opts.YarnAttnFactor),
-C.float(32.0),
+C.float(opts.YarnBetaFast),
-C.float(1.0),
+C.float(opts.YarnBetaSlow),
+),
+}
+}

+func (t *Tensor) RoPEMulti(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, sections [4]int32, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
+if ropeFactors == nil {
+ropeFactors = &Tensor{b: t.b}
+}

+dequant := t.t
+if C.ggml_is_quantized(t.t._type) {
+dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
+}

+return &Tensor{
+b: t.b,
+t: C.ggml_rope_multi(
+ctx.(*Context).ctx,
+dequant,
+positionIDs.(*Tensor).t,
+ropeFactors.(*Tensor).t,
+C.int(ropeDim),
+(*C.int)(&sections[0]),
+C.int(ropeType),
+C.int(128000), // Default context length
+C.float(ropeBase),
+C.float(ropeScale),
+C.float(0.0), // ext_factor
+C.float(1.0), // attn_factor
+C.float(32.0), // beta_fast
+C.float(1.0), // beta_slow
),
}
}
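RoPEMulti wraps ggml_rope_multi, which applies rotary embeddings per section of the position encoding (for example time/height/width splits in multimodal models). A hedged call-site sketch; the section split and the dim/rope-type constants below are assumptions, not values from this branch:

```go
package example

import "github.com/ollama/ollama/ml"

// applyMRoPE is illustrative only: the [4]int32 section split and the numeric
// constants are placeholders. Passing nil ropeFactors is handled by the backend.
func applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
	sections := [4]int32{16, 24, 24, 0}
	return states.RoPEMulti(ctx, positions, nil, 128, sections, 8, 10000.0, 1.0)
}
```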
10 ml/backend/ggml/ggml/include/ggml.h vendored
@@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1781,6 +1782,15 @@ extern "C" {
int p0,
int p1);

+// unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+GGML_API struct ggml_tensor * ggml_unpad(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int p0,
+int p1,
+int p2,
+int p3);

// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
@@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
-#ifdef GGML_USE_BLAS
+// #ifdef GGML_USE_BLAS
-register_backend(ggml_backend_blas_reg());
+// register_backend(ggml_backend_blas_reg());
-#endif
+// #endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
5 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c vendored
@@ -1953,6 +1953,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+case GGML_OP_UNPAD:
+{
+ggml_compute_forward_unpad(params, tensor);
+} break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2276,6 +2280,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
55 ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp vendored
@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}

+// ggml_compute_forward_unpad

+static void ggml_compute_forward_unpad_f32(
+const struct ggml_compute_params *params,
+struct ggml_tensor *dst) {

+const struct ggml_tensor * src0 = dst->src[0];

+GGML_ASSERT(src0->nb[0] == sizeof(float));
+GGML_ASSERT( dst->nb[0] == sizeof(float));

+const int ith = params->ith;
+const int nth = params->nth;

+GGML_TENSOR_UNARY_OP_LOCALS

+float * dst_ptr = (float *) dst->data;

+// TODO: optimize

+for (int64_t i2 = 0; i2 < ne2; ++i2) {
+for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+for (int64_t i0 = 0; i0 < ne0; ++i0) {
+for (int64_t i3 = 0; i3 < ne3; ++i3) {
+const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;

+const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);

+if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+dst_ptr[dst_idx] = *src_ptr;
+}
+}
+}
+}
+}
+}

+void ggml_compute_forward_unpad(
+const struct ggml_compute_params * params,
+struct ggml_tensor * dst) {

+const struct ggml_tensor * src0 = dst->src[0];

+switch (src0->type) {
+case GGML_TYPE_F32:
+{
+ggml_compute_forward_unpad_f32(params, dst);
+} break;
+default:
+{
+GGML_ABORT("fatal error");
+}
+}
+}

// ggml_compute_forward_arange

static void ggml_compute_forward_arange_f32(
1 ml/backend/ggml/ggml/src/ggml-cpu/ops.h vendored
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
+case GGML_OP_UNPAD:
+ggml_cuda_op_unpad(ctx, dst);
+break;
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
+case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
46 ml/backend/ggml/ggml/src/ggml-cuda/pad.cu vendored
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
}

+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+// blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+// blockIdx.y: idx of ne1
+// blockIDx.x: idx of ne0 / BLOCK_SIZE
+int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+if (nidx >= ne0) {
+return;
+}

+// operation
+int offset_dst =
+nidx +
+blockIdx.y * ne0 +
+blockIdx.z * ne0 * gridDim.y;
+if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+int offset_src =
+nidx +
+blockIdx.y * ne00 +
+blockIdx.z * ne00 * ne01;
+dst[offset_dst] = x[offset_src];
+}
+}

+static void unpad_f32_cuda(const float * x, float * dst,
+const int ne00, const int ne01, const int ne02, const int ne03,
+const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+dim3 gridDim(num_blocks, ne1, ne2*ne3);
+unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}

+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+const ggml_tensor * src0 = dst->src[0];
+const float * src0_d = (const float *)src0->data;
+float * dst_d = (float *)dst->data;
+cudaStream_t stream = ctx.stream();

+GGML_ASSERT(src0->type == GGML_TYPE_F32);
+GGML_ASSERT(dst->type == GGML_TYPE_F32);
+GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

+unpad_f32_cuda(src0_d, dst_d,
+src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
1 ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh vendored
@@ -3,3 +3,4 @@
#define CUDA_PAD_BLOCK_SIZE 256

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
@ -5599,6 +5599,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel void kernel_unpad_f32(
|
||||||
|
device const char * src0,
|
||||||
|
device char * dst,
|
||||||
|
constant int64_t & ne00,
|
||||||
|
constant int64_t & ne01,
|
||||||
|
constant int64_t & ne02,
|
||||||
|
constant int64_t & ne03,
|
||||||
|
constant uint64_t & nb00,
|
||||||
|
constant uint64_t & nb01,
|
||||||
|
constant uint64_t & nb02,
|
||||||
|
constant uint64_t & nb03,
|
||||||
|
constant int64_t & ne0,
|
||||||
|
constant int64_t & ne1,
|
||||||
|
constant int64_t & ne2,
|
||||||
|
constant int64_t & ne3,
|
||||||
|
constant uint64_t & nb0,
|
||||||
|
constant uint64_t & nb1,
|
||||||
|
constant uint64_t & nb2,
|
||||||
|
constant uint64_t & nb3,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
|
||||||
|
const int64_t i3 = tgpig.z;
|
||||||
|
const int64_t i2 = tgpig.y;
|
||||||
|
const int64_t i1 = tgpig.x;
|
||||||
|
|
||||||
|
const int64_t i03 = i3;
|
||||||
|
const int64_t i02 = i2;
|
||||||
|
const int64_t i01 = i1;
|
||||||
|
|
||||||
|
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||||
|
|
||||||
|
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||||
|
if (i0 < ne00) {
|
||||||
|
dst_ptr[i0] = src0_ptr[i0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kernel void kernel_arange_f32(
|
kernel void kernel_arange_f32(
|
||||||
device char * dst,
|
device char * dst,
|
||||||
constant ggml_metal_kargs_arange & args,
|
constant ggml_metal_kargs_arange & args,
|
||||||
|
33
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
vendored
33
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
vendored
@ -347,6 +347,7 @@ enum ggml_metal_kernel_type {
|
|||||||
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
|
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
|
||||||
|
GGML_METAL_KERNEL_TYPE_UNPAD_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
||||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||||
@ -1294,6 +1295,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
|||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||||
@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
|||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
case GGML_OP_PAD:
|
case GGML_OP_PAD:
|
||||||
case GGML_OP_PAD_REFLECT_1D:
|
case GGML_OP_PAD_REFLECT_1D:
|
||||||
|
case GGML_OP_UNPAD:
|
||||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
case GGML_OP_ARGSORT:
|
case GGML_OP_ARGSORT:
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
|
|||||||
|
|
||||||
const int nth = MIN(1024, ne0);
|
const int nth = MIN(1024, ne0);
|
||||||
|
|
||||||
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
|
} break;
|
||||||
|
case GGML_OP_UNPAD:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
|
||||||
|
|
||||||
|
[encoder setComputePipelineState:pipeline];
|
||||||
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||||
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||||
|
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||||
|
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||||
|
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||||
|
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||||
|
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||||
|
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||||
|
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||||
|
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||||
|
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||||
|
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||||
|
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||||
|
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||||
|
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||||
|
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||||
|
|
||||||
|
const int nth = MIN(1024, ne0);
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_ARANGE:
|
case GGML_OP_ARANGE:
|
||||||
|
@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel void kernel_unpad_f32(
|
||||||
|
device const char * src0,
|
||||||
|
device char * dst,
|
||||||
|
constant int64_t & ne00,
|
||||||
|
constant int64_t & ne01,
|
||||||
|
constant int64_t & ne02,
|
||||||
|
constant int64_t & ne03,
|
||||||
|
constant uint64_t & nb00,
|
||||||
|
constant uint64_t & nb01,
|
||||||
|
constant uint64_t & nb02,
|
||||||
|
constant uint64_t & nb03,
|
||||||
|
constant int64_t & ne0,
|
||||||
|
constant int64_t & ne1,
|
||||||
|
constant int64_t & ne2,
|
||||||
|
constant int64_t & ne3,
|
||||||
|
constant uint64_t & nb0,
|
||||||
|
constant uint64_t & nb1,
|
||||||
|
constant uint64_t & nb2,
|
||||||
|
constant uint64_t & nb3,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
|
||||||
|
const int64_t i3 = tgpig.z;
|
||||||
|
const int64_t i2 = tgpig.y;
|
||||||
|
const int64_t i1 = tgpig.x;
|
||||||
|
|
||||||
|
const int64_t i03 = i3;
|
||||||
|
const int64_t i02 = i2;
|
||||||
|
const int64_t i01 = i1;
|
||||||
|
|
||||||
|
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||||
|
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||||
|
|
||||||
|
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||||
|
if (i0 < ne00) {
|
||||||
|
dst_ptr[i0] = src0_ptr[i0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kernel void kernel_arange_f32(
|
kernel void kernel_arange_f32(
|
||||||
device char * dst,
|
device char * dst,
|
||||||
constant ggml_metal_kargs_arange & args,
|
constant ggml_metal_kargs_arange & args,
|
||||||
|
25 ml/backend/ggml/ggml/src/ggml.c vendored
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
+"UNPAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
+"unpad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}

+// ggml_unpad

+struct ggml_tensor * ggml_unpad(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int p0, int p1, int p2, int p3) {

+struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+a->ne[0] - p0,
+a->ne[1] - p1,
+a->ne[2] - p2,
+a->ne[3] - p3);

+result->op = GGML_OP_UNPAD;
+result->src[0] = a;

+return result;
+}

// ggml_arange

struct ggml_tensor * ggml_arange(
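As the ggml_unpad constructor above shows, the op trims p0..p3 elements from each dimension, making it the inverse of a matching pad; the vision-model hunk further down swaps a negative Pad for an explicit Unpad. A minimal sketch of that symmetry through the Go ml.Tensor interface from this change set (context and tensor setup are assumed):

```go
package example

import "github.com/ollama/ollama/ml"

// padRoundTrip is a sketch only: it grows dimension 1 by one position and then
// trims it back off, returning a tensor with the original shape.
func padRoundTrip(ctx ml.Context, t ml.Tensor) ml.Tensor {
	padded := t.Pad(ctx, 0, 1, 0, 0)     // ne[1] += 1
	return padded.Unpad(ctx, 0, 1, 0, 0) // ne[1] -= 1
}
```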
@@ -45,8 +45,6 @@ func New(c fs.Config) (model.Model, error) {
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
-// TODO: set EOT to EOS otherwise 0 will stop generation
-EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
},
),
Layers: make([]Layer, c.Uint("block_count")),
@@ -7,6 +7,7 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
+"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)

@@ -19,6 +20,9 @@ type TextConfig struct {
}

type TextModel struct {
+model.Base
+model.SentencePieceModel

TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []TextLayer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
@@ -41,6 +45,15 @@ func newTextModel(c fs.Config) *TextModel {
numBlocks := int(c.Uint("block_count"))

m := TextModel{
+SentencePieceModel: model.NewSentencePieceModel(
+&model.Vocabulary{
+Values: c.Strings("tokenizer.ggml.tokens"),
+Scores: c.Floats("tokenizer.ggml.scores"),
+Types: c.Ints("tokenizer.ggml.token_type"),
+BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
+EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
+},
+),
Layers: make([]TextLayer, numBlocks),
TextConfig: &TextConfig{
hiddenSize: int(c.Uint("embedding_length")),
@@ -47,9 +47,6 @@ func New(c fs.Config) (model.Model, error) {
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-// TODO: set EOT to EOS otherwise 0 will stop generation
-EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
-AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
Layers: make([]Layer, c.Uint("block_count")),
@@ -45,9 +45,6 @@ func New(c fs.Config) (model.Model, error) {
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-// TODO: set EOT to EOS otherwise 0 will stop generation
-EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
-AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
ImageProcessor: newImageProcessor(c),
@@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
}

hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps)
-hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0)
+hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0)
hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions)
return hiddenStates
}
|
@ -16,8 +16,6 @@ import (
|
|||||||
|
|
||||||
type Model struct {
|
type Model struct {
|
||||||
model.Base
|
model.Base
|
||||||
model.BytePairEncoding
|
|
||||||
|
|
||||||
*TextModel
|
*TextModel
|
||||||
*VisionModel `gguf:"v,vision"`
|
*VisionModel `gguf:"v,vision"`
|
||||||
*MultiModalProjector `gguf:"mm"`
|
*MultiModalProjector `gguf:"mm"`
|
||||||
@ -42,21 +40,6 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
VisionModel: newVisionModel(c),
|
VisionModel: newVisionModel(c),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
MultiModalProjector: newMultiModalProjector(c),
|
MultiModalProjector: newMultiModalProjector(c),
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
|
||||||
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
|
||||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
|
||||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
|
|
||||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
|
||||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
|
|
||||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
|
||||||
// TODO: set EOT to EOS otherwise 0 will stop generation
|
|
||||||
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
|
||||||
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
|
||||||
},
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
||||||
|
@ -21,6 +21,7 @@ type TextOptions struct {
|
|||||||
|
|
||||||
type TextModel struct {
|
type TextModel struct {
|
||||||
model.Base
|
model.Base
|
||||||
|
model.BytePairEncoding
|
||||||
|
|
||||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||||
Layers []Layer `gguf:"blk"`
|
Layers []Layer `gguf:"blk"`
|
||||||
@ -147,6 +148,18 @@ func NewTextModel(c fs.Config) (*TextModel, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
textModel := &TextModel{
|
textModel := &TextModel{
|
||||||
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
|
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||||
|
&model.Vocabulary{
|
||||||
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
|
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||||
|
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
|
||||||
|
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||||
|
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
|
||||||
|
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||||
|
},
|
||||||
|
),
|
||||||
Layers: make([]Layer, c.Uint("block_count")),
|
Layers: make([]Layer, c.Uint("block_count")),
|
||||||
TextOptions: &TextOptions{
|
TextOptions: &TextOptions{
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
|
201
model/models/mllama/imageproc.go
Normal file
201
model/models/mllama/imageproc.go
Normal file
@ -0,0 +1,201 @@
|
|||||||
|
package mllama
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"image"
|
||||||
|
_ "image/jpeg"
|
||||||
|
_ "image/png"
|
||||||
|
"io"
|
||||||
|
"math"
|
||||||
|
"slices"
|
||||||
|
|
||||||
|
"golang.org/x/image/draw"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/model/imageproc"
|
||||||
|
)
|
||||||
|
|
||||||
|
func getSupportedAspectRatios(maxTiles int) []image.Point {
|
||||||
|
ratios := []image.Point{}
|
||||||
|
|
||||||
|
for w := range maxTiles {
|
||||||
|
for h := range maxTiles {
|
||||||
|
if (w+1)*(h+1) <= maxTiles {
|
||||||
|
ratios = append(ratios, image.Point{w + 1, h + 1})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ratios
|
||||||
|
}
|
||||||
|
|
||||||
|
func clip(a, a_min, a_max int) int {
|
||||||
|
if a < a_min {
|
||||||
|
return a_min
|
||||||
|
} else if a > a_max {
|
||||||
|
return a_max
|
||||||
|
}
|
||||||
|
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
|
||||||
|
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
||||||
|
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
|
||||||
|
possibleCanvasSizes := []image.Point{}
|
||||||
|
for _, pta := range possibleTileArrangements {
|
||||||
|
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
||||||
|
}
|
||||||
|
|
||||||
|
scales := []float64{}
|
||||||
|
|
||||||
|
for _, pcs := range possibleCanvasSizes {
|
||||||
|
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
|
||||||
|
scaleWidth := float64(pcs.X) / float64(imageSize.X)
|
||||||
|
|
||||||
|
if scaleWidth > scaleHeight {
|
||||||
|
scales = append(scales, scaleHeight)
|
||||||
|
} else {
|
||||||
|
scales = append(scales, scaleWidth)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var minUpscale float64
|
||||||
|
var maxDownscale float64
|
||||||
|
var upscale bool
|
||||||
|
|
||||||
|
for _, s := range scales {
|
||||||
|
if s > 1.0 {
|
||||||
|
upscale = true
|
||||||
|
if minUpscale == 0 {
|
||||||
|
minUpscale = s
|
||||||
|
} else {
|
||||||
|
minUpscale = math.Min(minUpscale, s)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
maxDownscale = math.Max(maxDownscale, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedScale := maxDownscale
|
||||||
|
if upscale {
|
||||||
|
selectedScale = minUpscale
|
||||||
|
}
|
||||||
|
|
||||||
|
var selectedCanvas image.Point
|
||||||
|
for n, pcs := range possibleCanvasSizes {
|
||||||
|
if scales[n] == selectedScale {
|
||||||
|
// choose the smallest possible canvas
|
||||||
|
			if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
				selectedCanvas = pcs
			} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
				selectedCanvas = pcs
			}
		}
	}
	return selectedCanvas
}

func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
	targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)

	scaleWidth := float64(targetWidth) / float64(imageSize.X)
	scaleHeight := float64(targetHeight) / float64(imageSize.Y)

	var w, h int

	if scaleWidth < scaleHeight {
		w = targetWidth
		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
	} else {
		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
		h = targetHeight
	}

	return image.Point{w, h}
}

func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
	if format == "png" {
		img = imageproc.Composite(img)
	}

	b := img.Bounds()
	tileSize := outputSize.Y

	canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
	newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)

	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
}

func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
	draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)

	return dst
}

func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	b := img.Bounds()
	width := b.Max.X - b.Min.X
	height := b.Max.Y - b.Min.Y
	tileHeight := height / numTilesSize.Y
	tileWidth := width / numTilesSize.X

	images := []image.Image{}

	for h := range numTilesSize.Y {
		for w := range numTilesSize.X {
			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
			images = append(images, img.(interface {
				SubImage(image.Rectangle) image.Image
			}).SubImage(rect))
		}
	}

	return images
}

func packImages(img image.Image, aspectRatio image.Point) []float32 {
	subImages := splitToTiles(img, aspectRatio)

	var pixelVals []float32

	rescale := true
	channelFirst := true

	for _, subImg := range subImages {
		vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
		pixelVals = append(pixelVals, vals...)
	}

	return pixelVals
}

func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
	outputSize := image.Point{560, 560}
	maxTiles := 4

	img, format, err := image.Decode(imageData)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
	}

	newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
	newImage = padImage(newImage, outputSize, aspectRatio)

	data := packImages(newImage, aspectRatio)
	aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1

	opts := map[string]any{
		"aspectRatioIndex": aspectRatioIndex,
	}

	return data, opts, nil
}
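As a reading aid (not part of the change set): a minimal sketch of how the new Preprocess entry point could be driven from another package. The import path is assumed from the model/models/mllama layout shown in this diff, and "input.png" is a hypothetical file name.

package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/model/models/mllama"
)

func main() {
	// Preprocess only needs an io.Reader of encoded image bytes.
	f, err := os.Open("input.png")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	pixels, opts, err := mllama.Preprocess(f)
	if err != nil {
		panic(err)
	}

	// pixels holds channel-first float32 values for each 560x560 tile;
	// opts carries the 1-based aspect ratio index used by the vision encoder.
	fmt.Println(len(pixels), opts["aspectRatioIndex"])
}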
420 model/models/mllama/imageproc_test.go Normal file
@@ -0,0 +1,420 @@
package mllama

import (
	"bytes"
	"image"
	"image/png"
	"testing"

	"github.com/google/go-cmp/cmp"
)

func TestAspectRatios(t *testing.T) {
	type aspectCase struct {
		MaxTiles int
		Expected []image.Point
	}

	cases := []aspectCase{
		{
			MaxTiles: 1,
			Expected: []image.Point{{1, 1}},
		},
		{
			MaxTiles: 2,
			Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
		},
		{
			MaxTiles: 3,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
		},
		{
			MaxTiles: 4,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
		},
	}

	for _, c := range cases {
		actual := getSupportedAspectRatios(c.MaxTiles)

		if diff := cmp.Diff(actual, c.Expected); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestGetImageSizeFitToCanvas(t *testing.T) {
	type imageSizeCase struct {
		ImageRect  image.Point
		CanvasRect image.Point
		TileSize   int
		Expected   image.Point
	}

	cases := []imageSizeCase{
		{
			ImageRect:  image.Point{400, 400},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{400, 400},
		},
		{
			ImageRect:  image.Point{1024, 768},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{640, 480},
		},
		{
			ImageRect:  image.Point{500, 500},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   750,
			Expected:   image.Point{750, 750},
		},
		{
			ImageRect:  image.Point{500, 1000},
			CanvasRect: image.Point{2000, 2000},
			TileSize:   2000,
			Expected:   image.Point{1000, 2000},
		},
		{
			ImageRect:  image.Point{4000, 3000},
			CanvasRect: image.Point{2000, 1000},
			TileSize:   1000,
			Expected:   image.Point{1333, 1000},
		},
		{
			ImageRect:  image.Point{667, 1000},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   560,
			Expected:   image.Point{667, 1000},
		},
	}

	for _, c := range cases {
		actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}
func TestGetOptimalTiledCanvas(t *testing.T) {
	type tiledCanvasSizeCase struct {
		ImageSize     image.Point
		MaxImageTiles int
		TileSize      int
		Expected      image.Point
	}

	cases := []tiledCanvasSizeCase{
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      1000,
			Expected:      image.Point{2000, 1000},
		},
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{800, 600},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{640, 480},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 560},
		},
		{
			ImageSize:     image.Point{320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{1320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1680, 560},
		},
		{
			ImageSize:     image.Point{2000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			ImageSize:     image.Point{10000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			ImageSize:     image.Point{480, 640},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1120},
		},
		{
			ImageSize:     image.Point{200, 320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{200, 1320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1680},
		},
		{
			ImageSize:     image.Point{200, 2000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			ImageSize:     image.Point{200, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			ImageSize:     image.Point{10000, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
	}

	for _, c := range cases {
		actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}

func TestSplitToTiles(t *testing.T) {
	type splitCase struct {
		TestImage    image.Image
		NumTilesSize image.Point
		Expected     []image.Image
	}

	cases := []splitCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			NumTilesSize: image.Point{1, 1},
			Expected:     []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 500)),
			NumTilesSize: image.Point{2, 1},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
			NumTilesSize: image.Point{2, 2},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	for _, c := range cases {
		actual := splitToTiles(c.TestImage, c.NumTilesSize)

		if len(actual) != len(c.Expected) {
			t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
		}

		for i := range actual {
			if actual[i].Bounds() != c.Expected[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
			}
		}
	}
}
func TestResize(t *testing.T) {
	type resizeCase struct {
		TestImage           image.Image
		OutputSize          image.Point
		MaxImageTiles       int
		ExpectedImage       image.Image
		ExpectedAspectRatio image.Point
	}

	cases := []resizeCase{
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       1,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       2,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 10, 10)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
	}

	for _, c := range cases {
		actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)

		if actualImage.Bounds() != c.ExpectedImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
		}

		if actualAspectRatio != c.ExpectedAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
		}
	}
}

func TestPad(t *testing.T) {
	type padCase struct {
		TestImage   image.Image
		OutputSize  image.Point
		AspectRatio image.Point
		Expected    image.Image
	}

	cases := []padCase{
		{
			TestImage:   image.NewRGBA(image.Rect(0, 0, 1000, 667)),
			OutputSize:  image.Point{560, 560},
			AspectRatio: image.Point{2, 2},
			Expected:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
		},
	}

	for _, c := range cases {
		actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)

		if actual.Bounds() != c.Expected.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
		}
	}
}

func TestPackImages(t *testing.T) {
	type packCase struct {
		TestImage    image.Image
		AspectRatio  image.Point
		ExpectedVals int
	}

	cases := []packCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
			AspectRatio:  image.Point{2, 2},
			ExpectedVals: 2 * 2 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 560, 560)),
			AspectRatio:  image.Point{1, 1},
			ExpectedVals: 1 * 1 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 560)),
			AspectRatio:  image.Point{1, 2},
			ExpectedVals: 1 * 2 * 3 * 560 * 560,
		},
	}

	for _, c := range cases {
		actualVals := packImages(c.TestImage, c.AspectRatio)
		if len(actualVals) != c.ExpectedVals {
			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
		}
	}
}

func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage             image.Image
		ExpectedVals          int
		ExpectedAspectRatioID int
	}

	cases := []preprocessCase{
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 1,
		},
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 6,
		},
	}

	for _, c := range cases {
		var buf bytes.Buffer
		err := png.Encode(&buf, c.TestImage)
		if err != nil {
			t.Fatal(err)
		}

		imgData, opts, err := Preprocess(&buf)
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		if len(imgData) == 0 {
			t.Errorf("no image data returned")
		}

		ar, ok := opts["aspectRatioIndex"]
		if !ok {
			t.Fatalf("no aspect ratio found")
		}

		aspectRatioID := ar.(int)

		if aspectRatioID != c.ExpectedAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
		}
	}
}
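For orientation, the 1-based aspect ratio ID exercised above is simply the position of the chosen tile layout in the supported-ratio list plus one. A small standalone sketch (it duplicates getSupportedAspectRatios from the diff purely for illustration) shows why a 1024x768 image, which lands on a 2x2 tile canvas, gets ID 6 in TestPreprocess:

package main

import (
	"fmt"
	"image"
	"slices"
)

// Same enumeration order as the diff: width grows in the outer loop,
// height in the inner one, keeping only layouts within the tile budget.
func getSupportedAspectRatios(maxTiles int) []image.Point {
	ratios := []image.Point{}
	for w := range maxTiles {
		for h := range maxTiles {
			if (w+1)*(h+1) <= maxTiles {
				ratios = append(ratios, image.Point{w + 1, h + 1})
			}
		}
	}
	return ratios
}

func main() {
	ratios := getSupportedAspectRatios(4)
	fmt.Println(ratios) // [(1,1) (1,2) (1,3) (1,4) (2,1) (2,2) (3,1) (4,1)]

	// The 2x2 layout sits at index 5, so its 1-based ID is 6, the value
	// TestPreprocess expects for a 1024x768 input.
	fmt.Println(slices.Index(ratios, image.Point{2, 2}) + 1) // 6
}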
@@ -2,7 +2,11 @@ package mllama

 import (
 	"bytes"
+	"encoding/binary"
+	"fmt"
+	"hash/fnv"
 	"image"
+	"slices"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -30,6 +34,10 @@ const (
 )

 func New(c fs.Config) (model.Model, error) {
+	// Verify unified config
+	if c.Uint("vision.block_count") == 0 {
+		return nil, fmt.Errorf("non-unified vision model not supported")
+	}
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@@ -41,9 +49,6 @@ func New(c fs.Config) (model.Model, error) {
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -68,19 +73,22 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	f32s, ratio, err := m.ImageProcessor.ProcessImage(image)
+	f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image)
 	if err != nil {
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.numChannels,
+		m.ImageProcessor.maxNumTiles,
+	)
 	if err != nil {
 		return nil, err
 	}

-	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
-
-	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
+	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1)
 	if err != nil {
 		return nil, err
 	}
@@ -91,19 +99,41 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
+	var images []input.Input
+	fnvHash := fnv.New64a()
+
 	for i := range inputs {
-		if inputs[i].Multimodal != nil {
-			inputs[i].Token = 128256 // <|image|>
+		if inputs[i].Multimodal == nil {
+			if len(images) > 0 {
+				inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)}
+				inputs[i].MultimodalHash = images[0].MultimodalHash
+				for j := 1; j < len(images); j++ {
+					inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor))
+					fnvHash.Reset()
+					binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash)
+					binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash)
+					inputs[i].MultimodalHash = fnvHash.Sum64()
+				}
+				images = nil
+			}
+		} else {
+			images = append(images, inputs[i])
+			inputs[i].Token = -1
 		}
 	}

+	inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 })
+
 	return inputs, nil
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	var crossAttentionStates ml.Tensor
 	if len(batch.Multimodal) > 0 {
-		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
+		images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor)
+		if len(images) > 0 {
+			crossAttentionStates = images[len(images)-1]
+		}
 	}

 	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
@@ -117,7 +147,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	}

 	// TODO: attention mask, cross attention mask
-	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
+	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
 }

 func init() {
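A side note on the PostTokenize change above: a token that follows a run of images now carries a hash that folds together the per-image MultimodalHash values with FNV-1a. A standalone sketch of that folding step (the helper name is hypothetical, not code from the diff):

package main

import (
	"encoding/binary"
	"fmt"
	"hash/fnv"
)

// combineHashes chains two 64-bit hashes the same way the PostTokenize
// loop does: reset the FNV-1a state, write both values, take the sum.
func combineHashes(a, b uint64) uint64 {
	h := fnv.New64a()
	binary.Write(h, binary.NativeEndian, a)
	binary.Write(h, binary.NativeEndian, b)
	return h.Sum64()
}

func main() {
	fmt.Printf("%#x\n", combineHashes(0xdeadbeef, 0xfeedface))
}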
@@ -18,7 +18,7 @@ type TextSelfAttention struct {
 	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
 }

-func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
 	ropeType := uint32(0)
@@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct {
 	MLP *TextMLP
 }

-func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
+	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)

 	// In the final layer (outputs != nil), optimize by pruning to just the token positions
 	// we need logits for.
@@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct {
 	MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
 }

-func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
 }

 type TextDecoderLayer interface {
-	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
+	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
 }

 type TextDecoder struct {
 	Layers []TextDecoderLayer
 }

-func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	for i, layer := range d.Layers {
 		layerType := selfAttentionLayer
 		if slices.Contains(opts.crossAttentionLayers, int32(i)) {
@@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
 			lastLayerOutputs = outputs
 		}

-		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts)
+		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts)
 	}
 }
@@ -214,9 +214,9 @@ type TextModel struct {
 	*TextModelOptions
 }

-func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
+func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
-	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
+	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
 	return m.Output.Forward(ctx, hiddenState)
 }
@@ -15,7 +15,7 @@ type VisionSelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_output"`
+	Output *nn.Linear `gguf:"attn_out"`

 	Gate ml.Tensor `gguf:"attn_gate"`
 }
@@ -45,29 +45,36 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)

 	hiddenState = sa.Output.Forward(ctx, attention)
+	if sa.Gate != nil {
+		hiddenState = hiddenState.Mul(ctx, sa.Gate)
+	}
+
 	return hiddenState
 }

 type VisionMLP struct {
-	Up   *nn.Linear `gguf:"ffn_up"`
 	Down *nn.Linear `gguf:"ffn_down"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+
+	Gate ml.Tensor `gguf:"ffn_gate"`
 }

 func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
-	hiddenState = mlp.Down.Forward(ctx, hiddenState)
+	hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
+	hiddenState = mlp.Up.Forward(ctx, hiddenState)
+	if mlp.Gate != nil {
+		hiddenState = hiddenState.Mul(ctx, mlp.Gate)
+	}
+
 	return hiddenState
 }

 type VisionEncoderLayer struct {
-	AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
+	AttentionNorm *nn.LayerNorm `gguf:"ln1"`
 	SelfAttention *VisionSelfAttention
-	AttentionGate ml.Tensor `gguf:"attn_gate"`

-	MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
+	MLPNorm *nn.LayerNorm `gguf:"ln2"`
 	MLP     *VisionMLP
-	MLPGate ml.Tensor `gguf:"ffn_gate"`
 }

 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@@ -76,22 +83,13 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
-
-	if e.AttentionGate != nil {
-		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
-	}
-
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState

 	// feed forward
 	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
-	hiddenState = hiddenState.Add(ctx, residual)
-	if e.MLPGate != nil {
-		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
-	}
-
-	return hiddenState
+	return hiddenState.Add(ctx, residual)
 }

 type VisionEncoder struct {
@@ -116,9 +114,9 @@ type PrecomputedAspectRatioEmbedding struct {
 	Gate ml.Tensor `gguf:"gate"`
 }

-func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
+func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
-	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
+	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
 	if e.Gate != nil {
 		embeddings = embeddings.Mul(ctx, e.Gate)
 	}
@@ -134,7 +132,7 @@ type PrecomputedPositionEmbedding struct {
 	TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"`
 }

-func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
+func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor {
 	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
 	if e.PositionEmbeddingGate != nil {
 		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
@@ -143,7 +141,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
 	hiddenState = hiddenState.Add(ctx, positionEmbedding)

 	tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
-	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
+	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
 	if e.TilePositionEmbeddingGate != nil {
 		tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
 	}
@@ -152,9 +150,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
 }

 type VisionModelOptions struct {
-	hiddenSize, numHeads int
+	hiddenSize, numHeads, numTiles int
 	imageSize, patchSize int

 	eps float32

 	intermediateLayersIndices []int32
 }
@@ -183,16 +181,14 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
 		numPositions++
 	}

-	numTiles := pixelValues.Dim(3)
-
 	hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
-	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
+	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
 	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

-	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
-	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)
+	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
+	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1)

-	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
+	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
 	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)

 	numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
@@ -203,18 +199,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
 	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
 	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)

 	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
-	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)
+	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)

-	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
-	hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
 	return hiddenState.Concat(ctx, hiddenStates, 0)
 }
@@ -226,6 +222,7 @@ func newVisionModel(c fs.Config) *VisionModel {
 		VisionModelOptions: &VisionModelOptions{
 			hiddenSize: int(c.Uint("vision.embedding_length")),
 			numHeads:   int(c.Uint("vision.attention.head_count")),
+			numTiles:   int(c.Uint("vision.max_num_tiles")),

 			imageSize: int(c.Uint("vision.image_size")),
 			patchSize: int(c.Uint("vision.patch_size")),
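Before the imageproc.go changes below, a small worked example of the clip-and-fit arithmetic that both versions of the resize path share (a standalone sketch, not code from the diff): for a 1024x768 image on the optimal 1120x1120 canvas with 560-pixel tiles, both scale factors come out as 1, so the image keeps its original size, matching the {1024, 768} expectation in TestResize.

package main

import (
	"fmt"
	"math"
)

// clip bounds v to [lo, hi], as in the image processor.
func clip(v, lo, hi int) int {
	if v < lo {
		return lo
	} else if v > hi {
		return hi
	}
	return v
}

// fit reproduces the fit-to-canvas arithmetic for a single case.
func fit(imageW, imageH, canvasW, canvasH, tileSize int) (int, int) {
	targetW := clip(imageW, tileSize, canvasW)
	targetH := clip(imageH, tileSize, canvasH)

	scaleW := float64(targetW) / float64(imageW)
	scaleH := float64(targetH) / float64(imageH)

	if scaleW < scaleH {
		return targetW, min(int(math.Floor(float64(imageH)*scaleW)), targetH)
	}
	return min(int(math.Floor(float64(imageW)*scaleH)), targetW), targetH
}

func main() {
	// Both scale factors are 1, so the size is unchanged.
	fmt.Println(fit(1024, 768, 1120, 1120, 560)) // 1024 768
}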
@@ -2,31 +2,17 @@ package mllama

 import (
 	"image"
+	"image/color"
 	"math"
 	"slices"

 	"golang.org/x/image/draw"

 	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/model/imageproc"
 )

-type supportedAspectRatio struct {
-	rank, width, height int
-}
-
-func (a supportedAspectRatio) Point() image.Point {
-	return image.Point{a.width, a.height}
-}
-
-func (a supportedAspectRatio) numTiles() int {
-	return a.width * a.height
-}
-
 type ImageProcessor struct {
 	imageSize, numChannels, maxNumTiles int
-
-	mean, std [3]float32
 }

 func newImageProcessor(c fs.Config) ImageProcessor {
@@ -34,49 +20,71 @@ func newImageProcessor(c fs.Config) ImageProcessor {
 		imageSize:   int(c.Uint("vision.image_size")),
 		numChannels: int(c.Uint("vision.num_channels")),
 		maxNumTiles: int(c.Uint("vision.max_num_tiles")),
-
-		mean: imageproc.ClipDefaultMean,
-		std:  imageproc.ClipDefaultSTD,
 	}
 }

-func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
-	for w := 1; w <= p.maxNumTiles; w++ {
-		for h := 1; h <= p.maxNumTiles/w; h++ {
-			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
+func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
+	ratios := []image.Point{}
+
+	for w := range maxTiles {
+		for h := range maxTiles {
+			if (w+1)*(h+1) <= maxTiles {
+				ratios = append(ratios, image.Point{w + 1, h + 1})
+			}
 		}
 	}
+
 	return ratios
 }

-func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
-	tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
-	th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
+func (p *ImageProcessor) clip(a, a_min, a_max int) int {
+	if a < a_min {
+		return a_min
+	} else if a > a_max {
+		return a_max
+	}
+
+	return a
+}

-	r := math.Min(
-		float64(tw)/float64(imageSize.X),
-		float64(th)/float64(imageSize.Y),
-	)
+func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
+	targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
+	targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
+
+	scaleWidth := float64(targetWidth) / float64(imageSize.X)
+	scaleHeight := float64(targetHeight) / float64(imageSize.Y)
+
+	var w, h int

-	w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
-	h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
+	if scaleWidth < scaleHeight {
+		w = targetWidth
+		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
+	} else {
+		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
+		h = targetHeight
+	}

 	return image.Point{w, h}
 }

-func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
-	possibleTileArrangements := p.supportedAspectRatios()
-	possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
-	for i, pta := range possibleTileArrangements {
-		possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
+func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
+	possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
+	possibleCanvasSizes := []image.Point{}
+	for _, pta := range possibleTileArrangements {
+		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
 	}

-	scales := make([]float64, len(possibleCanvasSizes))
-	for i, pcs := range possibleCanvasSizes {
-		scales[i] = min(
-			float64(pcs.Y)/float64(imageSize.Y),
-			float64(pcs.X)/float64(imageSize.X),
-		)
+	scales := []float64{}
+
+	for _, pcs := range possibleCanvasSizes {
+		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
+		scaleWidth := float64(pcs.X) / float64(imageSize.X)
+
+		if scaleWidth > scaleHeight {
+			scales = append(scales, scaleHeight)
+		} else {
+			scales = append(scales, scaleWidth)
+		}
 	}

 	var minUpscale float64
@@ -115,41 +123,47 @@ func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
 	return selectedCanvas
 }

-func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
+func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
 	b := img.Bounds()
 	width := b.Max.X - b.Min.X
 	height := b.Max.Y - b.Min.Y
 	tileHeight := height / numTilesSize.Y
 	tileWidth := width / numTilesSize.X

-	images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)
+	images := []image.Image{}

 	for h := range numTilesSize.Y {
 		for w := range numTilesSize.X {
 			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
-			if subImg, ok := img.(interface {
+			images = append(images, img.(interface {
 				SubImage(image.Rectangle) image.Image
-			}); ok {
-				images = append(images, subImg.SubImage(rect))
-			} else {
-				// Handle the case where img does not implement SubImage
-				// This is a fallback and may not be efficient
-				newImg := image.NewRGBA(rect)
-				draw.Draw(newImg, rect, img, rect.Min, draw.Src)
-				images = append(images, newImg)
-			}
+			}).SubImage(rect))
 		}
 	}

 	return images
 }

-func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
-	b := img.Bounds()
+// remove the "alpha" channel by drawing over a prefilled image
+//
+//nolint:unused
+func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
+	dst := image.NewRGBA(img.Bounds())

-	canvasSize := p.optimalTiledCanvas(b.Max)
-	aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
-	newSize := p.fitToCanvas(b.Max, canvasSize)
+	white := color.RGBA{255, 255, 255, 255}
+	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
+	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
+
+	return dst
+}
+
+func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
+	b := img.Bounds()
+	tileSize := outputSize.Y
+
+	canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
+	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
+	newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)

 	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
@@ -163,10 +177,10 @@ func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
 	return dst, aspectRatio
 }

-func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
+func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
 	paddedSize := image.Point{
-		X: p.imageSize * aspectRatio.X,
-		Y: p.imageSize * aspectRatio.Y,
+		X: outputSize.X * aspectRatio.X,
+		Y: outputSize.Y * aspectRatio.Y,
 	}

 	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
@@ -175,7 +189,7 @@ func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Imag
 	return dst
 }

-func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
+func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
 	subImages := p.splitToTiles(img, aspectRatio)

 	var pixelVals []float32
@@ -191,9 +205,9 @@ func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32
 			gVal := float32(g>>8) / 255.0
 			bVal := float32(b>>8) / 255.0

-			rVal = (rVal - p.mean[0]) / p.std[0]
-			gVal = (gVal - p.mean[1]) / p.std[1]
-			bVal = (bVal - p.mean[2]) / p.std[2]
+			rVal = (rVal - mean[0]) / std[0]
+			gVal = (gVal - mean[1]) / std[1]
+			bVal = (bVal - mean[2]) / std[2]

 			rVals = append(rVals, rVal)
 			gVals = append(gVals, gVal)
@@ -208,15 +222,17 @@ func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32
 	return pixelVals
 }

-func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
-	newImage, newImageRatio := p.resize(img)
-	newImage = p.pad(newImage, newImageRatio)
-	pixelValues := p.pack(newImage, newImageRatio)
+func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
+	outputSize := image.Point{p.imageSize, p.imageSize}

-	supportedAspectRatios := p.supportedAspectRatios()
-	aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
-		return i.width == newImageRatio.X && i.height == newImageRatio.Y
-	})
+	// clip values
+	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
+	std := [3]float32{0.26862954, 0.26130258, 0.27577711}

-	return pixelValues, supportedAspectRatios[aspectRatioID], nil
+	newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
+	newImage = p.pad(newImage, outputSize, aspectRatio)
+
+	data := p.pack(newImage, aspectRatio, mean, std)
+	aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
+	return data, aspectRatioIndex, nil
 }
@@ -1,387 +0,0 @@
package mllama

import (
	"image"
	"testing"

	"github.com/google/go-cmp/cmp"
)

func TestSupportedAspectRatios(t *testing.T) {
	cases := []struct {
		p    ImageProcessor
		want []supportedAspectRatio
	}{
		{
			p: ImageProcessor{maxNumTiles: 1},
			want: []supportedAspectRatio{
				{1, 1, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 2},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 2, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 3},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 1, 3},
				{4, 2, 1},
				{5, 3, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 4},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 1, 3},
				{4, 1, 4},
				{5, 2, 1},
				{6, 2, 2},
				{7, 3, 1},
				{8, 4, 1},
			},
		},
	}

	for _, tt := range cases {
		actual := tt.p.supportedAspectRatios()
		if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestFitToCanvas(t *testing.T) {
	cases := []struct {
		p      ImageProcessor
		image  image.Point
		canvas image.Point
		expect image.Point
	}{
		{
			p:      ImageProcessor{imageSize: 200},
			image:  image.Point{400, 400},
			canvas: image.Point{640, 480},
			expect: image.Point{400, 400},
		},
		{
			p:      ImageProcessor{imageSize: 200},
			image:  image.Point{1024, 768},
			canvas: image.Point{640, 480},
			expect: image.Point{640, 480},
		},
		{
			p:      ImageProcessor{imageSize: 750},
			image:  image.Point{500, 500},
			canvas: image.Point{1000, 1000},
			expect: image.Point{750, 750},
		},
		{
			p:      ImageProcessor{imageSize: 2000},
			image:  image.Point{500, 1000},
			canvas: image.Point{2000, 2000},
			expect: image.Point{1000, 2000},
		},
		{
			p:      ImageProcessor{imageSize: 1000},
			image:  image.Point{4000, 3000},
			canvas: image.Point{2000, 1000},
			expect: image.Point{1333, 1000},
		},
		{
			p:      ImageProcessor{imageSize: 560},
			image:  image.Point{667, 1000},
			canvas: image.Point{1000, 1000},
			expect: image.Point{667, 1000},
		},
	}

	for _, tt := range cases {
		actual := tt.p.fitToCanvas(tt.image, tt.canvas)
		if diff := cmp.Diff(actual, tt.expect); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestOptimalTiledCanvas(t *testing.T) {
	cases := []struct {
		p      ImageProcessor
		image  image.Point
		expect image.Point
	}{
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 1000},
			image:  image.Point{1024, 768},
			expect: image.Point{2000, 1000},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{1024, 768},
			expect: image.Point{1120, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{800, 600},
			expect: image.Point{1120, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{640, 480},
			expect: image.Point{1120, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{320, 200},
			expect: image.Point{560, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{1320, 200},
			expect: image.Point{1680, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{2000, 200},
			expect: image.Point{2240, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{10000, 200},
			expect: image.Point{2240, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{480, 640},
			expect: image.Point{560, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 320},
			expect: image.Point{560, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 1320},
			expect: image.Point{560, 1680},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 2000},
			expect: image.Point{560, 2240},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 10000},
			expect: image.Point{560, 2240},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{10000, 10000},
			expect: image.Point{1120, 1120},
		},
	}

	for _, tt := range cases {
		actual := tt.p.optimalTiledCanvas(tt.image)
		if diff := cmp.Diff(actual, tt.expect); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}

func TestSplitToTiles(t *testing.T) {
	cases := []struct {
		imageMax image.Point
		numTiles image.Point
		expect   []image.Image
	}{
		{
			imageMax: image.Point{1024, 768},
			numTiles: image.Point{1, 1},
			expect:   []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			imageMax: image.Point{1000, 500},
			numTiles: image.Point{2, 1},
			expect: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			imageMax: image.Point{1000, 1000},
			numTiles: image.Point{2, 2},
			expect: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	var p ImageProcessor

	for _, tt := range cases {
		actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)

		if len(actual) != len(tt.expect) {
			t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
		}

		for i := range actual {
			if actual[i].Bounds() != tt.expect[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
			}
		}
	}
}

func TestResize(t *testing.T) {
	cases := []struct {
		p                 ImageProcessor
		imageMax          image.Point
		expectImage       image.Image
		expectAspectRatio image.Point
	}{
		{
			p:                 ImageProcessor{maxNumTiles: 1, imageSize: 100},
			imageMax:          image.Point{200, 200},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 2, imageSize: 100},
			imageMax:          image.Point{200, 200},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{10, 10},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{2560, 1920},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			expectAspectRatio: image.Point{2, 2},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{1024, 768},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			expectAspectRatio: image.Point{2, 2},
		},
	}

	for _, tt := range cases {
		actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})

		if actualImage.Bounds() != tt.expectImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
		}

		if actualAspectRatio != tt.expectAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
		}
	}
}

func TestPad(t *testing.T) {
	cases := []struct {
		p           ImageProcessor
		imageMax    image.Point
		aspectRatio image.Point
expect image.Image
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
|
|
||||||
imageMax: image.Point{1000, 667},
|
|
||||||
aspectRatio: image.Point{2, 2},
|
|
||||||
expect: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range cases {
|
|
||||||
actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)
|
|
||||||
|
|
||||||
if actual.Bounds() != tt.expect.Bounds() {
|
|
||||||
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestPackImages(t *testing.T) {
|
|
||||||
cases := []struct {
|
|
||||||
imageMax image.Point
|
|
||||||
aspectRatio image.Point
|
|
||||||
expectVals int
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
imageMax: image.Point{1120, 1120},
|
|
||||||
aspectRatio: image.Point{2, 2},
|
|
||||||
expectVals: 2 * 2 * 3 * 560 * 560,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
imageMax: image.Point{560, 560},
|
|
||||||
aspectRatio: image.Point{1, 1},
|
|
||||||
expectVals: 1 * 1 * 3 * 560 * 560,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
imageMax: image.Point{1120, 560},
|
|
||||||
aspectRatio: image.Point{1, 2},
|
|
||||||
expectVals: 1 * 2 * 3 * 560 * 560,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range cases {
|
|
||||||
var p ImageProcessor
|
|
||||||
actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
|
|
||||||
if len(actualVals) != tt.expectVals {
|
|
||||||
t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestPreprocess(t *testing.T) {
|
|
||||||
cases := []struct {
|
|
||||||
imageMax image.Point
|
|
||||||
expectAspectRatioID int
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
imageMax: image.Point{10, 10},
|
|
||||||
expectAspectRatioID: 1,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
imageMax: image.Point{1024, 768},
|
|
||||||
expectAspectRatioID: 6,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
|
|
||||||
for _, tt := range cases {
|
|
||||||
img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("error processing: %q", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(img) == 0 {
|
|
||||||
t.Errorf("no image data returned")
|
|
||||||
}
|
|
||||||
|
|
||||||
if aspectRatio.rank != tt.expectAspectRatioID {
|
|
||||||
t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
@@ -16,8 +6,6 @@ import (
 
 type Model struct {
 	model.Base
-	model.BytePairEncoding
-
 	*TextModel
 	*VisionModel `gguf:"v,vision"`
 
@@ -29,20 +27,6 @@ var _ model.MultimodalProcessor = (*Model)(nil)
 
 func New(c fs.Config) (model.Model, error) {
 	m := &Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
-			},
-		),
 		TextModel:      NewTextModel(c),
 		VisionModel:    newVisionModel(c),
 		ImageProcessor: newImageProcessor(c),
@@ -88,13 +72,13 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	}
 
 	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
-	return &chunks{Model: m, Tensor: visionOutputs}, nil
+	return &chunks{Model: m, Tensor: visionOutputs, grid: grid}, nil
 }
 
 type chunks struct {
 	*Model
 	ml.Tensor
+	grid *Grid
 	dataOnce sync.Once
 	data     []float32
 }
@@ -134,7 +118,7 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 			// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
 			// the image tokens with a prompt, so we add a prefix here
 			nImg++
-			pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
+			pre, err := m.TextModel.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
 			if err != nil {
 				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
 			}
@@ -169,7 +153,28 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	fmt.Println("Forward")
+	pos := make([]int32, len(batch.Positions)*4)
+	var grid = &Grid{}
+	if len(batch.Multimodal) > 0 {
+		image := batch.Multimodal[0].Multimodal
+		grid = image.(*chunk).chunks.grid
+		for y := 0; y < grid.Height/2; y++ {
+			for x := 0; x < grid.Width/2; x++ {
+				i := y*grid.Width/2 + x
+				pos[i] = batch.Positions[i]
+				pos[i+len(batch.Positions)] = batch.Positions[i] + int32(y)
+				pos[i+len(batch.Positions)*2] = batch.Positions[i] + int32(x)
+				pos[i+len(batch.Positions)*3] = 0
+			}
+		}
+	} else {
+		copy(pos[:len(batch.Positions)], batch.Positions)
+		copy(pos[len(batch.Positions):len(batch.Positions)*2], batch.Positions)
+		copy(pos[len(batch.Positions)*2:len(batch.Positions)*3], batch.Positions)
+	}
+
+	positions, err := ctx.Input().FromIntSlice(pos, len(pos))
 	if err != nil {
 		return nil, err
 	}
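The Forward change above builds one flattened position slice with four sections so the multi-axis rotary embedding can read a separate coordinate per section: the base token position, a row offset, a column offset, and an unused zero pad. The snippet below is a standalone sketch of that layout for an assumed 4x4 grid with token positions 0..3; the program and its values are illustrative only and are not part of the change.

```go
package main

import "fmt"

func main() {
	// Assume a 4x4 image grid and token positions 0..3 for the 2x2 merged patches,
	// mirroring the loop added to Forward above.
	positions := []int32{0, 1, 2, 3}
	gridWidth, gridHeight := 4, 4

	pos := make([]int32, len(positions)*4)
	for y := 0; y < gridHeight/2; y++ {
		for x := 0; x < gridWidth/2; x++ {
			i := y*gridWidth/2 + x
			pos[i] = positions[i]                             // section 0: base position
			pos[i+len(positions)] = positions[i] + int32(y)   // section 1: row offset
			pos[i+len(positions)*2] = positions[i] + int32(x) // section 2: column offset
			pos[i+len(positions)*3] = 0                       // section 3: unused pad
		}
	}
	fmt.Println(pos) // four blocks of four values, one block per section
}
```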
@@ -7,6 +7,7 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )
 
@@ -17,6 +18,9 @@ type TextOptions struct {
 }
 
 type TextModel struct {
+	model.Base
+	model.BytePairEncoding
+
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
@@ -27,6 +31,20 @@ type TextModel struct {
 
 func NewTextModel(c fs.Config) *TextModel {
 	m := TextModel{
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+			},
+		),
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
 			ctxLen: int(c.Uint("context_length")),
@@ -47,23 +65,26 @@ func NewTextModel(c fs.Config) *TextModel {
 // SelfAttention implements the multi-head self-attention mechanism
 // with separate projections for query, key, value and output transformations
 type SelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_output"`
+	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
 }
 
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
 
+	sections := [4]int32{16, 24, 24, 0}
+
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
+	q = q.RoPEMulti(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, sections, 8, opts.ropeBase, opts.ropeScale)
 
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
+	k = k.RoPEMulti(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, sections, 8, opts.ropeBase, opts.ropeScale)
 
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -139,7 +139,7 @@ type VisionModelOptions struct {
 	ropeTheta         float32
 	spatialMergeSize  int
 	windowSize        int
-	fullAttnBlocks    []int32
+	fullAttnBlocks    []int
 	temporalPatchSize int
 }
 
@@ -235,7 +235,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid)
 	mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads)
 	// Apply encoder layers
 	for i, layer := range m.Layers {
-		if slices.Contains(m.fullAttnBlocks, int32(i)) {
+		if slices.Contains(m.fullAttnBlocks, i) {
 			hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions)
 		} else {
 			hiddenStates = layer.Forward(
@@ -383,9 +383,13 @@ func newVisionModel(c fs.Config) *VisionModel {
 			spatialMergeSize:  spatialMergeSize,
 			windowSize:        windowSize,
 			temporalPatchSize: temporalPatchSize,
-			fullAttnBlocks:    fullAttnBlocks,
 		},
 	}
 
+	for i := range fullAttnBlocks {
+		// full attention block indexes have to be converted to int for use with the slices package
+		model.fullAttnBlocks = append(model.fullAttnBlocks, int(fullAttnBlocks[i]))
+	}
+
 	return model
 }
@@ -11,6 +11,7 @@ import (
 
 // ImageProcessor contains configuration for the Qwen 2.5 VL image processing
 type ImageProcessor struct {
+	imageSize         int
 	numChannels       int
 	patchSize         int
 	temporalPatchSize int
@@ -29,16 +30,17 @@ func newImageProcessor(c fs.Config) ImageProcessor {
 	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
 
 	return ImageProcessor{
+		imageSize:         int(c.Uint("vision.image_size", 560)),
 		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
 		patchSize:         patchSize,
 		temporalPatchSize: 2,
 		mergeSize:         mergeSize,
 		minPixels:         56 * 56,
-		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
+		maxPixels:         28 * 28 * 4 * 1280,
 		factor:            patchSize * mergeSize,
 		rescaleFactor:     1.0 / 255.0,
-		imageMean:         imageproc.ClipDefaultMean[:],
-		imageStd:          imageproc.ClipDefaultSTD[:],
+		imageMean:         []float32{0.48145466, 0.4578275, 0.40821073},
+		imageStd:          []float32{0.26862954, 0.26130258, 0.27577711},
 	}
 }
 
model/models/qwen25vl/process_image_test.go (new file, 47 lines)
@@ -0,0 +1,47 @@
package qwen25vl

import (
	"image"
	_ "image/jpeg" // Register JPEG decoder
	"testing"
)

func TestSmartResize(t *testing.T) {
	type smartResizeCase struct {
		TestImage image.Image
		Expected  image.Point
	}

	// Create an image processor with default values
	processor := ImageProcessor{
		imageSize:   560, // Example value
		numChannels: 3,
		factor:      28,
		minPixels:   56 * 56,
		maxPixels:   14 * 14 * 4 * 1280,
	}

	cases := []smartResizeCase{
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
			Expected:  image.Point{980, 980},
		},
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			Expected:  image.Point{1036, 756},
		},
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
			Expected:  image.Point{980, 980},
		},
	}

	for _, c := range cases {
		b := c.TestImage.Bounds().Max
		x, y := processor.SmartResize(b.X, b.Y)
		actual := image.Point{x, y}
		if actual != c.Expected {
			t.Errorf("expected: %v, actual: %v", c.Expected, actual)
		}
	}
}

model/models/qwen2vl/imageproc.go (new file, 74 lines)
@@ -0,0 +1,74 @@
package qwen2vl

import (
	"fmt"
	"image"
	_ "image/jpeg"
	_ "image/png"
	"io"
	"math"

	"github.com/ollama/ollama/model/imageproc"
)

const (
	DefaultFactor    = 28
	DefaultMinPixels = 56 * 56
	DefaultMaxPixels = 14 * 14 * 4 * 1280
)

// smartResize calculates the size of the image to resize to based on the
// factor, minPixels, and maxPixels.
func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
	// 1. Both dimensions of size are divisible by factor
	// 2. The area of the image is between minPixels and maxPixels
	// 3. The aspect ratio of the image is as close to 1:1 as possible

	if size.Y < factor || size.X < factor {
		panic("image is too small to resize")
	} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
		panic("aspect ratio must be less than 200:1")
	}

	f := float64(factor)
	width := float64(size.X)
	height := float64(size.Y)

	xBar := math.Round(width/f) * f
	yBar := math.Round(height/f) * f

	if xBar*yBar > float64(maxPixels) {
		beta := math.Sqrt(height * width / float64(maxPixels))
		xBar = math.Floor(width/beta/f) * f
		yBar = math.Floor(height/beta/f) * f
	} else if xBar*yBar < float64(minPixels) {
		beta := math.Sqrt(float64(minPixels) / (height * width))
		xBar = math.Ceil(width*beta/f) * f
		yBar = math.Ceil(height*beta/f) * f
	}

	return image.Point{int(xBar), int(yBar)}
}

func resizeImage(img image.Image, format string, size image.Point) image.Image {
	if format == "png" {
		img = imageproc.Composite(img)
	}

	return imageproc.Resize(img, size, imageproc.ResizeBilinear)
}

func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
	img, format, err := image.Decode(imageData)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
	}

	size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
	img = resizeImage(img, format, size)

	data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)

	opts := map[string]any{}
	return data, opts, nil
}

model/models/qwen2vl/imageproc_test.go (new file, 78 lines)
@@ -0,0 +1,78 @@
package qwen2vl

import (
	"bytes"
	"image"
	"image/png"
	"testing"
)

func TestSmartResize(t *testing.T) {
	type smartResizeCase struct {
		TestImage image.Image
		Expected  image.Point
	}

	cases := []smartResizeCase{
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
			Expected:  image.Point{980, 980},
		},
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			Expected:  image.Point{1036, 756},
		},
		{
			TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
			Expected:  image.Point{980, 980},
		},
	}

	for _, c := range cases {
		b := c.TestImage.Bounds().Max
		actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
		if actual != c.Expected {
			t.Errorf("expected: %v, actual: %v", c.Expected, actual)
		}
	}
}

func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage   image.Image
		ExpectedLen int
	}

	cases := []preprocessCase{
		{
			TestImage:   image.NewRGBA(image.Rect(0, 0, 256, 256)),
			ExpectedLen: 252 * 252 * 3 * 1,
		},
		{
			TestImage:   image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
			ExpectedLen: 980 * 980 * 3 * 1,
		},
	}

	for _, c := range cases {
		var buf bytes.Buffer
		err := png.Encode(&buf, c.TestImage)
		if err != nil {
			t.Fatal(err)
		}

		imgData, _, err := Preprocess(&buf)
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		switch len(imgData) {
		case 0:
			t.Errorf("no image data returned")
		case c.ExpectedLen:
			// ok
		default:
			t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen)
		}
	}
}

@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
+	"slices"
 	"sync"
 	"time"
 
@@ -17,7 +18,8 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex
 
 	clip *llama.ClipContext
+	mllama *llama.MllamaContext
 
 	// cache of images to embeddings
 	images []imageCache
@@ -33,6 +35,8 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
 		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
+	} else if arch == "mllama" {
+		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@@ -54,9 +58,12 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
+	if c.mllama != nil {
+		c.mllama.Free()
+	}
 }
 
-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
 	if c == nil {
 		return nil, nil
 	}
@@ -72,7 +79,12 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]f
 
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.clip != nil {
+		if c.mllama != nil {
+			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+			if err != nil {
+				return nil, err
+			}
+		} else if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
@@ -93,11 +105,33 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
 		return 0
 	}
 
+	// Mllama maps an image to 1 embedding token (llava creates many tokens)
+	// and doesn't support more than a single image per request.
+	// The embeddings are large (100 MB), so allocating a big batch can fail
+	// on some systems
+	if c.mllama != nil {
+		return 1
+	}
+
 	return configuredBatchSize
 }
 
 func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
-	return llamaContext.Model().NEmbd()
+	if c != nil && c.mllama != nil {
+		return c.mllama.EmbedSize(llamaContext)
+	} else {
+		return llamaContext.Model().NEmbd()
	}
+}
+
+func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
+	if c == nil || c.mllama == nil {
+		return false
+	}
+
+	return slices.ContainsFunc(inputs, func(input input) bool {
+		return input.embed != nil
+	})
 }
 
 type imageCache struct {
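As a quick orientation on the qwen2vl image preprocessing introduced above: Preprocess decodes the image, snaps it to a factor-of-28 canvas via smartResize, and returns normalized float32 pixel data. The sketch below is a hypothetical standalone driver, not code from the change; the import path assumes the package lives at model/models/qwen2vl in the ollama module, as the new files suggest.

```go
package main

import (
	"bytes"
	"fmt"
	"image"
	"image/png"
	"log"

	"github.com/ollama/ollama/model/models/qwen2vl"
)

func main() {
	// Encode a synthetic 640x480 image to PNG so Preprocess can decode it.
	var buf bytes.Buffer
	if err := png.Encode(&buf, image.NewRGBA(image.Rect(0, 0, 640, 480))); err != nil {
		log.Fatal(err)
	}

	// Preprocess resizes to a factor-of-28 canvas and returns normalized pixel values.
	data, _, err := qwen2vl.Preprocess(&buf)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("normalized values:", len(data)) // follows the smartResize output (644x476x3 here)
}
```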
@@ -57,6 +57,10 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot
 
+	// does this sequence require cross-attention layers to be processed? - if we have seen
+	// an image for certain multi-modal models
+	crossAttention bool
+
 	// channel to send responses over
 	responses chan string
 
@@ -201,7 +205,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
 				return nil, fmt.Errorf("invalid image index: %d", n)
 			}
 
-			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
+			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
 			if err != nil {
 				return nil, err
 			}
@@ -364,6 +368,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()
 
 	var batch *llama.Batch
+	crossAttention := false
 
 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@@ -411,8 +416,9 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 					batch = tokenBatch
 				} else {
 					batch = embedBatch
+					seq.crossAttention = s.image.NeedCrossAttention(input)
 				}
-			} else if embedding != batch.IsEmbedding() {
+			} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
 				s.nextSeq = seqIdx
 				break
 			}
@@ -421,6 +427,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				break
 			}
 
+			crossAttention = seq.crossAttention
 			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
 			seq.pendingInputs = append(seq.pendingInputs, input)
 			seq.iBatch = batch.NumTokens() - 1
@@ -433,11 +440,20 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}
 
+	s.lc.SetCrossAttention(crossAttention)
+
 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}
 
+	if crossAttention {
+		// synchronize state to ensure the cross attention batch is complete.
+		// needed specifically for multi-GPU systems otherwise an inflight
+		// task may be incorrectly invalidated causing a crash
+		s.lc.Synchronize()
+	}
+
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -606,6 +622,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		}
 
+		seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
+
 		s.seqs[i] = seq
 		s.cond.Signal()
 		found = true
@@ -27,7 +26,6 @@ function checkEnv() {
         $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
     }
     # Locate CUDA versions
-    # Note: this assumes every version found will be built
     $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
     if ($cudaList.length -eq 0) {
         $d=(get-command -ea 'silentlycontinue' nvcc).path
@@ -94,19 +93,6 @@ function buildOllama() {
 
         $hashEnv = @{}
         Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
-        if ("$script:CUDA_DIRS".Contains("v11")) {
-            $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
-            $env:CUDAToolkit_ROOT=$hashEnv[$v11]
-            write-host "Building CUDA v11 backend libraries"
-            # Note: cuda v11 requires msvc 2019 so force the older generator
-            # to avoid 2022 (or newer) from being used as the default
-            & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --install build --component "CUDA" --strip
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
         if ("$script:CUDA_DIRS".Contains("v12")) {
             $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
             $env:CUDAToolkit_ROOT=$hashEnv[$v12]
@@ -10,9 +10,7 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
     --build-arg=GOFLAGS \
     --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
     --build-arg=OLLAMA_SKIP_CUDA_GENERATE \
-    --build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \
     --build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \
-    --build-arg=CUDA_V11_ARCHITECTURES \
     --build-arg=CUDA_V12_ARCHITECTURES \
    --build-arg=OLLAMA_SKIP_ROCM_GENERATE \
    --build-arg=OLLAMA_FAST_BUILD \
@@ -3,32 +3,47 @@ package server
 import (
 	"bytes"
 	"context"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"log/slog"
-	"slices"
 	"strings"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/template"
 )
 
 type tokenizeFunc func(context.Context, string) ([]int, error)
 
+var errTooManyImages = errors.New("vision model only supports a single image per message")
+
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
 func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
 	var system []api.Message
 
+	isMllama := checkMllamaModelFamily(m)
+
+	var imageNumTokens int
 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
-	// Clip images are represented as 768 tokens, each an embedding
-	imageNumTokens := 768
+	if isMllama {
+		// Our mllama implementation packs all of the embeddings into a single token
+		imageNumTokens = 1
+	} else {
+		// Clip images are represented as 768 tokens, each an embedding
+		imageNumTokens = 768
+	}
 
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
+		if isMllama && len(msgs[i].Images) > 1 {
+			return "", nil, errTooManyImages
+		}
+
 		// always include the last message
 		if i == n {
 			continue
@@ -69,17 +84,48 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	currMsgIdx := n
 
 	for cnt, msg := range msgs[currMsgIdx:] {
-		if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 {
-			return "", nil, errors.New("this model only supports one image while more than one image requested")
-		}
-
-		var prefix string
+		prefix := ""
+		imgPrompt := ""
 		prompt := msg.Content
 
 		for _, i := range msg.Images {
-			imgData := llm.ImageData{
-				ID:   len(images),
-				Data: i,
+			var imgData llm.ImageData
+
+			if isMllama {
+				if len(m.ProjectorPaths) == 0 {
+					imgData = llm.ImageData{
+						ID:   len(images),
+						Data: i,
+					}
+				} else {
+					data, opts, err := mllama.Preprocess(bytes.NewReader(i))
+					if err != nil {
+						return "", nil, err
+					}
+
+					buf := new(bytes.Buffer)
+					err = binary.Write(buf, binary.LittleEndian, data)
+					if err != nil {
+						return "", nil, err
+					}
+
+					ar, ok := opts["aspectRatioIndex"].(int)
+					if !ok {
+						return "", nil, fmt.Errorf("missing aspect ratio for image")
+					}
+
+					imgData = llm.ImageData{
+						ID:            len(images),
+						Data:          buf.Bytes(),
+						AspectRatioID: ar,
+					}
+				}
+				imgPrompt = "<|image|>"
+			} else {
+				imgData = llm.ImageData{
+					ID:   len(images),
+					Data: i,
+				}
 			}
 
 			imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
@@ -91,7 +137,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 
 			images = append(images, imgData)
 		}
-		msgs[currMsgIdx+cnt].Content = prefix + prompt
+		msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt
 	}
 
 	// truncate any messages that do not fit into the context window
@@ -102,3 +148,12 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 
 	return b.String(), images, nil
 }
+
+func checkMllamaModelFamily(m *Model) bool {
+	for _, arch := range m.Config.ModelFamilies {
+		if arch == "mllama" {
+			return true
+		}
+	}
+	return false
+}
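In the mllama branch added above, the preprocessed float32 pixel values are serialized into the ImageData payload with binary.Write before the aspect ratio ID is attached. The sketch below only illustrates that little-endian float32 round trip with made-up values; it is not code from the change.

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"log"
)

func main() {
	// Hypothetical pixel values; the real data comes from mllama.Preprocess.
	data := []float32{0.5, -1.25, 3.0}

	// Pack the float32 slice into bytes, as chatPrompt does above.
	buf := new(bytes.Buffer)
	if err := binary.Write(buf, binary.LittleEndian, data); err != nil {
		log.Fatal(err)
	}

	// A consumer can decode the same bytes back into float32 values.
	decoded := make([]float32, len(data))
	if err := binary.Read(buf, binary.LittleEndian, decoded); err != nil {
		log.Fatal(err)
	}
	fmt.Println(decoded) // [0.5 -1.25 3]
}
```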
@@ -2,6 +2,8 @@ package server
 
 import (
 	"bytes"
+	"image"
+	"image/png"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -12,9 +14,10 @@ import (
 
 func TestChatPrompt(t *testing.T) {
 	type expect struct {
 		prompt string
 		images [][]byte
-		error  error
+		aspectRatioID int
+		error         error
 	}
 
 	tmpl, err := template.Parse(`
@@ -25,6 +28,28 @@ func TestChatPrompt(t *testing.T) {
 		t.Fatal(err)
 	}
 	visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
+	mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
+
+	createImg := func(width, height int) ([]byte, error) {
+		img := image.NewRGBA(image.Rect(0, 0, width, height))
+		var buf bytes.Buffer
+
+		if err := png.Encode(&buf, img); err != nil {
+			return nil, err
+		}
+
+		return buf.Bytes(), nil
+	}
+
+	imgBuf, err := createImg(5, 5)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	imgBuf2, err := createImg(6, 6)
+	if err != nil {
+		t.Fatal(err)
+	}
+
 	cases := []struct {
 		name string
@@ -202,6 +227,90 @@ func TestChatPrompt(t *testing.T) {
 				images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")},
 			},
 		},
+		{
+			name:  "messages with mllama (no images)",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
+			},
+			expect: expect{
+				prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
+			},
+		},
+		{
+			name:  "messages with mllama single prompt",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
+			},
+			expect: expect{
+				prompt:        "[img-0]<|image|>How many hotdogs are in this image? ",
+				images:        [][]byte{imgBuf},
+				aspectRatioID: 1,
+			},
+		},
+		{
+			name:  "messages with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
+			},
+			expect: expect{
+				prompt:        "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
+				images:        [][]byte{imgBuf},
+				aspectRatioID: 1,
+			},
+		},
+		{
+			name:  "multiple messages with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
+			},
+			expect: expect{
+				prompt:        "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
+				images:        [][]byte{imgBuf, imgBuf2},
+				aspectRatioID: 1,
+			},
+		},
+		{
+			name:  "earlier image with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
+				{Role: "assistant", Content: "There are four hotdogs."},
+				{Role: "user", Content: "Which ones have mustard?"},
+			},
+			expect: expect{
+				prompt:        "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
+				images:        [][]byte{imgBuf},
+				aspectRatioID: 1,
+			},
+		},
+		{
+			name:  "too many images with mllama",
+			model: mllamaModel,
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}},
+			},
+			expect: expect{
+				error: errTooManyImages,
+			},
+		},
 	}
 
 	for _, tt := range cases {
@@ -232,6 +341,10 @@ func TestChatPrompt(t *testing.T) {
 				if !bytes.Equal(images[i].Data, tt.images[i]) {
 					t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
 				}
+			} else {
+				if images[i].AspectRatioID != tt.aspectRatioID {
+					t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID)
+				}
 			}
 		}
 	})
@@ -70,7 +70,23 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 			newType = fsggml.TensorTypeQ6_K
 		}
 	} else if strings.Contains(name, "attn_v.weight") {
-		if (ftype == fsggml.FileTypeQ4_K_M) &&
+		if ftype == fsggml.FileTypeQ2_K {
+			if kv.GQA() >= 4 {
+				newType = fsggml.TensorTypeQ4_K
+			} else {
+				newType = fsggml.TensorTypeQ3_K
+			}
+		} else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 {
+			newType = fsggml.TensorTypeQ4_K
+		} else if ftype == fsggml.FileTypeQ3_K_M {
+			if qs.iAttnV < 2 {
+				newType = fsggml.TensorTypeQ5_K
+			} else {
+				newType = fsggml.TensorTypeQ4_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ5_K
+		} else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) &&
 			useMoreBits(qs.iAttnV, qs.nAttnV) {
 			newType = fsggml.TensorTypeQ6_K
 		} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
@@ -98,23 +114,54 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 	} else if strings.Contains(name, "ffn_down") {
 		iLayer := qs.iFfnDown
 		n_layer := qs.nFfnDown
-		if ftype == fsggml.FileTypeQ4_K_M {
+		if ftype == fsggml.FileTypeQ2_K {
+			newType = fsggml.TensorTypeQ3_K
+		} else if ftype == fsggml.FileTypeQ2_K_S {
+			if iLayer < n_layer/8 {
+				newType = fsggml.TensorTypeQ4_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_M {
+			if iLayer < n_layer/16 {
+				newType = fsggml.TensorTypeQ5_K
+			} else if useMoreBits(iLayer, n_layer) {
+				newType = fsggml.TensorTypeQ4_K
+			} else {
+				newType = fsggml.TensorTypeQ3_K
+			}
+		} else if ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ5_K
+		} else if ftype == fsggml.FileTypeQ4_K_M {
 			if useMoreBits(iLayer, n_layer) {
 				newType = fsggml.TensorTypeQ6_K
 			}
+		} else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) {
+			newType = fsggml.TensorTypeQ6_K
 		} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
 			newType = fsggml.TensorTypeQ5_K
 		}
 		qs.iFfnDown++
 	} else if strings.Contains(name, "attn_output.weight") {
 		if nExperts == 8 {
-			if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
+			if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M ||
+				ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
+				newType = fsggml.TensorTypeQ5_K
+			}
+		} else {
+			if ftype == fsggml.FileTypeQ2_K {
+				newType = fsggml.TensorTypeQ3_K
+			} else if ftype == fsggml.FileTypeQ3_K_M {
+				newType = fsggml.TensorTypeQ4_K
+			} else if ftype == fsggml.FileTypeQ3_K_L {
 				newType = fsggml.TensorTypeQ5_K
 			}
 		}
 	} else if strings.Contains(name, "attn_qkv.weight") {
-		if ftype == fsggml.FileTypeQ4_K_M {
+		if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L {
+			newType = fsggml.TensorTypeQ4_K
+		} else if ftype == fsggml.FileTypeQ4_K_M {
 			newType = fsggml.TensorTypeQ5_K
+		} else if ftype == fsggml.FileTypeQ5_K_M {
+			newType = fsggml.TensorTypeQ6_K
 		}
 	}
 
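The new branches above repeatedly defer to the useMoreBits helper to decide which layers get a wider quantization type. Its implementation is not part of this diff; the sketch below mirrors the upstream llama.cpp heuristic (first eighth, last eighth, and every third layer in between) and is only an assumption about how the Ollama helper behaves.

```go
package main

import "fmt"

// useMoreBits is a sketch of the layer-selection heuristic referenced above:
// spend extra bits on the first eighth of layers, the last eighth, and every
// third layer in between. This mirrors llama.cpp's use_more_bits and may
// differ from the actual helper in the Ollama quantizer.
func useMoreBits(iLayer, nLayers int) bool {
	return iLayer < nLayers/8 || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}

func main() {
	for i := 0; i < 16; i++ {
		fmt.Printf("layer %2d -> more bits: %v\n", i, useMoreBits(i, 16))
	}
}
```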
||||||
|
@ -42,6 +42,71 @@ func TestGetTensorNewType(t *testing.T) {
|
|||||||
ftype: fsggml.FileTypeF32,
|
ftype: fsggml.FileTypeF32,
|
||||||
expected: fsggml.TensorTypeQ6_K,
|
expected: fsggml.TensorTypeQ6_K,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q4_k",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.attention.head_count": uint32(4),
|
||||||
|
"foo.attention.head_count_kv": uint32(1),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ3_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q2_k_s_q4_k",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.attention.head_count": uint32(4),
|
||||||
|
"foo.attention.head_count_kv": uint32(1),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K_S,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_m",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_m_i",
|
||||||
|
qs: quantizeState{
|
||||||
|
iAttnV: 2,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_l",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_L,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "attn_v.weight_q4_k_m",
|
name: "attn_v.weight_q4_k_m",
|
||||||
qs: quantizeState{
|
qs: quantizeState{
|
||||||
@@ -91,6 +156,88 @@ func TestGetTensorNewType(t *testing.T) {
             ftype: fsggml.FileTypeF32,
             expected: fsggml.TensorTypeQ8_0,
         },
+        {
+            name: "ffn_down_q2_k",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K,
+            expected: fsggml.TensorTypeQ3_K,
+        },
+        {
+            name: "ffn_down_q2_k_s",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K_S,
+            expected: fsggml.TensorTypeQ4_0,
+        },
+        {
+            name: "ffn_down_q2_k_s_layers",
+            qs: quantizeState{
+                iFfnDown: 2,
+                nFfnDown: 3 * 8,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K_S,
+            expected: fsggml.TensorTypeQ4_K,
+        },
+        {
+            name: "ffn_down_q3_k_m_base",
+            qs: quantizeState{
+                iFfnDown: 1,
+                nFfnDown: 8,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ3_K,
+        },
+        {
+            name: "ffn_down_q3_k_m_16",
+            qs: quantizeState{
+                iFfnDown: 2,
+                nFfnDown: 3 * 16,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ5_K,
+        },
+        {
+            name: "ffn_down_q3_k_m_8",
+            qs: quantizeState{
+                iFfnDown: 2,
+                nFfnDown: 3 * 8,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ4_K,
+        },
+        {
+            name: "ffn_down_q3_k_l",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_L,
+            expected: fsggml.TensorTypeQ5_K,
+        },
         {
             name: "ffn_down_q4_k_m",
             qs: quantizeState{
@@ -117,6 +264,19 @@ func TestGetTensorNewType(t *testing.T) {
             ftype: fsggml.FileTypeQ4_K_M,
             expected: fsggml.TensorTypeQ6_K,
         },
+        {
+            name: "ffn_down_q5_k_m",
+            qs: quantizeState{
+                iFfnDown: 2,
+                nFfnDown: 3 * 8,
+            },
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "ffn_down",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ5_K_M,
+            expected: fsggml.TensorTypeQ6_K,
+        },
         {
             name: "ffn_down_q4_k_s",
             qs: quantizeState{
@@ -130,6 +290,59 @@ func TestGetTensorNewType(t *testing.T) {
             ftype: fsggml.FileTypeQ4_K_S,
             expected: fsggml.TensorTypeQ5_K,
         },
+        {
+            name: "attn_output.weight_8_expert",
+            qs: quantizeState{},
+            kv: map[string]any{
+                "general.architecture": "foo",
+                "foo.expert_count": uint32(8),
+            },
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_output.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K,
+            expected: fsggml.TensorTypeQ5_K,
+        },
+        {
+            name: "attn_output.weight_q2",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_output.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ2_K,
+            expected: fsggml.TensorTypeQ3_K,
+        },
+        {
+            name: "attn_output.weight_q3_k_m",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_output.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ4_K,
+        },
+        {
+            name: "attn_output.weight_q3_k_l",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_output.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_L,
+            expected: fsggml.TensorTypeQ5_K,
+        },
+        {
+            name: "attn_qkv.weight_q3_k_m",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_qkv.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ3_K_M,
+            expected: fsggml.TensorTypeQ4_K,
+        },
         {
             name: "attn_qkv.weight_q4_k_m",
             qs: quantizeState{},
@@ -140,6 +353,16 @@ func TestGetTensorNewType(t *testing.T) {
             ftype: fsggml.FileTypeQ4_K_M,
             expected: fsggml.TensorTypeQ5_K,
         },
+        {
+            name: "attn_qkv.weight_q5_k_m",
+            qs: quantizeState{},
+            kv: map[string]any{},
+            newType: fsggml.TensorTypeQ4_0,
+            tensor_name: "blk.0.attn_qkv.weight",
+            shape: []uint64{256},
+            ftype: fsggml.FileTypeQ5_K_M,
+            expected: fsggml.TensorTypeQ6_K,
+        },
     }
     for _, tt := range cases {
         t.Run(tt.name, func(t *testing.T) {
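The new entries above all populate the same table-driven case value; the struct declaration itself sits outside these hunks, so the following is only a sketch inferred from the field names used in the diff, not the actual definition.

// Inferred shape of one TestGetTensorNewType case; field names are taken
// from the entries above, the concrete types are assumptions.
type tensorNewTypeCase struct {
    name        string            // test case label
    qs          quantizeState     // per-layer quantization counters (iAttnV, iFfnDown, nFfnDown, ...)
    kv          map[string]any    // model metadata such as head counts and expert_count
    newType     fsggml.TensorType // candidate type proposed for the tensor
    tensor_name string            // e.g. "blk.0.attn_v.weight" or "ffn_down"
    shape       []uint64          // tensor dimensions
    ftype       fsggml.FileType   // requested file-level quantization
    expected    fsggml.TensorType // type the function under test should return
}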
@@ -4,10 +4,10 @@ import (
     "bytes"
     "cmp"
     "context"
+    "encoding/binary"
     "encoding/json"
     "errors"
     "fmt"
-    "image"
     "io"
     "io/fs"
     "log/slog"
@ -25,7 +25,6 @@ import (
|
|||||||
|
|
||||||
"github.com/gin-contrib/cors"
|
"github.com/gin-contrib/cors"
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"golang.org/x/image/webp"
|
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
@@ -34,6 +33,7 @@ import (
     "github.com/ollama/ollama/fs/ggml"
     "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/logutil"
+    "github.com/ollama/ollama/model/models/mllama"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/server/internal/client/ollama"
     "github.com/ollama/ollama/server/internal/registry"
@@ -98,10 +98,6 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
         return nil, nil, nil, err
     }

-    if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 {
-        return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
-    }
-
     if err := model.CheckCapabilities(caps...); err != nil {
         return nil, nil, nil, fmt.Errorf("%s %w", name, err)
     }
@@ -208,14 +204,38 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         return
     }

-    if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
-        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
+    isMllama := checkMllamaModelFamily(m)
+    if isMllama && len(req.Images) > 1 {
+        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"})
         return
     }

     images := make([]llm.ImageData, len(req.Images))
     for i := range req.Images {
-        images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
+        if isMllama && len(m.ProjectorPaths) > 0 {
+            data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
+            if err != nil {
+                c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+                return
+            }
+
+            ar, ok := opts["aspectRatioIndex"].(int)
+            if !ok {
+                c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+                return
+            }
+
+            buf := new(bytes.Buffer)
+            err = binary.Write(buf, binary.LittleEndian, data)
+            if err != nil {
+                c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
+                return
+            }
+
+            images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
+        } else {
+            images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
+        }
     }

     prompt := req.Prompt
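The preprocessing branch above serializes the tensor returned by mllama.Preprocess into little-endian bytes before attaching it to llm.ImageData. A self-contained sketch of that encoding step follows; the element type produced by Preprocess is not visible in this hunk, so float32 is an assumption used only for illustration.

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
)

// encodeLittleEndian packs a slice of fixed-size values into little-endian
// bytes, mirroring the binary.Write call in the handler above.
func encodeLittleEndian(data []float32) ([]byte, error) {
    buf := new(bytes.Buffer)
    if err := binary.Write(buf, binary.LittleEndian, data); err != nil {
        return nil, err
    }
    return buf.Bytes(), nil
}

func main() {
    b, err := encodeLittleEndian([]float32{0.25, 0.5, 0.75})
    if err != nil {
        panic(err)
    }
    fmt.Println(len(b)) // 12: three float32 values, four bytes each
}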
@@ -247,6 +267,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {

         for _, i := range images {
             imgPrompt := ""
+            if isMllama {
+                imgPrompt = "<|image|>"
+            }
             msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
         }

@@ -1281,10 +1304,6 @@ func Serve(ln net.Listener) error {

     s.sched.Run(schedCtx)

-    // register the experimental webp decoder
-    // so webp images can be used in multimodal inputs
-    image.RegisterFormat("webp", "RIFF????WEBP", webp.Decode, webp.DecodeConfig)
-
     // At startup we retrieve GPU information so we can get log messages before loading a model
     // This will log warnings to the log in case we have problems with detected GPUs
     gpus := discover.GetGPUInfo()
@@ -8,7 +8,6 @@ import (
    "os"
    "reflect"
    "runtime"
-   "slices"
    "sort"
    "strconv"
    "strings"
@@ -133,11 +132,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
                continue
            }
            numParallel := int(envconfig.NumParallel())
-           // `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
-           // ref: https://github.com/ollama/ollama/issues/4165
-           if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
+           // TODO (jmorganca): mllama doesn't support parallel yet
+           // see https://github.com/ollama/ollama/issues/4165
+           if checkMllamaModelFamily(pending.model) && numParallel != 1 {
                numParallel = 1
-               slog.Warn("mllama does not currently support parallel requests")
+               slog.Warn("mllama doesn't support parallel requests yet")
            }

            for {
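Both the GenerateHandler and processPending hunks call checkMllamaModelFamily, whose definition is not part of the diff shown here. Given the slices.Contains checks it replaces, a plausible implementation would look like the sketch below; this is an assumption, not the committed helper.

// Hypothetical sketch of the helper referenced above; it simply reports
// whether any declared model family is "mllama".
func checkMllamaModelFamily(m *Model) bool {
    for _, f := range m.Config.ModelFamilies {
        if f == "mllama" {
            return true
        }
    }
    return false
}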