Compare commits: main...mxyng/qwen (1 commit)

Commit: 1546bc4767
.github/workflows/release.yaml (vendored, 6 changes)

@@ -103,11 +103,6 @@ jobs:
arch: [amd64]
preset: ['CPU']
include:
- os: windows
arch: amd64
preset: 'CUDA 11'
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
cuda-version: '11.3'
- os: windows
arch: amd64
preset: 'CUDA 12'
@@ -324,7 +319,6 @@ jobs:
case "$COMPONENT" in
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
.github/workflows/test.yaml (vendored, 6 changes)

@@ -46,7 +46,7 @@ jobs:
include:
- preset: CPU
- preset: CUDA
container: nvidia/cuda:11.8.0-devel-ubuntu22.04
container: nvidia/cuda:12.8.1-devel-ubuntu22.04
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,7 +78,7 @@ jobs:
include:
- preset: CPU
- preset: CUDA
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
- preset: ROCm
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
@@ -102,7 +102,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
}

$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
@@ -19,8 +19,8 @@ linters:
- nolintlint
- nosprintfhostport
- staticcheck
- tenv
- unconvert
- usetesting
- wastedassign
- whitespace
disable:
@@ -17,14 +17,6 @@
"name": "CUDA",
"inherits": [ "Default" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
@@ -78,11 +70,6 @@
"configurePreset": "CUDA",
"targets": [ "ggml-cuda" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 11"
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
Dockerfile (17 changes)

@@ -7,14 +7,10 @@ ARG JETPACK5VERSION=r35.4.1
ARG JETPACK6VERSION=r36.4.0
ARG CMAKEVERSION=3.31.2

# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
RUN yum install -y yum-utils \
&& yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
&& rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
&& dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
&& dnf install -y ccache \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

FROM --platform=linux/arm64 almalinux:8 AS base-arm64
# install epel-release for ccache
@@ -38,15 +34,6 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --build --parallel --preset 'CPU' \
&& cmake --install build --component CPU --strip --parallel 8

FROM base AS cuda-11
ARG CUDA11VERSION=11.3
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' \
&& cmake --build --parallel --preset 'CUDA 11' \
&& cmake --install build --component CUDA --strip --parallel 8

FROM base AS cuda-12
ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -98,11 +85,9 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .

FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12

FROM --platform=linux/arm64 scratch AS arm64
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggerganov/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618
FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5

.PHONY: help
help:
@@ -15,13 +15,11 @@ help:
@echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"

.PHONY: sync
sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml

llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@

ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
go generate ./$(@D)
.PHONY: llama/build-info.cpp
llama/build-info.cpp: llama/build-info.cpp.in
sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@

.PHONY: llama/llama.cpp
llama/llama.cpp: llama/vendor/
@@ -32,13 +30,12 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
rsync -arvzc -f "merge $@/.rsync-filter" $< $@

PATCHES=$(wildcard llama/patches/*.patch)
PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))

.PHONY: apply-patches
.NOTPARALLEL:
apply-patches: $(PATCHED)
apply-patches: $(addsuffix ed, $(PATCHES))

llama/patches/.%.patched: llama/patches/%.patch
%.patched: %.patch
@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi

.PHONY: checkout
@@ -60,4 +57,4 @@ format-patches: llama/patches

.PHONE: clean
clean: checkout
$(RM) llama/patches/.*.patched
$(RM) $(addsuffix ed, $(PATCHES))
@@ -315,7 +315,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
- [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics)
- [ojira](https://github.com/AliAhmedNada/ojira) (Jira chrome plugin to easily generate descriptions for tasks)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
@@ -527,7 +526,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
- [Ollama for D](https://github.com/kassane/ollama-d)
- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)

### Mobile

@@ -584,7 +582,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
- [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)

### Supported backends
@@ -1,6 +1,7 @@
package api

import (
"context"
"encoding/json"
"fmt"
"net/http"
@@ -136,7 +137,7 @@ func TestClientStream(t *testing.T) {
client := NewClient(&url.URL{Scheme: "http", Host: ts.Listener.Addr().String()}, http.DefaultClient)

var receivedChunks []ChatResponse
err := client.stream(t.Context(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
err := client.stream(context.Background(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
var resp ChatResponse
if err := json.Unmarshal(chunk, &resp); err != nil {
return fmt.Errorf("failed to unmarshal chunk: %w", err)
@@ -222,7 +223,7 @@ func TestClientDo(t *testing.T) {
ID string `json:"id"`
Success bool `json:"success"`
}
err := client.do(t.Context(), http.MethodPost, "/v1/messages", nil, &resp)
err := client.do(context.Background(), http.MethodPost, "/v1/messages", nil, &resp)

if tc.wantErr != "" {
if err == nil {
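The two sides of these hunks differ only in how the test obtains its context: one calls context.Background(), the other the testing package's t.Context(), which Go 1.24 added and which is canceled automatically just before the test finishes. A minimal sketch of the latter, for reference (the test name is illustrative, not from this diff):

```go
package api

import "testing"

func TestContextSketch(t *testing.T) {
	// t.Context() (Go 1.24+) is canceled just before the test ends,
	// so requests started with it cannot outlive the test.
	ctx := t.Context()

	select {
	case <-ctx.Done():
		t.Fatal("context should still be live while the test runs")
	default:
	}
}
```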
@@ -271,6 +271,9 @@ type Options struct {
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
PresencePenalty float32 `json:"presence_penalty,omitempty"`
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
Mirostat int `json:"mirostat,omitempty"`
MirostatTau float32 `json:"mirostat_tau,omitempty"`
MirostatEta float32 `json:"mirostat_eta,omitempty"`
Stop []string `json:"stop,omitempty"`
}

@@ -645,6 +648,9 @@ func DefaultOptions() Options {
RepeatPenalty: 1.1,
PresencePenalty: 0.0,
FrequencyPenalty: 0.0,
Mirostat: 0,
MirostatTau: 5.0,
MirostatEta: 0.1,
Seed: -1,

Runner: Runner{
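For orientation, a hedged sketch of how these options reach the server from a client: per-request options are passed as a map whose keys mirror the JSON tags above, with DefaultOptions supplying the fallback values shown in the second hunk. The client helpers (ClientFromEnvironment, Generate) and the model name are assumptions for illustration, not part of this diff:

```go
package main

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama3", // illustrative model name
		Prompt: "Why is the sky blue?",
		// Keys mirror the json tags on Options; values override DefaultOptions.
		Options: map[string]any{
			"repeat_penalty":    1.1,
			"presence_penalty":  0.0,
			"frequency_penalty": 0.0,
			"mirostat":          0,
			"mirostat_tau":      5.0,
			"mirostat_eta":      0.1,
			"stop":              []string{"\n", "user:"},
		},
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		panic(err)
	}
}
```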
@ -4,14 +4,20 @@ import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
func InitLogging() {
|
||||
level := slog.LevelInfo
|
||||
|
||||
if envconfig.Debug() {
|
||||
level = slog.LevelDebug
|
||||
}
|
||||
|
||||
var logFile *os.File
|
||||
var err error
|
||||
// Detect if we're a GUI app on windows, and if not, send logs to console
|
||||
@ -27,8 +33,20 @@ func InitLogging() {
|
||||
return
|
||||
}
|
||||
}
|
||||
handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
|
||||
Level: level,
|
||||
AddSource: true,
|
||||
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
|
||||
if attr.Key == slog.SourceKey {
|
||||
source := attr.Value.Any().(*slog.Source)
|
||||
source.File = filepath.Base(source.File)
|
||||
}
|
||||
return attr
|
||||
},
|
||||
})
|
||||
|
||||
slog.SetDefault(slog.New(handler))
|
||||
|
||||
slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel()))
|
||||
slog.Info("ollama app started")
|
||||
}
|
||||
|
||||
|
@ -78,7 +78,7 @@ func BenchmarkColdStart(b *testing.B) {
|
||||
|
||||
for _, tt := range tests {
|
||||
b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
|
||||
ctx := b.Context()
|
||||
ctx := context.Background()
|
||||
|
||||
// Set number of tokens as our throughput metric
|
||||
b.SetBytes(int64(tt.maxTokens))
|
||||
@ -113,7 +113,7 @@ func BenchmarkWarmStart(b *testing.B) {
|
||||
|
||||
for _, tt := range tests {
|
||||
b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
|
||||
ctx := b.Context()
|
||||
ctx := context.Background()
|
||||
|
||||
// Pre-warm the model
|
||||
warmup(client, m, tt.prompt, b)
|
||||
@ -140,7 +140,7 @@ func setup(b *testing.B) *api.Client {
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
|
||||
if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil {
|
||||
b.Fatalf("Model unavailable: %v", err)
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@ package cmd
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
@ -336,7 +337,7 @@ func TestDeleteHandler(t *testing.T) {
|
||||
t.Cleanup(mockServer.Close)
|
||||
|
||||
cmd := &cobra.Command{}
|
||||
cmd.SetContext(t.Context())
|
||||
cmd.SetContext(context.TODO())
|
||||
if err := DeleteHandler(cmd, []string{"test-model"}); err != nil {
|
||||
t.Fatalf("DeleteHandler failed: %v", err)
|
||||
}
|
||||
@ -398,6 +399,11 @@ func TestGetModelfileName(t *testing.T) {
|
||||
var expectedFilename string
|
||||
|
||||
if tt.fileExists {
|
||||
tempDir, err := os.MkdirTemp("", "modelfiledir")
|
||||
defer os.RemoveAll(tempDir)
|
||||
if err != nil {
|
||||
t.Fatalf("temp modelfile dir creation failed: %v", err)
|
||||
}
|
||||
var fn string
|
||||
if tt.modelfileName != "" {
|
||||
fn = tt.modelfileName
|
||||
@ -405,7 +411,7 @@ func TestGetModelfileName(t *testing.T) {
|
||||
fn = "Modelfile"
|
||||
}
|
||||
|
||||
tempFile, err := os.CreateTemp(t.TempDir(), fn)
|
||||
tempFile, err := os.CreateTemp(tempDir, fn)
|
||||
if err != nil {
|
||||
t.Fatalf("temp modelfile creation failed: %v", err)
|
||||
}
|
||||
@ -524,7 +530,7 @@ func TestPushHandler(t *testing.T) {
|
||||
|
||||
cmd := &cobra.Command{}
|
||||
cmd.Flags().Bool("insecure", false, "")
|
||||
cmd.SetContext(t.Context())
|
||||
cmd.SetContext(context.TODO())
|
||||
|
||||
// Redirect stderr to capture progress output
|
||||
oldStderr := os.Stderr
|
||||
@ -629,7 +635,7 @@ func TestListHandler(t *testing.T) {
|
||||
t.Setenv("OLLAMA_HOST", mockServer.URL)
|
||||
|
||||
cmd := &cobra.Command{}
|
||||
cmd.SetContext(t.Context())
|
||||
cmd.SetContext(context.TODO())
|
||||
|
||||
// Capture stdout
|
||||
oldStdout := os.Stdout
|
||||
@ -724,7 +730,7 @@ func TestCreateHandler(t *testing.T) {
|
||||
}))
|
||||
t.Setenv("OLLAMA_HOST", mockServer.URL)
|
||||
t.Cleanup(mockServer.Close)
|
||||
tempFile, err := os.CreateTemp(t.TempDir(), "modelfile")
|
||||
tempFile, err := os.CreateTemp("", "modelfile")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@ -744,7 +750,7 @@ func TestCreateHandler(t *testing.T) {
|
||||
}
|
||||
|
||||
cmd.Flags().Bool("insecure", false, "")
|
||||
cmd.SetContext(t.Context())
|
||||
cmd.SetContext(context.TODO())
|
||||
|
||||
// Redirect stderr to capture progress output
|
||||
oldStderr := os.Stderr
|
||||
|
@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
|
||||
|
||||
if opts.MultiModal {
|
||||
fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
|
||||
fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
|
||||
}
|
||||
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
@ -511,7 +511,7 @@ func extractFileNames(input string) []string {
|
||||
// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
|
||||
// and followed by more characters and a file extension
|
||||
// This will capture non filename strings, but we'll check for file existence to remove mismatches
|
||||
regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
|
||||
regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
|
||||
re := regexp.MustCompile(regexPattern)
|
||||
|
||||
return re.FindAllString(input, -1)
|
||||
@ -531,8 +531,6 @@ func extractFileData(input string) (string, []api.ImageData, error) {
|
||||
return "", imgs, err
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
|
||||
input = strings.ReplaceAll(input, "'"+nfp+"'", "")
|
||||
input = strings.ReplaceAll(input, "'"+fp+"'", "")
|
||||
input = strings.ReplaceAll(input, fp, "")
|
||||
imgs = append(imgs, data)
|
||||
}
|
||||
@ -553,7 +551,7 @@ func getImageData(filePath string) ([]byte, error) {
|
||||
}
|
||||
|
||||
contentType := http.DetectContentType(buf)
|
||||
allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
|
||||
allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
|
||||
if !slices.Contains(allowedTypes, contentType) {
|
||||
return nil, fmt.Errorf("invalid image type: %s", contentType)
|
||||
}
|
||||
|
@ -1,8 +1,6 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
@ -12,17 +10,14 @@ func TestExtractFilenames(t *testing.T) {
|
||||
// Unix style paths
|
||||
input := ` some preamble
|
||||
./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
|
||||
/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
|
||||
/unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
|
||||
/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
|
||||
res := extractFileNames(input)
|
||||
assert.Len(t, res, 7)
|
||||
assert.Len(t, res, 5)
|
||||
assert.Contains(t, res[0], "one.png")
|
||||
assert.Contains(t, res[1], "two.jpg")
|
||||
assert.Contains(t, res[2], "three.jpeg")
|
||||
assert.Contains(t, res[3], "four.png")
|
||||
assert.Contains(t, res[4], "five.JPG")
|
||||
assert.Contains(t, res[5], "six.webp")
|
||||
assert.Contains(t, res[6], "seven.WEBP")
|
||||
assert.NotContains(t, res[4], '"')
|
||||
assert.NotContains(t, res, "inbetween1")
|
||||
assert.NotContains(t, res, "./1.svg")
|
||||
@ -33,12 +28,10 @@ func TestExtractFilenames(t *testing.T) {
|
||||
/absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
|
||||
./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
|
||||
d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
|
||||
d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
|
||||
c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
|
||||
d:\path with\spaces\thirteen.WEBP some ending
|
||||
d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
|
||||
`
|
||||
res = extractFileNames(input)
|
||||
assert.Len(t, res, 13)
|
||||
assert.Len(t, res, 10)
|
||||
assert.NotContains(t, res, "inbetween2")
|
||||
assert.Contains(t, res[0], "one.png")
|
||||
assert.Contains(t, res[0], "c:")
|
||||
@ -56,31 +49,4 @@ d:\path with\spaces\thirteen.WEBP some ending
|
||||
assert.Contains(t, res[8], "d:")
|
||||
assert.Contains(t, res[9], "ten.PNG")
|
||||
assert.Contains(t, res[9], "E:")
|
||||
assert.Contains(t, res[10], "eleven.webp")
|
||||
assert.Contains(t, res[10], "c:")
|
||||
assert.Contains(t, res[11], "twelve.WebP")
|
||||
assert.Contains(t, res[11], "c:")
|
||||
assert.Contains(t, res[12], "thirteen.WEBP")
|
||||
assert.Contains(t, res[12], "d:")
|
||||
}
|
||||
|
||||
// Ensure that file paths wrapped in single quotes are removed with the quotes.
|
||||
func TestExtractFileDataRemovesQuotedFilepath(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
fp := filepath.Join(dir, "img.jpg")
|
||||
data := make([]byte, 600)
|
||||
copy(data, []byte{
|
||||
0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 'J', 'F', 'I', 'F',
|
||||
0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xd9,
|
||||
})
|
||||
if err := os.WriteFile(fp, data, 0o600); err != nil {
|
||||
t.Fatalf("failed to write test image: %v", err)
|
||||
}
|
||||
|
||||
input := "before '" + fp + "' after"
|
||||
cleaned, imgs, err := extractFileData(input)
|
||||
assert.NoError(t, err)
|
||||
assert.Len(t, imgs, 1)
|
||||
assert.Equal(t, cleaned, "before after")
|
||||
}
|
||||
|
@ -1,7 +1,6 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@ -15,12 +14,13 @@ import (
|
||||
)
|
||||
|
||||
type ModelParameters struct {
|
||||
Architectures []string `json:"architectures"`
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
Architectures []string `json:"architectures"`
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
TextModel TextParameters `json:"text_config"`
|
||||
}
|
||||
|
||||
TextModel struct {
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
} `json:"text_config"`
|
||||
type TextParameters struct {
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
}
|
||||
|
||||
type AdapterParameters struct {
|
||||
@ -173,8 +173,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
switch p.Architectures[0] {
|
||||
case "LlamaForCausalLM":
|
||||
conv = &llamaModel{}
|
||||
case "MllamaForConditionalGeneration":
|
||||
conv = &mllamaModel{}
|
||||
case "Llama4ForConditionalGeneration":
|
||||
conv = &llama4Model{}
|
||||
case "Mistral3ForConditionalGeneration":
|
||||
@ -191,8 +189,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
conv = &phi3Model{}
|
||||
case "Qwen2ForCausalLM":
|
||||
conv = &qwen2Model{}
|
||||
case "Qwen2_5_VLForConditionalGeneration":
|
||||
conv = &qwen25VLModel{}
|
||||
case "BertModel":
|
||||
conv = &bertModel{}
|
||||
case "CohereForCausalLM":
|
||||
@ -216,22 +212,24 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
return err
|
||||
}
|
||||
|
||||
vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
|
||||
vocabSize := int(p.VocabSize)
|
||||
if vocabSize == 0 {
|
||||
tVocabSize := int(p.TextModel.VocabSize)
|
||||
vocabSize = tVocabSize
|
||||
}
|
||||
|
||||
switch {
|
||||
case vocabSize == 0:
|
||||
slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
|
||||
slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
|
||||
case vocabSize > len(t.Vocabulary.Tokens):
|
||||
slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
|
||||
slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
|
||||
for i := range vocabSize - len(t.Vocabulary.Tokens) {
|
||||
t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
|
||||
t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
|
||||
t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
|
||||
}
|
||||
case vocabSize < len(t.Vocabulary.Tokens):
|
||||
slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
|
||||
p.VocabSize = uint32(len(t.Vocabulary.Tokens))
|
||||
p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
|
||||
return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
|
||||
default:
|
||||
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
|
||||
}
|
||||
|
@ -1,160 +0,0 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
)
|
||||
|
||||
type mllamaModel struct {
|
||||
ModelParameters
|
||||
TextModel struct {
|
||||
llamaModel
|
||||
|
||||
CrossAttentionLayers []int32 `json:"cross_attention_layers"`
|
||||
} `json:"text_config"`
|
||||
VisionModel struct {
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
NumGlobalLayers uint32 `json:"num_global_layers"`
|
||||
IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`
|
||||
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
|
||||
AttentionHeads uint32 `json:"attention_heads"`
|
||||
|
||||
ImageSize uint32 `json:"image_size"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
NumChannels uint32 `json:"num_channels"`
|
||||
MaxNumTiles uint32 `json:"max_num_tiles"`
|
||||
NormEpsilon float32 `json:"norm_eps"`
|
||||
RopeTheta float32 `json:"rope.freq_base"`
|
||||
} `json:"vision_config"`
|
||||
}
|
||||
|
||||
func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := m.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "mllama"
|
||||
|
||||
for k, v := range m.TextModel.KV(t) {
|
||||
if strings.HasPrefix(k, "llama.") {
|
||||
kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
|
||||
}
|
||||
}
|
||||
|
||||
kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers
|
||||
|
||||
kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
|
||||
kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
|
||||
kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices
|
||||
|
||||
kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
|
||||
kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize
|
||||
|
||||
kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
|
||||
kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon
|
||||
|
||||
kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
|
||||
kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize
|
||||
kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
|
||||
kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (m *mllamaModel) Replacements() []string {
|
||||
return append(
|
||||
m.TextModel.Replacements(),
|
||||
"language_model.", "",
|
||||
"gate_attn", "attn_gate",
|
||||
"gate_ffn", "ffn_gate",
|
||||
"cross_attn.", "cross_attn_",
|
||||
"vision_model", "v",
|
||||
"class_embedding", "class_embd",
|
||||
"patch_embedding", "patch_embd",
|
||||
"gated_positional_embedding.tile_embedding", "tile_position_embd",
|
||||
"gated_positional_embedding.embedding", "position_embd.weight",
|
||||
"gated_positional_embedding", "position_embd",
|
||||
"embedding.weight", "weight",
|
||||
"pre_tile_positional_embedding", "pre_tile_position_embd",
|
||||
"post_tile_positional_embedding", "post_tile_position_embd",
|
||||
"layernorm_pre", "pre_ln",
|
||||
"layernorm_post", "post_ln",
|
||||
"global_transformer.layers", "global.blk",
|
||||
"transformer.layers", "blk",
|
||||
"mlp.fc1", "ffn_up",
|
||||
"mlp.fc2", "ffn_down",
|
||||
"multi_modal_projector", "mm.0",
|
||||
)
|
||||
}
|
||||
|
||||
func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
var text []Tensor
|
||||
for _, t := range ts {
|
||||
if t.Name() == "v.position_embd.gate" {
|
||||
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
|
||||
tt := t.Clone()
|
||||
tt.SetRepacker(m.repack(name))
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: name,
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: tt,
|
||||
})
|
||||
}
|
||||
} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
||||
t.SetRepacker(m.repack(t.Name()))
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
} else {
|
||||
text = append(text, t)
|
||||
}
|
||||
}
|
||||
|
||||
return append(out, m.TextModel.Tensors(text)...)
|
||||
}
|
||||
|
||||
func (m *mllamaModel) repack(name string) Repacker {
|
||||
return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
|
||||
dims := make([]int, len(shape))
|
||||
for i, dim := range shape {
|
||||
dims[i] = int(dim)
|
||||
}
|
||||
|
||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
|
||||
t, err = tensor.Tanh(t)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if name == "v.position_embd.gate" {
|
||||
t, err = tensor.Sub(float32(1), t)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
t = tensor.Materialize(t)
|
||||
// flatten tensor so it can be return as a vector
|
||||
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return native.VectorF32(t.(*tensor.Dense))
|
||||
}
|
||||
}
|
@ -15,7 +15,6 @@ type qwen2Model struct {
|
||||
Type string `json:"type"`
|
||||
Factor ropeFactor `json:"factor"`
|
||||
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
MropeSection []int32 `json:"mrope_section"`
|
||||
} `json:"rope_scaling"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
}
|
||||
@ -40,8 +39,6 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
|
||||
case "yarn":
|
||||
kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
|
||||
kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
|
||||
case "mrope", "default":
|
||||
kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
|
||||
default:
|
||||
panic("unknown rope scaling type")
|
||||
}
|
||||
|
@ -1,102 +0,0 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type qwen25VLModel struct {
|
||||
qwen2Model
|
||||
|
||||
VisionModel struct {
|
||||
Depth uint32 `json:"depth"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
NumHeads uint32 `json:"num_heads"`
|
||||
InChannels uint32 `json:"in_chans"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
SpatialMergeSize uint32 `json:"spatial_merge_size"`
|
||||
SpatialPatchSize uint32 `json:"spatial_patch_size"`
|
||||
WindowSize uint32 `json:"window_size"`
|
||||
RMSNormEps float32 `json:"layer_norm_epsilon"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
|
||||
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
||||
} `json:"vision_config"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*qwen25VLModel)(nil)
|
||||
|
||||
func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := q.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "qwen25vl"
|
||||
|
||||
for k, v := range q.qwen2Model.KV(t) {
|
||||
if strings.HasPrefix(k, "qwen2.") {
|
||||
kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
|
||||
}
|
||||
}
|
||||
|
||||
if q.VisionModel.FullAttentionBlocks == nil {
|
||||
kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
|
||||
}
|
||||
|
||||
kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
|
||||
kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
|
||||
kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
|
||||
kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
|
||||
kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
|
||||
kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
|
||||
kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
|
||||
kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
|
||||
kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
|
||||
kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
|
||||
kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
|
||||
kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
for _, t := range ts {
|
||||
if strings.Contains(t.Name(), "patch_embed.proj") {
|
||||
for t := range splitDim(t, 2,
|
||||
strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
|
||||
strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
|
||||
) {
|
||||
t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
|
||||
out = append(out, t)
|
||||
}
|
||||
} else if strings.Contains(t.Name(), "attn.qkv") {
|
||||
out = append(out, slices.Collect(splitDim(t, 0,
|
||||
strings.NewReplacer("attn.qkv", "attn_q"),
|
||||
strings.NewReplacer("attn.qkv", "attn_k"),
|
||||
strings.NewReplacer("attn.qkv", "attn_v"),
|
||||
))...)
|
||||
} else {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (p *qwen25VLModel) Replacements() []string {
|
||||
return append(
|
||||
p.qwen2Model.Replacements(),
|
||||
"visual", "v",
|
||||
"blocks", "blk",
|
||||
"attn.proj", "attn_out",
|
||||
"norm1", "ln1",
|
||||
"norm2", "ln2",
|
||||
)
|
||||
}
|
convert/fs.go (new file, 58 lines)

@@ -0,0 +1,58 @@
package convert

import (
    "archive/zip"
    "errors"
    "io"
    "io/fs"
    "os"
    "path/filepath"
)

type ZipReader struct {
    r *zip.Reader
    p string

    // limit is the maximum size of a file that can be read directly
    // from the zip archive. Files larger than this size will be extracted
    limit int64
}

func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS {
    return &ZipReader{r, p, limit}
}

func (z *ZipReader) Open(name string) (fs.File, error) {
    r, err := z.r.Open(name)
    if err != nil {
        return nil, err
    }
    defer r.Close()

    if fi, err := r.Stat(); err != nil {
        return nil, err
    } else if fi.Size() < z.limit {
        return r, nil
    }

    if !filepath.IsLocal(name) {
        return nil, zip.ErrInsecurePath
    }

    n := filepath.Join(z.p, name)
    if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) {
        w, err := os.Create(n)
        if err != nil {
            return nil, err
        }
        defer w.Close()

        if _, err := io.Copy(w, r); err != nil {
            return nil, err
        }
    } else if err != nil {
        return nil, err
    }

    return os.Open(n)
}
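A hypothetical usage sketch for this helper, tying it to the ConvertModel(fsys fs.FS, f *os.File) entry point that appears earlier in this diff; the archive path, temp directory, and 32 MiB threshold are illustrative assumptions, not values from the change:

```go
package main

import (
	"archive/zip"
	"os"
	"path/filepath"

	"github.com/ollama/ollama/convert"
)

func main() {
	f, err := os.Open("model.zip") // illustrative archive path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		panic(err)
	}

	zr, err := zip.NewReader(f, fi.Size())
	if err != nil {
		panic(err)
	}

	tmp, err := os.MkdirTemp("", "convert")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(tmp)

	// Entries under the 32 MiB limit are served straight from the archive;
	// larger ones are extracted into tmp before being opened.
	fsys := convert.NewZipReader(zr, tmp, 32<<20)

	out, err := os.Create(filepath.Join(tmp, "model.gguf"))
	if err != nil {
		panic(err)
	}
	defer out.Close()

	if err := convert.ConvertModel(fsys, out); err != nil {
		panic(err)
	}
}
```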
@ -38,10 +38,7 @@ const (
|
||||
func (t tensorBase) Kind() uint32 {
|
||||
if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
|
||||
t.name == "token_types.weight" ||
|
||||
t.name == "v.positional_embedding_vlm" ||
|
||||
t.name == "v.tile_position_embd.weight" ||
|
||||
t.name == "v.pre_tile_position_embd.weight" ||
|
||||
t.name == "v.post_tile_position_embd.weight" {
|
||||
t.name == "v.positional_embedding_vlm" {
|
||||
// these tensors are always F32
|
||||
return 0
|
||||
}
|
||||
|
@ -1,56 +0,0 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"iter"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
)
|
||||
|
||||
// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
|
||||
// is split evenly based on the number of replacers provided.
|
||||
func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
|
||||
return func(yield func(*ggml.Tensor) bool) {
|
||||
for i, replacer := range replacers {
|
||||
shape := slices.Clone(t.Shape())
|
||||
shape[dim] = shape[dim] / uint64(len(replacers))
|
||||
|
||||
slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
|
||||
slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
|
||||
|
||||
tt := t.Clone()
|
||||
tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := make([]int, len(shape))
|
||||
for i := range shape {
|
||||
dims[i] = int(shape[i])
|
||||
}
|
||||
|
||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
t, err := t.Slice(slice...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
t = tensor.Materialize(t)
|
||||
// flatten tensor so it can be written as a vector
|
||||
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return native.VectorF32(t.(*tensor.Dense))
|
||||
})
|
||||
|
||||
if !yield(&ggml.Tensor{
|
||||
Name: replacer.Replace(t.Name()),
|
||||
Kind: t.Kind(),
|
||||
Shape: shape,
|
||||
WriterTo: tt,
|
||||
}) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -3,6 +3,7 @@
|
||||
package discover
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"regexp"
|
||||
@ -59,6 +60,8 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
|
||||
|
||||
// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
|
||||
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
|
||||
// The detected driver is older than Feb 2023
|
||||
slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
|
||||
return "v11"
|
||||
}
|
||||
return "v12"
|
||||
|
@ -670,7 +670,7 @@ func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, e
|
||||
}
|
||||
|
||||
func getVerboseState() C.uint16_t {
|
||||
if envconfig.LogLevel() < slog.LevelInfo {
|
||||
if envconfig.Debug() {
|
||||
return C.uint16_t(1)
|
||||
}
|
||||
return C.uint16_t(0)
|
||||
|
@ -12,7 +12,7 @@ import (
|
||||
// '../lib/ollama' on Linux and the executable's directory on macOS
|
||||
// note: distribution builds, additional GPU-specific libraries are
|
||||
// found in subdirectories of the returned path, such as
|
||||
// 'cuda_v11', 'cuda_v12', 'rocm', etc.
|
||||
// 'cuda_v12', 'rocm', etc.
|
||||
var LibOllamaPath string = func() string {
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
|
docs/api.md (70 changes)

@@ -19,7 +19,7 @@

### Model names

Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

### Durations
@ -394,6 +394,9 @@ curl http://localhost:11434/api/generate -d '{
|
||||
"repeat_penalty": 1.2,
|
||||
"presence_penalty": 1.5,
|
||||
"frequency_penalty": 1.0,
|
||||
"mirostat": 1,
|
||||
"mirostat_tau": 0.8,
|
||||
"mirostat_eta": 0.6,
|
||||
"penalize_newline": true,
|
||||
"stop": ["\n", "user:"],
|
||||
"numa": false,
|
||||
@@ -952,8 +955,19 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo

| Type | Recommended |
| --- | :-: |
| q2_K | |
| q3_K_L | |
| q3_K_M | |
| q3_K_S | |
| q4_0 | |
| q4_1 | |
| q4_K_M | * |
| q4_K_S | |
| q5_0 | |
| q5_1 | |
| q5_K_M | |
| q5_K_S | |
| q6_K | |
| q8_0 | * |

### Examples
|
||||
@ -998,8 +1012,8 @@ Quantize a non-quantized model.
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/create -d '{
|
||||
"model": "llama3.2:quantized",
|
||||
"from": "llama3.2:3b-instruct-fp16",
|
||||
"model": "llama3.1:quantized",
|
||||
"from": "llama3.1:8b-instruct-fp16",
|
||||
"quantize": "q4_K_M"
|
||||
}'
|
||||
```
|
||||
@ -1009,14 +1023,12 @@ curl http://localhost:11434/api/create -d '{
|
||||
A stream of JSON objects is returned:
|
||||
|
||||
```json
|
||||
{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
|
||||
{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
|
||||
{"status":"verifying conversion"}
|
||||
{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
|
||||
{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
|
||||
{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
|
||||
{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
|
||||
{"status":"quantizing F16 model to Q4_K_M"}
|
||||
{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
|
||||
{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
|
||||
{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
|
||||
{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
|
||||
{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
|
||||
{"status":"writing manifest"}
|
||||
{"status":"success"}
|
||||
```
|
||||
@ -1154,37 +1166,29 @@ A single JSON object will be returned.
|
||||
{
|
||||
"models": [
|
||||
{
|
||||
"name": "deepseek-r1:latest",
|
||||
"model": "deepseek-r1:latest",
|
||||
"modified_at": "2025-05-10T08:06:48.639712648-07:00",
|
||||
"size": 4683075271,
|
||||
"digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
|
||||
"name": "codellama:13b",
|
||||
"modified_at": "2023-11-04T14:56:49.277302595-07:00",
|
||||
"size": 7365960935,
|
||||
"digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
|
||||
"details": {
|
||||
"parent_model": "",
|
||||
"format": "gguf",
|
||||
"family": "qwen2",
|
||||
"families": [
|
||||
"qwen2"
|
||||
],
|
||||
"parameter_size": "7.6B",
|
||||
"quantization_level": "Q4_K_M"
|
||||
"family": "llama",
|
||||
"families": null,
|
||||
"parameter_size": "13B",
|
||||
"quantization_level": "Q4_0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "llama3.2:latest",
|
||||
"model": "llama3.2:latest",
|
||||
"modified_at": "2025-05-04T17:37:44.706015396-07:00",
|
||||
"size": 2019393189,
|
||||
"digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
|
||||
"name": "llama3:latest",
|
||||
"modified_at": "2023-12-07T09:32:18.757212583-08:00",
|
||||
"size": 3825819519,
|
||||
"digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
|
||||
"details": {
|
||||
"parent_model": "",
|
||||
"format": "gguf",
|
||||
"family": "llama",
|
||||
"families": [
|
||||
"llama"
|
||||
],
|
||||
"parameter_size": "3.2B",
|
||||
"quantization_level": "Q4_K_M"
|
||||
"families": null,
|
||||
"parameter_size": "7B",
|
||||
"quantization_level": "Q4_0"
|
||||
}
|
||||
}
|
||||
]
|
||||
|
@@ -1,6 +1,6 @@
# GPU
## Nvidia
Ollama supports Nvidia GPUs with compute capability 5.0+.
Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.

Check your compute compatibility to see if your card is supported:
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)

@@ -150,6 +150,9 @@ PARAMETER <parameter> <parametervalue>

| Parameter | Description | Value Type | Example Usage |
| --- | --- | --- | --- |
| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 |
| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 |
| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |

@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
In the server log, you will see a message that looks something like this (varies from release to release):

```
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
```

**Experimental LLM Library Override**
@ -149,22 +149,9 @@ func Bool(k string) func() bool {
|
||||
}
|
||||
}
|
||||
|
||||
// LogLevel returns the log level for the application.
|
||||
// Values are 0 or false INFO (Default), 1 or true DEBUG, 2 TRACE
|
||||
func LogLevel() slog.Level {
|
||||
level := slog.LevelInfo
|
||||
if s := Var("OLLAMA_DEBUG"); s != "" {
|
||||
if b, _ := strconv.ParseBool(s); b {
|
||||
level = slog.LevelDebug
|
||||
} else if i, _ := strconv.ParseInt(s, 10, 64); i != 0 {
|
||||
level = slog.Level(i * -4)
|
||||
}
|
||||
}
|
||||
|
||||
return level
|
||||
}
|
||||
|
||||
var (
|
||||
// Debug enabled additional debug information.
|
||||
Debug = Bool("OLLAMA_DEBUG")
|
||||
// FlashAttention enables the experimental flash attention feature.
|
||||
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
|
||||
// KvCacheType is the quantization type for the K/V cache.
|
||||
@ -222,6 +209,8 @@ var (
|
||||
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
|
||||
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
|
||||
MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
|
||||
// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
|
||||
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
|
||||
)
|
||||
|
||||
func Uint64(key string, defaultValue uint64) func() uint64 {
|
||||
@ -249,7 +238,7 @@ type EnvVar struct {
|
||||
|
||||
func AsMap() map[string]EnvVar {
|
||||
ret := map[string]EnvVar{
|
||||
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
||||
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
||||
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
||||
"OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
|
||||
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
|
||||
|
@ -1,13 +1,11 @@
|
||||
package envconfig
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"math"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
func TestHost(t *testing.T) {
|
||||
@ -294,34 +292,3 @@ func TestContextLength(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogLevel(t *testing.T) {
|
||||
cases := map[string]slog.Level{
|
||||
// Default to INFO
|
||||
"": slog.LevelInfo,
|
||||
"false": slog.LevelInfo,
|
||||
"f": slog.LevelInfo,
|
||||
"0": slog.LevelInfo,
|
||||
|
||||
// True values enable Debug
|
||||
"true": slog.LevelDebug,
|
||||
"t": slog.LevelDebug,
|
||||
|
||||
// Positive values increase verbosity
|
||||
"1": slog.LevelDebug,
|
||||
"2": logutil.LevelTrace,
|
||||
|
||||
// Negative values decrease verbosity
|
||||
"-1": slog.LevelWarn,
|
||||
"-2": slog.LevelError,
|
||||
}
|
||||
|
||||
for k, v := range cases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_DEBUG", k)
|
||||
if i := LogLevel(); i != v {
|
||||
t.Errorf("%s: expected %d, got %d", k, v, i)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,6 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"math"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
@ -126,8 +125,6 @@ func (kv KV) OllamaEngineRequired() bool {
|
||||
"gemma3",
|
||||
"mistral3",
|
||||
"llama4",
|
||||
"mllama",
|
||||
"qwen25vl",
|
||||
}, kv.Architecture())
|
||||
}
|
||||
|
||||
@ -651,29 +648,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
||||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||
embeddingLength*patchSize +
|
||||
numPatches*numPatches*headCount)
|
||||
case "qwen25vl":
|
||||
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
||||
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
|
||||
temporalPatchSize := uint64(2)
|
||||
|
||||
// Calculate max possible patches based on max_pixels
|
||||
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
|
||||
maxWidth := maxPixels / maxHeight
|
||||
maxGridHeight := maxHeight / patchSize
|
||||
maxGridWidth := maxWidth / patchSize
|
||||
// Account for merged patches (2x2 grid)
|
||||
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
|
||||
|
||||
// Calculate graph size based on typical operations in ProcessImage and createPatches
|
||||
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
||||
// Normalized pixels
|
||||
maxPixels*numChannels +
|
||||
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
|
||||
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
|
||||
// Self-attention calculations (similar to other architectures)
|
||||
numPatches*numPatches*headCount +
|
||||
// Additional buffer for processing
|
||||
embeddingLength*numPatches)
|
||||
case "llama4":
|
||||
// vision graph is computed independently in the same schedule
|
||||
// and is negligible compared to the worst case text graph
|
||||
|
fs/ggml/type.go (125 changes)
@ -12,42 +12,42 @@ type FileType uint32
|
||||
const (
|
||||
FileTypeF32 FileType = iota
|
||||
FileTypeF16
|
||||
fileTypeQ4_0
|
||||
fileTypeQ4_1
|
||||
FileTypeQ4_0
|
||||
FileTypeQ4_1
|
||||
fileTypeQ4_1_F16 // unused by GGML
|
||||
fileTypeQ4_2 // unused by GGML
|
||||
fileTypeQ4_3 // unused by GGML
|
||||
FileTypeQ8_0
|
||||
fileTypeQ5_0
|
||||
fileTypeQ5_1
|
||||
fileTypeQ2_K
|
||||
fileTypeQ3_K_S
|
||||
fileTypeQ3_K_M
|
||||
fileTypeQ3_K_L
|
||||
FileTypeQ5_0
|
||||
FileTypeQ5_1
|
||||
FileTypeQ2_K
|
||||
FileTypeQ3_K_S
|
||||
FileTypeQ3_K_M
|
||||
FileTypeQ3_K_L
|
||||
FileTypeQ4_K_S
|
||||
FileTypeQ4_K_M
|
||||
fileTypeQ5_K_S
|
||||
fileTypeQ5_K_M
|
||||
fileTypeQ6_K
|
||||
fileTypeIQ2_XXS
|
||||
fileTypeIQ2_XS
|
||||
fileTypeQ2_K_S
|
||||
fileTypeIQ3_XS
|
||||
fileTypeIQ3_XXS
|
||||
fileTypeIQ1_S
|
||||
fileTypeIQ4_NL
|
||||
fileTypeIQ3_S
|
||||
fileTypeIQ3_M
|
||||
fileTypeIQ2_S
|
||||
fileTypeIQ2_M
|
||||
fileTypeIQ4_XS
|
||||
fileTypeIQ1_M
|
||||
FileTypeQ5_K_S
|
||||
FileTypeQ5_K_M
|
||||
FileTypeQ6_K
|
||||
fileTypeIQ2_XXS // not supported by ollama
|
||||
fileTypeIQ2_XS // not supported by ollama
|
||||
FileTypeQ2_K_S
|
||||
fileTypeIQ3_XS // not supported by ollama
|
||||
fileTypeIQ3_XXS // not supported by ollama
|
||||
fileTypeIQ1_S // not supported by ollama
|
||||
fileTypeIQ4_NL // not supported by ollama
|
||||
fileTypeIQ3_S // not supported by ollama
|
||||
fileTypeIQ3_M // not supported by ollama
|
||||
fileTypeIQ2_S // not supported by ollama
|
||||
fileTypeIQ2_M // not supported by ollama
|
||||
fileTypeIQ4_XS // not supported by ollama
|
||||
fileTypeIQ1_M // not supported by ollama
|
||||
FileTypeBF16
|
||||
fileTypeQ4_0_4_4 // unused by GGML
|
||||
fileTypeQ4_0_4_8 // unused by GGML
|
||||
fileTypeQ4_0_8_8 // unused by GGML
|
||||
fileTypeTQ1_0
|
||||
fileTypeTQ2_0
|
||||
fileTypeTQ1_0 // not supported by ollama
|
||||
fileTypeTQ2_0 // not supported by ollama
|
||||
|
||||
FileTypeUnknown = 1024
|
||||
)
|
||||
@ -60,12 +60,36 @@ func ParseFileType(s string) (FileType, error) {
|
||||
return FileTypeF32, nil
|
||||
case "F16":
|
||||
return FileTypeF16, nil
|
||||
case "Q4_0":
|
||||
return FileTypeQ4_0, nil
|
||||
case "Q4_1":
|
||||
return FileTypeQ4_1, nil
|
||||
case "Q8_0":
|
||||
return FileTypeQ8_0, nil
|
||||
case "Q5_0":
|
||||
return FileTypeQ5_0, nil
|
||||
case "Q5_1":
|
||||
return FileTypeQ5_1, nil
|
||||
case "Q2_K":
|
||||
return FileTypeQ2_K, nil
|
||||
case "Q3_K_S":
|
||||
return FileTypeQ3_K_S, nil
|
||||
case "Q3_K_M":
|
||||
return FileTypeQ3_K_M, nil
|
||||
case "Q3_K_L":
|
||||
return FileTypeQ3_K_L, nil
|
||||
case "Q4_K_S":
|
||||
return FileTypeQ4_K_S, nil
|
||||
case "Q4_K_M", "Q4_K":
|
||||
return FileTypeQ4_K_M, nil
|
||||
case "Q5_K_S":
|
||||
return FileTypeQ5_K_S, nil
|
||||
case "Q5_K_M", "Q5_K":
|
||||
return FileTypeQ5_K_M, nil
|
||||
case "Q6_K":
|
||||
return FileTypeQ6_K, nil
|
||||
case "Q2_K_S":
|
||||
return FileTypeQ2_K_S, nil
|
||||
case "BF16":
|
||||
return FileTypeBF16, nil
|
||||
default:
|
||||
@ -87,41 +111,40 @@ func ParseFileType(s string) (FileType, error) {
|
||||
}
|
||||
|
||||
func (t FileType) String() string {
// Note: this routine will return a broader set of file types for existing models
switch t {
case FileTypeF32:
return "F32"
case FileTypeF16:
return "F16"
case fileTypeQ4_0:
case FileTypeQ4_0:
return "Q4_0"
case fileTypeQ4_1:
case FileTypeQ4_1:
return "Q4_1"
case FileTypeQ8_0:
return "Q8_0"
case fileTypeQ5_0:
case FileTypeQ5_0:
return "Q5_0"
case fileTypeQ5_1:
case FileTypeQ5_1:
return "Q5_1"
case fileTypeQ2_K:
case FileTypeQ2_K:
return "Q2_K"
case fileTypeQ3_K_S:
case FileTypeQ3_K_S:
return "Q3_K_S"
case fileTypeQ3_K_M:
case FileTypeQ3_K_M:
return "Q3_K_M"
case fileTypeQ3_K_L:
case FileTypeQ3_K_L:
return "Q3_K_L"
case FileTypeQ4_K_S:
return "Q4_K_S"
case FileTypeQ4_K_M:
return "Q4_K_M"
case fileTypeQ5_K_S:
case FileTypeQ5_K_S:
return "Q5_K_S"
case fileTypeQ5_K_M:
case FileTypeQ5_K_M:
return "Q5_K_M"
case fileTypeQ6_K:
case FileTypeQ6_K:
return "Q6_K"
case fileTypeQ2_K_S:
case FileTypeQ2_K_S:
return "Q2_K_S"
case FileTypeBF16:
return "BF16"
@ -140,35 +163,35 @@ func (ftype FileType) ToTensorType() TensorType {
return TensorTypeF32
case FileTypeF16:
return TensorTypeF16
case fileTypeQ4_0:
case FileTypeQ4_0:
return TensorTypeQ4_0
case fileTypeQ4_1:
case FileTypeQ4_1:
return TensorTypeQ4_1
case FileTypeQ8_0:
return TensorTypeQ8_0
case fileTypeQ5_0:
case FileTypeQ5_0:
return TensorTypeQ5_0
case fileTypeQ5_1:
case FileTypeQ5_1:
return TensorTypeQ5_1
case fileTypeQ2_K:
case FileTypeQ2_K:
return TensorTypeQ2_K
case fileTypeQ3_K_S:
case FileTypeQ3_K_S:
return TensorTypeQ3_K
case fileTypeQ3_K_M:
case FileTypeQ3_K_M:
return TensorTypeQ3_K
case fileTypeQ3_K_L:
case FileTypeQ3_K_L:
return TensorTypeQ3_K
case FileTypeQ4_K_S:
return TensorTypeQ4_K
case FileTypeQ4_K_M:
return TensorTypeQ4_K
case fileTypeQ5_K_S:
case FileTypeQ5_K_S:
return TensorTypeQ5_K
case fileTypeQ5_K_M:
case FileTypeQ5_K_M:
return TensorTypeQ5_K
case fileTypeQ6_K:
case FileTypeQ6_K:
return TensorTypeQ6_K
case fileTypeQ2_K_S:
case FileTypeQ2_K_S:
return TensorTypeQ2_K
case FileTypeBF16:
return TensorTypeBF16
2
llama/build-info.cpp
generated
vendored
@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";

@ -10,11 +10,11 @@ include common/stb_image.*
include include/
include include/llama.*
include include/llama-*.*
include tools/
include tools/mtmd/
include tools/mtmd/clip.*
include tools/mtmd/clip-impl.*
include tools/mtmd/llava.*
include examples/
include examples/llava/
include examples/llava/clip.*
include examples/llava/clip-impl.*
include examples/llava/llava.*
include src/
include src/llama.*
include src/llama-*.*

19
llama/llama.cpp/common/common.cpp
vendored
@ -1096,6 +1096,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
@ -1113,7 +1114,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;

if (params.reranking) {
cparams.embeddings = true;
@ -1565,20 +1565,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

return result;
}

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
const int64_t ne_datapoint = llama_n_ctx(ctx);
const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
ggml_opt_dataset_t result = ggml_opt_dataset_init(
GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;

for (int64_t idata = 0; idata < ndata; ++idata) {
memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
}

return result;
}
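For reference, the common_opt_dataset_init helper shown in the hunk above slices a flat token stream into training windows: each datapoint is ne_datapoint tokens, and its labels are the same window shifted right by one token. Below is a minimal standalone C++ sketch of that windowing using plain vectors instead of ggml_opt datasets; it is an illustration only, and slice_windows is a hypothetical name, not part of this tree.

    #include <cstdint>
    #include <vector>

    // Mirror the indexing used by common_opt_dataset_init: ndata windows of
    // ne_datapoint tokens, advancing by `stride`, with labels offset by one.
    static void slice_windows(const std::vector<int32_t> & tokens,
                              int64_t ne_datapoint, int64_t stride,
                              std::vector<std::vector<int32_t>> & inputs,
                              std::vector<std::vector<int32_t>> & labels) {
        const int64_t ndata = ((int64_t) tokens.size() - ne_datapoint - 1) / stride;
        for (int64_t idata = 0; idata < ndata; ++idata) {
            const int64_t off = idata * stride;
            inputs.emplace_back(tokens.begin() + off,     tokens.begin() + off + ne_datapoint);
            labels.emplace_back(tokens.begin() + off + 1, tokens.begin() + off + 1 + ne_datapoint);
        }
    }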
18
llama/llama.cpp/common/common.h
vendored
@ -66,6 +66,7 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_MAIN,
|
||||
LLAMA_EXAMPLE_INFILL,
|
||||
LLAMA_EXAMPLE_EMBEDDING,
|
||||
LLAMA_EXAMPLE_PERPLEXITY,
|
||||
LLAMA_EXAMPLE_RETRIEVAL,
|
||||
@ -95,7 +96,6 @@ enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
@ -161,7 +161,6 @@ struct common_params_sampling {
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||
COMMON_SAMPLER_TYPE_DRY,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||
COMMON_SAMPLER_TYPE_TOP_P,
|
||||
@ -324,6 +323,7 @@ struct common_params {
|
||||
bool ctx_shift = true; // context shift on inifinite text generation
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
@ -332,7 +332,6 @@ struct common_params {
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
bool no_op_offload = false; // globally disable offload host tensor operations to device
|
||||
|
||||
bool single_turn = false; // single turn chat conversation
|
||||
|
||||
@ -341,7 +340,7 @@ struct common_params {
|
||||
|
||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
||||
|
||||
// multimodal models (see tools/mtmd)
|
||||
// multimodal models (see examples/llava)
|
||||
struct common_params_model mmproj;
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
@ -410,14 +409,13 @@ struct common_params {
|
||||
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||
|
||||
// cvector-generator params
|
||||
int n_pca_batch = 100;
|
||||
int n_pca_iterations = 1000;
|
||||
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||
std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
|
||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
|
||||
@ -666,9 +664,3 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// training utils
|
||||
//
|
||||
|
||||
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
||||
|
107
llama/llama.cpp/common/sampling.cpp
vendored
@ -1,7 +1,6 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
@ -230,48 +229,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
params.logit_bias.data()));
|
||||
|
||||
if (params.mirostat == 0) {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
if (params.top_n_sigma >= 0) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
} else {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
@ -473,7 +475,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||
@ -489,7 +490,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||
@ -504,7 +504,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
||||
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
@ -518,7 +517,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
||||
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
@ -535,16 +533,14 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
auto sampler = sampler_canonical_name_map.find(name);
|
||||
if (sampler != sampler_canonical_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
}
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
} else {
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
|
||||
}
|
||||
|
||||
return samplers;
|
||||
@ -556,7 +552,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||
@ -571,8 +566,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
const auto sampler = sampler_name_map.find(c);
|
||||
if (sampler != sampler_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
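The restructured block above adds a fast path: when params.top_n_sigma >= 0 the chain becomes top_k, then temp, then top_n_sigma, bypassing the per-sampler switch. A minimal sketch of building that reduced chain directly with the llama.cpp sampler API from this vendored tree follows; make_top_n_sigma_chain is a hypothetical helper, not code from the diff.

    #include "llama.h"

    // Build the reduced chain used by the top_n_sigma branch, then finish with
    // the usual dist sampler so tokens can actually be drawn.
    static llama_sampler * make_top_n_sigma_chain(int32_t top_k, float temp,
                                                  float top_n_sigma, uint32_t seed) {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(top_k));
        llama_sampler_chain_add(chain, llama_sampler_init_temp(temp));
        llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(top_n_sigma));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));
        return chain;
    }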
@ -31,7 +31,9 @@
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
|
||||
|
||||
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
|
||||
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
@ -53,16 +55,12 @@
|
||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
||||
#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
|
||||
#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
|
||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
|
||||
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
|
||||
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
@ -70,14 +68,10 @@
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
#define TN_MM_INP_NORM "mm.input_norm.weight"
|
||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
|
||||
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
||||
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
|
||||
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
|
||||
|
||||
// mimicpmv
|
||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||
@ -94,9 +88,6 @@
|
||||
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
||||
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
||||
|
||||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
@ -109,7 +100,6 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
PROJECTOR_TYPE_QWEN25VL,
|
||||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@ -124,7 +114,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
@ -239,15 +228,6 @@ struct clip_image_u8_batch {
|
||||
|
||||
struct clip_image_f32_batch {
|
||||
std::vector<clip_image_f32_ptr> entries;
|
||||
|
||||
clip_image_f32_batch clone() const {
|
||||
clip_image_f32_batch new_batch;
|
||||
new_batch.entries.reserve(entries.size());
|
||||
for (const auto & entry : entries) {
|
||||
new_batch.entries.emplace_back(new clip_image_f32(*entry));
|
||||
}
|
||||
return new_batch;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
File diff suppressed because it is too large
@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
|
||||
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
|
||||
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
|
||||
|
||||
CLIP_API struct clip_image_size * clip_image_size_init(void);
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init (void);
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init(void);
|
||||
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
|
||||
CLIP_API struct clip_image_size * clip_image_size_init();
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
|
||||
|
||||
// nx, ny are the output image dimensions
|
||||
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
|
@ -2,7 +2,6 @@
|
||||
#include "llava.h"
|
||||
|
||||
#include "llama.h"
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cerrno>
|
||||
@ -210,11 +209,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
|
||||
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
|
||||
ggml_build_forward_expand(gf, flatten);
|
||||
|
||||
ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
|
||||
GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend");
|
||||
ggml_backend_graph_compute(backend.get(), gf);
|
||||
|
||||
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
|
||||
struct ggml_tensor* result = ggml_graph_node(gf, -1);
|
||||
|
||||
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
||||
@ -462,7 +457,7 @@ struct llava_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@ -474,6 +469,7 @@ struct llava_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*n_embd =*/ n_embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@ -497,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
n_eval = n_batch;
|
||||
}
|
||||
float * embd = image_embed->embed+i*n_embd;
|
||||
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
|
||||
llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
|
||||
if (llama_decode(ctx_llama, llava_batch.batch)) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
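The hunk above threads the embedding width through llava_embd_batch instead of reading it from the context. A minimal sketch of the chunked image-embedding decode built on the calls visible in this file is shown below; eval_image_embed_sketch is a hypothetical name, and the types come from llava.h/llava.cpp in this tree.

    // Feed image embeddings in n_batch-sized chunks, passing n_embd explicitly
    // to the updated llava_embd_batch constructor as llava_eval_image_embed does.
    static bool eval_image_embed_sketch(llama_context * ctx, const llava_image_embed * image_embed,
                                        int n_embd, int n_batch, int * n_past) {
        for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
            int n_eval = image_embed->n_image_pos - i;
            if (n_eval > n_batch) {
                n_eval = n_batch;
            }
            float * embd = image_embed->embed + i * n_embd;
            llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
            if (llama_decode(ctx, llava_batch.batch)) {
                return false; // decode failed
            }
            *n_past += n_eval;
        }
        return true;
    }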
@ -1,4 +1,4 @@
|
||||
package mtmd
|
||||
package llava
|
||||
|
||||
// #cgo CXXFLAGS: -std=c++11
|
||||
// #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common
|
67
llama/llama.cpp/include/llama.h
vendored
@ -4,7 +4,6 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-opt.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
@ -113,7 +112,6 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||
};
|
||||
|
||||
enum llama_rope_type {
|
||||
@ -258,6 +256,7 @@ extern "C" {
|
||||
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
int32_t n_embd;
|
||||
llama_pos * pos;
|
||||
int32_t * n_seq_id;
|
||||
llama_seq_id ** seq_id;
|
||||
@ -353,18 +352,20 @@ extern "C" {
|
||||
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
|
||||
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
|
||||
|
||||
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
|
||||
// TODO: move at the end of the struct
|
||||
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
bool no_perf; // whether to measure performance timings
|
||||
bool cross_attn; // whether to use cross attention
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
// currently works only with CPU execution
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
|
||||
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
bool no_perf; // whether to measure performance timings
|
||||
bool op_offload; // whether to offload host tensor operations to device
|
||||
};
|
||||
|
||||
// model quantization parameters
|
||||
@ -446,10 +447,6 @@ extern "C" {
|
||||
size_t n_paths,
|
||||
struct llama_model_params params);
|
||||
|
||||
LLAMA_API void llama_model_save_to_file(
|
||||
const struct llama_model * model,
|
||||
const char * path_model);
|
||||
|
||||
DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
|
||||
"use llama_model_free instead");
|
||||
|
||||
@ -464,6 +461,10 @@ extern "C" {
|
||||
struct llama_context_params params),
|
||||
"use llama_init_from_model instead");
|
||||
|
||||
// TODO (jmorganca): this should most likely be passed in as part of a batch
|
||||
// and not set on the context for all batches.
|
||||
LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
|
||||
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
@ -929,19 +930,14 @@ extern "C" {
|
||||
// Frees a batch of tokens allocated with llama_batch_init()
|
||||
LLAMA_API void llama_batch_free(struct llama_batch batch);
|
||||
|
||||
// Process a batch of tokens.
|
||||
// In contrast to llama_decode() - this call does not use KV cache.
|
||||
// For encode-decoder contexts, processes the batch using the encoder.
|
||||
// Can store the encoder output internally for later use by the decoder's cross-attention layers.
|
||||
// Processes a batch of tokens with the ecoder part of the encoder-decoder model.
|
||||
// Stores the encoder output internally for later use by the decoder cross-attention layers.
|
||||
// 0 - success
|
||||
// < 0 - error. the KV cache state is restored to the state before this call
|
||||
LLAMA_API int32_t llama_encode(
|
||||
struct llama_context * ctx,
|
||||
struct llama_batch batch);
|
||||
|
||||
// Process a batch of tokens.
|
||||
// Requires KV cache.
|
||||
// For encode-decoder contexts, processes the batch using the decoder.
|
||||
// Positive return values does not mean a fatal error, but rather a warning.
|
||||
// 0 - success
|
||||
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
||||
@ -1438,37 +1434,6 @@ extern "C" {
|
||||
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
||||
|
||||
//
|
||||
// training
|
||||
//
|
||||
|
||||
// function that returns whether or not a given tensor contains trainable parameters
|
||||
typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
|
||||
|
||||
// always returns true
|
||||
LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
|
||||
|
||||
struct llama_opt_params {
|
||||
uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
|
||||
|
||||
llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
|
||||
void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
};
|
||||
|
||||
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
|
||||
|
||||
LLAMA_API void llama_opt_epoch(
|
||||
struct llama_context * lctx,
|
||||
ggml_opt_dataset_t dataset,
|
||||
ggml_opt_result_t result_train,
|
||||
ggml_opt_result_t result_eval,
|
||||
int64_t idata_split,
|
||||
ggml_opt_epoch_callback callback_train,
|
||||
ggml_opt_epoch_callback callback_eval);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
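Two of the fork-specific additions above are the n_embd field in llama_batch and the llama_set_cross_attention() declaration. A rough sketch of submitting an embeddings-only batch with those declarations follows; decode_embeddings is a hypothetical helper, the trailing logits field is assumed to follow seq_id as in upstream llama.h, and "llama.h" plus <vector> are required.

    // Submit n_tokens embedding vectors of width n_embd for a single sequence.
    static int32_t decode_embeddings(llama_context * ctx, float * embd,
                                     int32_t n_tokens, int32_t n_embd, llama_pos pos0) {
        std::vector<llama_pos>      pos(n_tokens);
        std::vector<int32_t>        n_seq_id(n_tokens, 1);
        std::vector<llama_seq_id>   seq0(n_tokens, 0);
        std::vector<llama_seq_id *> seq_id(n_tokens);
        std::vector<int8_t>         logits(n_tokens, 0);
        for (int32_t i = 0; i < n_tokens; ++i) {
            pos[i]    = pos0 + i;
            seq_id[i] = &seq0[i];
        }
        logits[n_tokens - 1] = 1; // only request logits for the last position

        llama_batch batch = {
            /*n_tokens =*/ n_tokens,
            /*tokens   =*/ nullptr,
            /*embd     =*/ embd,
            /*n_embd   =*/ n_embd,
            /*pos      =*/ pos.data(),
            /*n_seq_id =*/ n_seq_id.data(),
            /*seq_id   =*/ seq_id.data(),
            /*logits   =*/ logits.data(),
        };

        llama_set_cross_attention(ctx, false); // enable only for mllama cross-attention batches
        return llama_decode(ctx, batch);
    }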
6
llama/llama.cpp/src/llama-adapter.cpp
vendored
@ -253,9 +253,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
||||
std::vector<ggml_backend_buffer_type_t> buft_extra;
|
||||
{
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
||||
|
||||
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
||||
@ -294,9 +291,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
||||
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
|
||||
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
|
||||
break;
|
||||
|
44
llama/llama.cpp/src/llama-arch.cpp
vendored
@ -6,6 +6,7 @@
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
{ LLM_ARCH_MLLAMA, "mllama" },
|
||||
{ LLM_ARCH_LLAMA4, "llama4" },
|
||||
{ LLM_ARCH_DECI, "deci" },
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MLLAMA,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" },
|
||||
{ LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" },
|
||||
{ LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" },
|
||||
{ LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" },
|
||||
{ LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" },
|
||||
{ LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" },
|
||||
{ LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
|
||||
{ LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DECI,
|
||||
{
|
||||
@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
|
||||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
|
10
llama/llama.cpp/src/llama-arch.h
vendored
@ -11,6 +11,7 @@
|
||||
enum llm_arch {
|
||||
LLM_ARCH_LLAMA,
|
||||
LLM_ARCH_LLAMA4,
|
||||
LLM_ARCH_MLLAMA,
|
||||
LLM_ARCH_DECI,
|
||||
LLM_ARCH_FALCON,
|
||||
LLM_ARCH_BAICHUAN,
|
||||
@ -148,6 +149,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@ -349,6 +351,14 @@ enum llm_tensor {
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
LLM_TENSOR_BSKCN_TV,
|
||||
LLM_TENSOR_CROSS_ATTN_K_NORM,
|
||||
LLM_TENSOR_CROSS_ATTN_K_PROJ,
|
||||
LLM_TENSOR_CROSS_ATTN_O_PROJ,
|
||||
LLM_TENSOR_CROSS_ATTN_Q_NORM,
|
||||
LLM_TENSOR_CROSS_ATTN_Q_PROJ,
|
||||
LLM_TENSOR_CROSS_ATTN_V_PROJ,
|
||||
LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
|
||||
LLM_TENSOR_CROSS_ATTN_MLP_GATE,
|
||||
LLM_TENSOR_CONV1D,
|
||||
LLM_TENSOR_CONVNEXT_DW,
|
||||
LLM_TENSOR_CONVNEXT_NORM,
|
||||
|
9
llama/llama.cpp/src/llama-batch.cpp
vendored
@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
|
||||
return ubatch;
|
||||
}
|
||||
|
||||
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
||||
void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
||||
GGML_ASSERT(batch.n_tokens >= 0);
|
||||
this->batch = &batch;
|
||||
this->n_embd = n_embd;
|
||||
@ -203,7 +203,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
||||
for (size_t i = 0; i < n_tokens; ++i) {
|
||||
ids[i] = i;
|
||||
}
|
||||
|
||||
if (simple_split) {
|
||||
seq.resize(1);
|
||||
llama_sbatch_seq & s = seq[0];
|
||||
@ -213,7 +212,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
||||
s.length = n_tokens;
|
||||
return;
|
||||
}
|
||||
|
||||
std::sort(ids.begin(), ids.end(),
|
||||
[&batch](size_t a, size_t b) {
|
||||
int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
|
||||
@ -241,7 +239,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
||||
return n_seq_a > n_seq_b;
|
||||
}
|
||||
);
|
||||
|
||||
// init seq
|
||||
llama_sbatch_seq * last_seq = nullptr;
|
||||
|
||||
@ -265,7 +262,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
||||
seq.push_back(new_seq);
|
||||
last_seq = &seq.back();
|
||||
}
|
||||
|
||||
// keep shared prompts first at the end, then sort by length descending.
|
||||
std::sort(seq.begin(), seq.end(),
|
||||
[](llama_sbatch_seq & a, llama_sbatch_seq & b) {
|
||||
@ -320,6 +316,7 @@ struct llama_batch llama_batch_get_one(
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ tokens,
|
||||
/*embd =*/ nullptr,
|
||||
/*n_embd =*/ 0,
|
||||
/*pos =*/ nullptr,
|
||||
/*n_seq_id =*/ nullptr,
|
||||
/*seq_id =*/ nullptr,
|
||||
@ -332,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||
/*n_tokens =*/ 0,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ nullptr,
|
||||
/*n_embd =*/ 0,
|
||||
/*pos =*/ nullptr,
|
||||
/*n_seq_id =*/ nullptr,
|
||||
/*seq_id =*/ nullptr,
|
||||
@ -340,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||
|
||||
if (embd) {
|
||||
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
||||
batch.n_embd = embd;
|
||||
} else {
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||
}
|
||||
|
3
llama/llama.cpp/src/llama-batch.h
vendored
@ -70,8 +70,7 @@ struct llama_sbatch {
// sequence-wise split
llama_ubatch split_seq(size_t n_ubatch);

llama_sbatch() = default;
llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
};

// temporary allocate memory for the input batch if needed

24
llama/llama.cpp/src/llama-chat.cpp
vendored
@ -35,7 +35,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
|
||||
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
||||
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
||||
{ "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
|
||||
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
||||
{ "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
|
||||
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
|
||||
@ -203,20 +202,19 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|im_start|>assistant\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
|
||||
// Official mistral 'v7' template
|
||||
// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
|
||||
// https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
|
||||
const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
std::string content(message->content);
|
||||
if (role == "system") {
|
||||
ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
|
||||
ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
|
||||
} else if (role == "user") {
|
||||
ss << "[INST]" << trailing_space << content << "[/INST]";
|
||||
} else {
|
||||
ss << trailing_space << content << "</s>";
|
||||
ss << "[INST] " << content << "[/INST]";
|
||||
}
|
||||
else {
|
||||
ss << " " << content << "</s>";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
|
||||
@ -449,16 +447,8 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
ss << "[gMASK]" << "<sop>";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>" << "\n" << message->content;
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>" << "\n" << message->content;
|
||||
|
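The template hunk above merges the v7 and v7-tekken handling; in the form kept in this tree, system turns are wrapped in [SYSTEM_PROMPT] … [/SYSTEM_PROMPT], user turns in [INST] … [/INST], and assistant turns get a leading space and a closing </s>. A standalone sketch of that rendering is below; chat_msg stands in for llama_chat_message and is not part of the tree.

    #include <sstream>
    #include <string>
    #include <vector>

    struct chat_msg { std::string role, content; };

    // Render a conversation with the plain mistral-v7 markers shown above.
    static std::string render_mistral_v7(const std::vector<chat_msg> & chat) {
        std::ostringstream ss;
        for (const auto & m : chat) {
            if (m.role == "system") {
                ss << "[SYSTEM_PROMPT] " << m.content << "[/SYSTEM_PROMPT]";
            } else if (m.role == "user") {
                ss << "[INST] " << m.content << "[/INST]";
            } else {
                ss << " " << m.content << "</s>";
            }
        }
        return ss.str();
    }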
1
llama/llama.cpp/src/llama-chat.h
vendored
@ -14,7 +14,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_MISTRAL_V3,
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
LLM_CHAT_TEMPLATE_MISTRAL_V7,
LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
LLM_CHAT_TEMPLATE_PHI_3,
LLM_CHAT_TEMPLATE_PHI_4,
LLM_CHAT_TEMPLATE_FALCON_3,

914
llama/llama.cpp/src/llama-context.cpp
vendored
File diff suppressed because it is too large
78
llama/llama.cpp/src/llama-context.h
vendored
@ -8,7 +8,6 @@
|
||||
#include "llama-kv-cache.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
#include "ggml-opt.h"
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
@ -29,12 +28,7 @@ struct llama_context {
|
||||
|
||||
void synchronize();
|
||||
|
||||
const llama_model & get_model() const;
|
||||
const llama_cparams & get_cparams() const;
|
||||
|
||||
ggml_backend_sched_t get_sched() const;
|
||||
|
||||
ggml_context * get_ctx_compute() const;
|
||||
const llama_model & get_model() const;
|
||||
|
||||
uint32_t n_ctx() const;
|
||||
uint32_t n_ctx_per_seq() const;
|
||||
@ -72,6 +66,7 @@ struct llama_context {
|
||||
void set_embeddings (bool value);
|
||||
void set_causal_attn(bool value);
|
||||
void set_warmup(bool value);
|
||||
void set_cross_attn(bool value);
|
||||
|
||||
void set_adapter_lora(
|
||||
llama_adapter_lora * adapter,
|
||||
@ -135,32 +130,6 @@ struct llama_context {
|
||||
llama_perf_context_data perf_get_data() const;
|
||||
void perf_reset();
|
||||
|
||||
//
|
||||
// training
|
||||
//
|
||||
|
||||
void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
|
||||
|
||||
void opt_epoch(
|
||||
ggml_opt_dataset_t dataset,
|
||||
ggml_opt_result_t result_train,
|
||||
ggml_opt_result_t result_eval,
|
||||
int64_t idata_split,
|
||||
ggml_opt_epoch_callback callback_train,
|
||||
ggml_opt_epoch_callback callback_eval);
|
||||
|
||||
void opt_epoch_iter(
|
||||
ggml_opt_dataset_t dataset,
|
||||
ggml_opt_result_t result,
|
||||
const std::vector<llama_token> & tokens,
|
||||
const std::vector<llama_token> & labels_sparse,
|
||||
llama_batch & batch,
|
||||
ggml_opt_epoch_callback callback,
|
||||
bool train,
|
||||
int64_t idata_in_loop,
|
||||
int64_t ndata_in_loop,
|
||||
int64_t t_loop_start);
|
||||
|
||||
private:
|
||||
//
|
||||
// output
|
||||
@ -170,30 +139,50 @@ private:
|
||||
// Returns max number of outputs for which space was reserved.
|
||||
int32_t output_reserve(int32_t n_outputs);
|
||||
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// TODO: maybe remove this
|
||||
void output_reorder();
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
|
||||
public:
|
||||
int32_t graph_max_nodes() const;
|
||||
|
||||
// zero-out inputs and create the ctx_compute for the compute graph
|
||||
ggml_cgraph * graph_init();
|
||||
|
||||
// returns the result of ggml_backend_sched_graph_compute_async execution
|
||||
ggml_status graph_compute(
|
||||
ggml_cgraph * gf,
|
||||
bool batched);
|
||||
|
||||
private:
|
||||
llm_graph_result_ptr graph_build(
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const llama_ubatch & ubatch,
|
||||
llm_graph_type gtype);
|
||||
|
||||
// returns the result of ggml_backend_sched_graph_compute_async execution
|
||||
ggml_status graph_compute(
|
||||
ggml_cgraph * gf,
|
||||
bool batched);
|
||||
|
||||
llm_graph_cb graph_get_cb() const;
|
||||
|
||||
// used by kv_self_update()
|
||||
ggml_tensor * build_rope_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const;
|
||||
|
||||
llm_graph_result_ptr build_kv_self_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf) const;
|
||||
|
||||
llm_graph_result_ptr build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf,
|
||||
const std::vector<struct llama_kv_defrag_move> & moves) const;
|
||||
|
||||
// TODO: read/write lora adapters and cvec
|
||||
size_t state_write_data(llama_io_write_i & io);
|
||||
size_t state_read_data (llama_io_read_i & io);
|
||||
@ -210,10 +199,14 @@ private:
|
||||
llama_cparams cparams;
|
||||
llama_adapter_cvec cvec;
|
||||
llama_adapter_loras loras;
|
||||
llama_sbatch sbatch;
|
||||
|
||||
llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
|
||||
|
||||
std::unique_ptr<llama_memory_i> memory;
|
||||
std::unique_ptr<llama_kv_cache_unified> kv_self;
|
||||
|
||||
// TODO: remove
|
||||
bool logits_all = false;
|
||||
|
||||
// decode output (2-dimensional array: [n_outputs][n_vocab])
|
||||
size_t logits_size = 0; // capacity (of floats) for logits
|
||||
@ -240,9 +233,6 @@ private:
|
||||
|
||||
ggml_context_ptr ctx_compute;
|
||||
|
||||
// training
|
||||
ggml_opt_context_t opt_ctx = nullptr;
|
||||
|
||||
ggml_threadpool_t threadpool = nullptr;
|
||||
ggml_threadpool_t threadpool_batch = nullptr;
|
||||
|
||||
|
2
llama/llama.cpp/src/llama-cparams.h
vendored
@ -29,8 +29,8 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
bool cross_attn;
bool warmup;
bool op_offload;

enum llama_pooling_type pooling_type;

83
llama/llama.cpp/src/llama-graph.cpp
vendored
@ -284,7 +284,24 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
data[i] = kv_self->s_copy(i);
|
||||
const uint32_t cell_id = i + kv_self->head;
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// TODO: this should not mutate the KV cache !
|
||||
llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
|
||||
|
||||
// prevent out-of-bound sources
|
||||
if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
|
||||
kv_cell.src = cell_id;
|
||||
}
|
||||
|
||||
data[i] = kv_cell.src;
|
||||
|
||||
// TODO: do not mutate the KV cache
|
||||
// ensure copy only happens once
|
||||
if (kv_cell.src != (int32_t) cell_id) {
|
||||
kv_cell.src = cell_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -300,7 +317,18 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
// clear unused states
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
data[i] = kv_self->s_mask(i);
|
||||
const uint32_t cell_id = i + kv_self->head;
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// TODO: this should not mutate the KV cache !
|
||||
llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
|
||||
|
||||
data[i] = (float) (kv_cell.src >= 0);
|
||||
|
||||
// only clear once
|
||||
if (kv_cell.src < 0) {
|
||||
kv_cell.src = cell_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -532,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
|
||||
if (ubatch->embd) {
|
||||
ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// llm_graph_context
|
||||
//
|
||||
@ -782,7 +816,7 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
} break;
|
||||
}
|
||||
|
||||
if (gate && type_gate == LLM_FFN_PAR) {
|
||||
if (type_gate == LLM_FFN_PAR) {
|
||||
cur = ggml_mul(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_gate_par", il);
|
||||
}
|
||||
@ -971,7 +1005,6 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
||||
//cb(inp->tokens, "inp_tokens", -1);
|
||||
ggml_set_input(inp->tokens);
|
||||
res->t_tokens = inp->tokens;
|
||||
|
||||
cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
|
||||
|
||||
@ -1078,7 +1111,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_s_copy() const {
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
|
||||
|
||||
@ -1095,7 +1128,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_s_mask() const {
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
|
||||
|
||||
@ -1228,19 +1261,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
|
||||
if (v_mla) {
|
||||
#if 0
|
||||
// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
|
||||
// However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
|
||||
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
|
||||
cur = ggml_mul_mat(ctx0, v_mla, cur);
|
||||
#else
|
||||
// It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
|
||||
// The permutations are noops and only change how the tensor data is interpreted.
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_mul_mat(ctx0, v_mla, cur);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
|
||||
#endif
|
||||
}
|
||||
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
|
||||
@ -1420,6 +1442,8 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
|
||||
// store to KV cache
|
||||
{
|
||||
GGML_ASSERT(!kv_self->recurrent);
|
||||
|
||||
const auto kv_head = kv_self->head;
|
||||
|
||||
GGML_ASSERT(kv_self->size == n_ctx);
|
||||
@ -1514,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
|
||||
|
||||
ggml_tensor * cur = nullptr;
|
||||
|
||||
inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
|
||||
ggml_set_input(inp->cross_attn_state);
|
||||
|
||||
cur = inp->cross_attn_state;
|
||||
|
||||
cb(cur, "inp_cross_attn_state", -1);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_cross * inp,
|
||||
ggml_cgraph * gf,
|
||||
@ -1569,7 +1612,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
|
||||
ggml_tensor * state_mask,
|
||||
int32_t n_state,
|
||||
int32_t n_seqs) const {
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
const auto n_kv = kv_self->n;
|
||||
const auto kv_head = kv_self->head;
|
||||
@ -1601,7 +1644,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
const auto token_shift_count = hparams.token_shift_count;
|
||||
|
||||
@ -1622,7 +1665,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
|
||||
ggml_tensor * token_shift,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
const auto token_shift_count = hparams.token_shift_count;
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
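The s_copy/s_mask hunks above clamp each cell's copy source into range so an out-of-bound source degenerates into a self-copy. A simplified standalone sketch of that clamping over a plain src array follows; fill_s_copy is a hypothetical name, and the real code walks KV cache cells and updates them in place.

    #include <cstdint>
    #include <vector>

    // For the n_kv cells starting at `head`, emit a copy source per cell,
    // replacing out-of-range sources with the cell's own id (a no-op copy).
    static void fill_s_copy(std::vector<int32_t> & src, uint32_t head, uint32_t n_kv,
                            uint32_t cache_size, std::vector<int32_t> & data) {
        data.resize(n_kv);
        for (uint32_t i = 0; i < n_kv; ++i) {
            const uint32_t cell_id = i + head;
            if (src[i] < 0 || (uint32_t) src[i] >= cache_size) {
                src[i] = (int32_t) cell_id;
            }
            data[i] = src[i];
        }
    }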
32
llama/llama.cpp/src/llama-graph.h
vendored
@ -19,7 +19,6 @@ struct llama_cparams;

class llama_memory_i;
class llama_kv_cache_unified;
class llama_kv_cache_recurrent;

// certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type {
@ -87,6 +86,7 @@ public:

ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
};

class llm_graph_input_pos : public llm_graph_input_i {
@ -187,26 +187,26 @@ public:

class llm_graph_input_s_copy : public llm_graph_input_i {
public:
llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
virtual ~llm_graph_input_s_copy() = default;

void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * s_copy; // I32 [kv_size]

const llama_kv_cache_recurrent * kv_self;
const llama_kv_cache_unified * kv_self;
};

class llm_graph_input_s_mask : public llm_graph_input_i {
public:
llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
virtual ~llm_graph_input_s_mask() = default;

void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * s_mask; // F32 [1, n_kv]

const llama_kv_cache_recurrent * kv_self;
const llama_kv_cache_unified * kv_self;
};

class llm_graph_input_cross_embd : public llm_graph_input_i {
@ -284,6 +284,16 @@ public:
const llama_cross * cross = nullptr;
};

class llm_graph_input_cross_attn_state : public llm_graph_input_i {
public:
llm_graph_input_cross_attn_state() = default;
virtual ~llm_graph_input_cross_attn_state() = default;

void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
};

//
// llm_graph_result
//
@ -298,7 +308,6 @@ class llm_graph_result_i {
public:
virtual ~llm_graph_result_i() = default;

virtual ggml_tensor * get_tokens() = 0;
virtual ggml_tensor * get_logits() = 0;
virtual ggml_tensor * get_embd() = 0;
virtual ggml_tensor * get_embd_pooled() = 0;
@ -313,7 +322,6 @@ class llm_graph_result : public llm_graph_result_i {
public:
virtual ~llm_graph_result() = default;

ggml_tensor * get_tokens() override { return t_tokens; }
ggml_tensor * get_logits() override { return t_logits; }
ggml_tensor * get_embd() override { return t_embd; }
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
@ -330,7 +338,6 @@ public:
}

// important graph nodes
ggml_tensor * t_tokens = nullptr;
ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr;
@ -354,8 +361,8 @@ struct llm_graph_params {
const llama_cparams & cparams;
const llama_ubatch & ubatch;

ggml_backend_sched_t sched;
ggml_backend_t backend_cpu;
ggml_backend_sched * sched;
ggml_backend * backend_cpu;

const llama_adapter_cvec * cvec;
const llama_adapter_loras * loras;
@ -406,9 +413,9 @@ struct llm_graph_context {

ggml_context * ctx0 = nullptr;

ggml_backend_sched_t sched;
ggml_backend_sched * sched;

ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

const llama_adapter_cvec * cvec;
const llama_adapter_loras * loras;
@ -495,6 +502,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
ggml_tensor * build_inp_cross_attn_state() const;

ggml_tensor * build_inp_cross_embd() const;
ggml_tensor * build_inp_pos_bucket_enc() const;
4
llama/llama.cpp/src/llama-hparams.cpp
vendored
@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {

GGML_ABORT("fatal error");
}

bool llama_hparams::cross_attention_layers(uint32_t il) const {
return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
}
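The helper added here is a simple membership test over the fixed-size cross_attn_layers array, and it is what the mllama graph builder later in this diff branches on per layer. A stand-alone sketch of the same pattern follows; hparams_sketch, MAX_LAYERS, and the layer indices are made up for illustration (MAX_LAYERS stands in for LLAMA_MAX_LAYERS).

// Sketch of the per-layer cross-attention check, not llama.cpp itself.
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

constexpr size_t MAX_LAYERS = 8; // stand-in for LLAMA_MAX_LAYERS

struct hparams_sketch {
    std::array<uint32_t, MAX_LAYERS> cross_attn_layers;

    bool cross_attention_layers(uint32_t il) const {
        return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
    }
};

int main() {
    hparams_sketch hp;
    hp.cross_attn_layers.fill(uint32_t(-1)); // mirrors the std::fill(..., -1) in load_hparams
    hp.cross_attn_layers[0] = 3;             // e.g. layers 3 and 8 carry cross-attention
    hp.cross_attn_layers[1] = 8;

    for (uint32_t il = 0; il < 10; ++il) {
        if (hp.cross_attention_layers(il)) {
            printf("layer %u: cross-attention block\n", il);  // mllama-style branch
        } else {
            printf("layer %u: regular self-attention block\n", il);
        }
    }
    return 0;
}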
7
llama/llama.cpp/src/llama-hparams.h
vendored
@ -2,6 +2,8 @@

#include "llama.h"

#include <algorithm>

#include <array>

// bump if necessary
@ -42,6 +44,7 @@ struct llama_hparams {
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
uint32_t n_vocab = 0;

// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
@ -56,6 +59,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;

uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@ -159,6 +163,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;

// cross attention layers
bool cross_attention_layers(uint32_t il) const;

bool is_swa(uint32_t il) const;
};
1826
llama/llama.cpp/src/llama-kv-cache.cpp
vendored
File diff suppressed because it is too large
367
llama/llama.cpp/src/llama-kv-cache.h
vendored
@ -2,72 +2,32 @@
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-io.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-memory.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <functional>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
struct llama_cparams;
|
||||
struct llama_hparams;
|
||||
struct llama_ubatch;
|
||||
struct llama_sbatch;
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
|
||||
struct llama_kv_cache : public llama_memory_i {
|
||||
virtual ~llama_kv_cache() = default;
|
||||
using llama_memory_i::llama_memory_i;
|
||||
|
||||
// call if batch processing fails - restores the cache state
|
||||
virtual void restore() = 0;
|
||||
virtual void restore() = 0; // call if batch processing fails - restores the cache state
|
||||
virtual void commit() = 0; // call after successful batch processing - clears any pending state
|
||||
|
||||
// call after successful batch processing - clears any pending state
|
||||
virtual void commit() = 0;
|
||||
virtual int32_t get_n_tokens() const = 0;
|
||||
virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
|
||||
|
||||
// process any pending defrag/shift/etc. operations
|
||||
// optionally call once before processing a new batch
|
||||
virtual bool update(llama_context & lctx) = 0;
|
||||
|
||||
// schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
|
||||
virtual void defrag_sched(float thold) = 0;
|
||||
|
||||
// simulate full cache, used for allocating worst-case compute buffers
|
||||
virtual void set_full() = 0;
|
||||
|
||||
//
|
||||
// batch processing
|
||||
//
|
||||
|
||||
virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
|
||||
|
||||
// different KV caches require different batch splitting strategies
|
||||
virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
|
||||
|
||||
// find an empty slot of size "n_tokens" in the cache
|
||||
virtual bool find_slot(const llama_ubatch & batch) = 0;
|
||||
|
||||
// getters
|
||||
virtual int32_t get_n_tokens() const = 0;
|
||||
virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
|
||||
virtual llama_pos get_pos_max() const = 0;
|
||||
virtual bool get_can_shift() const = 0;
|
||||
virtual bool get_can_shift() const = 0;
|
||||
|
||||
bool get_can_edit() const override { return get_can_shift(); }
|
||||
|
||||
//
|
||||
// state write/read
|
||||
//
|
||||
|
||||
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
|
||||
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
|
||||
};
|
||||
|
||||
//
|
||||
// llama_kv_cache_guard
|
||||
//
|
||||
|
||||
struct llama_kv_cache_guard {
|
||||
llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
|
||||
|
||||
@ -82,7 +42,7 @@ struct llama_kv_cache_guard {
|
||||
private:
|
||||
llama_kv_cache * kv;
|
||||
};
|
||||
|
||||
|
||||
// block of KV slots to move when defragging
|
||||
struct llama_kv_defrag_move {
|
||||
uint32_t src;
|
||||
@ -90,50 +50,65 @@ struct llama_kv_defrag_move {
|
||||
uint32_t len;
|
||||
};
|
||||
|
||||
//
|
||||
// llama_kv_cache_unified
|
||||
//
|
||||
struct llama_kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
int32_t src = -1; // used by recurrent state models to copy states
|
||||
int32_t tail = -1;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const llama_kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
};
|
||||
|
||||
// ring-buffer of cached KV data
|
||||
// TODO: pimpl
|
||||
// TODO: add notion of max sequences
|
||||
class llama_kv_cache_unified : public llama_kv_cache {
|
||||
public:
|
||||
struct kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
// can be used to query data from the model if needed
|
||||
struct callbacks {
|
||||
std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
|
||||
};
|
||||
|
||||
static uint32_t get_padding(const llama_cparams & cparams);
|
||||
|
||||
llama_kv_cache_unified(
|
||||
const llama_model & model,
|
||||
const llama_hparams & hparams,
|
||||
callbacks cbs);
|
||||
|
||||
virtual ~llama_kv_cache_unified() = default;
|
||||
|
||||
// TODO: become constructor
|
||||
bool init(
|
||||
const llama_model & model, // TODO: do not reference the model
|
||||
const llama_cparams & cparams,
|
||||
ggml_type type_k,
|
||||
ggml_type type_v,
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
uint32_t kv_size,
|
||||
uint32_t padding);
|
||||
bool offload);
|
||||
|
||||
~llama_kv_cache_unified() = default;
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
|
||||
//
|
||||
// llama_memory_i
|
||||
//
|
||||
size_t total_size() const;
|
||||
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos pos_max() const;
|
||||
|
||||
void clear() override;
|
||||
void defrag() override;
|
||||
|
||||
virtual void restore() override;
|
||||
virtual void commit() override;
|
||||
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
@ -143,76 +118,25 @@ public:
|
||||
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
||||
|
||||
//
|
||||
// llama_kv_cache
|
||||
//
|
||||
|
||||
void restore() override;
|
||||
void commit() override;
|
||||
|
||||
bool update(llama_context & ctx) override;
|
||||
|
||||
void defrag_sched(float thold) override;
|
||||
|
||||
void set_full() override;
|
||||
|
||||
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
|
||||
|
||||
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// find an empty slot of size "n_tokens" in the cache
|
||||
// updates the cache head
|
||||
// Note: On success, it's important that cache.head points
|
||||
// to the first cell of the slot.
|
||||
bool find_slot(const llama_ubatch & batch) override;
|
||||
bool find_slot(const llama_ubatch & batch);
|
||||
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
// TODO: maybe not needed
|
||||
uint32_t get_padding(const llama_cparams & cparams) const;
|
||||
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos get_pos_max() const override;
|
||||
// find how many cells are currently in use
|
||||
uint32_t cell_max() const;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_impl also uses it, so it
|
||||
// cannot be freely changed after a slot has been allocated.
|
||||
uint32_t head = 0;
|
||||
uint32_t size = 0;
|
||||
uint32_t used = 0; // used cells (i.e. at least one seq_id)
|
||||
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
std::vector<kv_cell> cells;
|
||||
|
||||
std::vector<ggml_tensor *> k_l; // per layer
|
||||
std::vector<ggml_tensor *> v_l;
|
||||
|
||||
private:
|
||||
const llama_model & model;
|
||||
const llama_hparams & hparams;
|
||||
|
||||
bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
bool can_shift = false;
|
||||
|
||||
// required padding
|
||||
uint32_t padding = 1;
|
||||
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
ggml_type type_v = GGML_TYPE_F16;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
size_t size_k_bytes() const;
|
||||
size_t size_v_bytes() const;
|
||||
|
||||
// defrag
|
||||
|
||||
struct {
|
||||
std::vector<llama_kv_defrag_move> moves;
|
||||
} defrag_info;
|
||||
@ -221,6 +145,7 @@ private:
|
||||
bool defrag_prepare(int32_t n_max_nodes);
|
||||
|
||||
// commit/restore cache
|
||||
|
||||
struct slot_range {
|
||||
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
|
||||
uint32_t c1 = 0;
|
||||
@ -231,125 +156,25 @@ private:
|
||||
std::vector<slot_range> ranges;
|
||||
} pending;
|
||||
|
||||
// find how many cells are currently in use
|
||||
uint32_t cell_max() const;
|
||||
|
||||
size_t total_size() const;
|
||||
|
||||
size_t size_k_bytes() const;
|
||||
size_t size_v_bytes() const;
|
||||
|
||||
ggml_tensor * build_rope_shift(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const;
|
||||
|
||||
llm_graph_result_ptr build_graph_shift(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf) const;
|
||||
|
||||
llm_graph_result_ptr build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const std::vector<llama_kv_defrag_move> & moves) const;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
|
||||
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
|
||||
};
|
||||
|
||||
//
|
||||
// llama_kv_cache_recurrent
|
||||
//
|
||||
|
||||
class llama_kv_cache_recurrent : public llama_kv_cache {
|
||||
public:
|
||||
struct kv_cell {
|
||||
llama_pos pos = -1;
|
||||
int32_t src = -1; // used to copy states
|
||||
int32_t tail = -1;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
};
|
||||
|
||||
llama_kv_cache_recurrent(
|
||||
const llama_model & model,
|
||||
ggml_type type_k,
|
||||
ggml_type type_v,
|
||||
bool offload,
|
||||
uint32_t kv_size);
|
||||
|
||||
~llama_kv_cache_recurrent() = default;
|
||||
|
||||
//
|
||||
// llama_memory_i
|
||||
//
|
||||
|
||||
void clear() override;
|
||||
|
||||
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
|
||||
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
|
||||
void seq_keep(llama_seq_id seq_id) override;
|
||||
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
|
||||
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
|
||||
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
||||
|
||||
//
|
||||
// llama_kv_cache
|
||||
//
|
||||
|
||||
void restore() override;
|
||||
void commit() override;
|
||||
|
||||
bool update(llama_context & lctx) override;
|
||||
|
||||
void defrag_sched(float thold) override;
|
||||
|
||||
void set_full() override;
|
||||
|
||||
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
|
||||
|
||||
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
|
||||
|
||||
bool find_slot(const llama_ubatch & batch) override;
|
||||
|
||||
int32_t get_n_tokens() const override;
|
||||
int32_t get_used_cells() const override;
|
||||
|
||||
// TODO: better data structures to reduce the cost of this operation
|
||||
llama_pos get_pos_max() const override;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
|
||||
int32_t s_copy(int i) const;
|
||||
float s_mask(int i) const;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);
|
||||
|
||||
// members
|
||||
|
||||
const llama_hparams & hparams;
|
||||
|
||||
callbacks cbs;
|
||||
|
||||
bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
|
||||
// TODO: remove this and implement llama_kv_cache_recurrent instead
|
||||
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
|
||||
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
bool can_shift = false;
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_impl also uses it, so it
|
||||
@ -361,41 +186,18 @@ public:
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
std::vector<kv_cell> cells;
|
||||
std::vector<llama_kv_cell> cells;
|
||||
|
||||
std::vector<ggml_tensor *> k_l; // per layer
|
||||
std::vector<ggml_tensor *> v_l;
|
||||
|
||||
private:
|
||||
//const llama_model & model;
|
||||
const llama_hparams & hparams;
|
||||
|
||||
// commit/restore cache
|
||||
// TODO: rework for recurrent cache
|
||||
struct slot_range {
|
||||
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
|
||||
uint32_t c1 = 0;
|
||||
};
|
||||
|
||||
// pending cell updates that are not yet committed
|
||||
struct {
|
||||
std::vector<slot_range> ranges;
|
||||
} pending;
|
||||
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
ggml_type type_v = GGML_TYPE_F16;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
// find how many cells are currently in use
|
||||
uint32_t cell_max() const;
|
||||
|
||||
size_t total_size() const;
|
||||
|
||||
size_t size_k_bytes() const;
|
||||
size_t size_v_bytes() const;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
|
||||
|
||||
@ -403,6 +205,11 @@ private:
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
|
||||
};
|
||||
|
||||
// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
|
||||
//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
|
||||
//public:
|
||||
// using llama_kv_cache_unified::llama_kv_cache_unified;
|
||||
//};
|
||||
|
||||
//
|
||||
// kv cache view
|
||||
|
12
llama/llama.cpp/src/llama-memory.h
vendored
@ -2,22 +2,12 @@

#include "llama.h"

struct llama_memory_params {
// kv cache
ggml_type type_k;
ggml_type type_v;

// parameters for other types of memory
// ...
};

// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
class llama_memory_i {
public:
virtual ~llama_memory_i() = default;

virtual void clear() = 0;
virtual void defrag() = 0;

virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
26
llama/llama.cpp/src/llama-model-loader.cpp
vendored
@ -301,12 +301,12 @@ namespace GGUFMeta {
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

switch (arr_info.gt) {
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
(std::is_same<T, uint32_t>::value)); break;
case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
case GGUF_TYPE_INT32: GGML_ASSERT(
(std::is_same<T, int32_t>::value) ||
(std::is_same<T, uint32_t>::value)); break;
default:
throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
}

result.resize(arr_info.length);
@ -315,6 +315,8 @@ namespace GGUFMeta {
return true;
}

template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);

template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
@ -330,12 +332,12 @@ namespace GGUFMeta {
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

switch (arr_info.gt) {
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
(std::is_same<T, uint32_t>::value)); break;
case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
case GGUF_TYPE_INT32: GGML_ASSERT(
(std::is_same<T, int32_t>::value) ||
(std::is_same<T, uint32_t>::value)); break;
default:
throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
}

if (arr_info.length > N_MAX) {
@ -824,10 +826,6 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
mmaps_used.reserve(files.size());
for (const auto & file : files) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
if (!reg) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}

auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
mmaps_used.emplace_back(mapping->size(), 0);
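Both variants of the get_arr() switch above implement the same idea: assert at run time that the GGUF array's element type is compatible with the template parameter T before copying the values out. A reduced, stand-alone sketch of that dispatch follows; the gguf_elem_type enum and check_elem_type helper are illustrative stand-ins, not llama.cpp or GGUF API.

// Sketch of the element-type check used when reading GGUF arrays.
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <type_traits>

enum gguf_elem_type { ELEM_UINT32, ELEM_INT32, ELEM_FLOAT32 };

template <typename T>
void check_elem_type(gguf_elem_type gt) {
    switch (gt) {
        case ELEM_UINT32:
        case ELEM_INT32:
            // accept either signed or unsigned 32-bit targets, as the diff does
            assert((std::is_same<T, int32_t>::value) || (std::is_same<T, uint32_t>::value));
            break;
        case ELEM_FLOAT32:
            assert((std::is_same<T, float>::value));
            break;
        default:
            throw std::runtime_error("not a float32/uint32/int32 array");
    }
}

int main() {
    check_elem_type<uint32_t>(ELEM_UINT32); // ok: e.g. cross_attn_layers is read as uint32_t
    check_elem_type<float>(ELEM_FLOAT32);   // ok
    // check_elem_type<float>(ELEM_INT32);  // would trip the assert, like GGML_ASSERT
    return 0;
}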
281
llama/llama.cpp/src/llama-model-saver.cpp
vendored
@ -1,281 +0,0 @@
|
||||
#include "llama-model-saver.h"
|
||||
|
||||
#include "gguf.h"
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-vocab.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
|
||||
gguf_ctx = gguf_init_empty();
|
||||
}
|
||||
|
||||
llama_model_saver::~llama_model_saver() {
|
||||
gguf_free(gguf_ctx);
|
||||
}
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
|
||||
gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
|
||||
}
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
|
||||
gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
|
||||
}
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
|
||||
gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
|
||||
}
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
|
||||
gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
|
||||
}
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
|
||||
gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
|
||||
GGML_UNUSED(key);
|
||||
GGML_UNUSED(value);
|
||||
GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
|
||||
}
|
||||
|
||||
template <typename Container>
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
|
||||
const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
|
||||
GGML_ASSERT(n_values <= value.size());
|
||||
|
||||
if (n_values == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (per_layer) {
|
||||
bool all_values_the_same = true;
|
||||
for (size_t i = 1; i < n_values; ++i) {
|
||||
if (value[i] != value[0]) {
|
||||
all_values_the_same = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (all_values_the_same) {
|
||||
add_kv(key, value[0]);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (std::is_same<typename Container::value_type, uint8_t>::value) {
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
|
||||
} else if (std::is_same<typename Container::value_type, int8_t>::value) {
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
|
||||
} else if (std::is_same<typename Container::value_type, uint32_t>::value) {
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
|
||||
} else if (std::is_same<typename Container::value_type, int32_t>::value) {
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
|
||||
} else if (std::is_same<typename Container::value_type, float>::value) {
|
||||
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
|
||||
} else if (std::is_same<Container, std::string>::value) {
|
||||
gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
|
||||
std::vector<const char *> tmp(value.size());
|
||||
for (size_t i = 0; i < value.size(); ++i) {
|
||||
tmp[i] = value[i].c_str();
|
||||
}
|
||||
gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
|
||||
}
|
||||
|
||||
void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
|
||||
if (!tensor) {
|
||||
return;
|
||||
}
|
||||
if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
|
||||
GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
|
||||
return;
|
||||
}
|
||||
gguf_add_tensor(gguf_ctx, tensor);
|
||||
}
|
||||
|
||||
void llama_model_saver::add_kv_from_model() {
|
||||
const llama_hparams & hparams = model.hparams;
|
||||
const llama_vocab & vocab = model.vocab;
|
||||
|
||||
const int32_t n_vocab = vocab.n_tokens();
|
||||
std::vector<std::string> tokens(n_vocab);
|
||||
std::vector<float> scores(n_vocab);
|
||||
std::vector<int32_t> token_types(n_vocab);
|
||||
|
||||
for (int32_t id = 0; id < n_vocab; ++id) {
|
||||
const llama_vocab::token_data & token_data = vocab.get_token_data(id);
|
||||
|
||||
tokens[id] = token_data.text;
|
||||
scores[id] = token_data.score;
|
||||
|
||||
switch(token_data.attr) {
|
||||
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
|
||||
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
|
||||
case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
|
||||
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
|
||||
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
|
||||
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
|
||||
case LLAMA_TOKEN_ATTR_UNDEFINED:
|
||||
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
|
||||
}
|
||||
}
|
||||
|
||||
// add_kv(LLM_KV_GENERAL_TYPE, ???);
|
||||
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
|
||||
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
|
||||
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
|
||||
add_kv(LLM_KV_GENERAL_NAME, model.name);
|
||||
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
|
||||
// add_kv(LLM_KV_GENERAL_VERSION, ???);
|
||||
// add_kv(LLM_KV_GENERAL_URL, ???);
|
||||
// add_kv(LLM_KV_GENERAL_DESCRIPTION, ???);
|
||||
// add_kv(LLM_KV_GENERAL_LICENSE, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SOURCE_URL, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???);
|
||||
|
||||
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
|
||||
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
|
||||
add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
||||
// add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
|
||||
add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
|
||||
add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
||||
add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
|
||||
add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
|
||||
add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
|
||||
add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
|
||||
add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
|
||||
add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
|
||||
add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
|
||||
add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
|
||||
add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
||||
add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
||||
|
||||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
|
||||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
|
||||
add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
||||
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
|
||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
||||
add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
||||
add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
||||
add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
||||
|
||||
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
|
||||
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
|
||||
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
|
||||
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
|
||||
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
|
||||
add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
|
||||
add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor);
|
||||
add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
|
||||
add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
|
||||
add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||||
|
||||
// TODO: implement split file support
|
||||
// add_kv(LLM_KV_SPLIT_NO, ???);
|
||||
// add_kv(LLM_KV_SPLIT_COUNT, ???);
|
||||
// add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???);
|
||||
|
||||
add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
||||
add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||||
add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
||||
add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
||||
add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
|
||||
|
||||
add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
|
||||
|
||||
add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
|
||||
add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre());
|
||||
add_kv(LLM_KV_TOKENIZER_LIST, tokens);
|
||||
add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types);
|
||||
add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types());
|
||||
add_kv(LLM_KV_TOKENIZER_SCORES, scores);
|
||||
add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges());
|
||||
// FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
|
||||
add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos()));
|
||||
add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos()));
|
||||
add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot()));
|
||||
add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom()));
|
||||
add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk()));
|
||||
add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep()));
|
||||
add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad()));
|
||||
// add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated
|
||||
// add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
|
||||
add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
|
||||
add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
|
||||
add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
|
||||
add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
|
||||
add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
|
||||
// add_kv(LLM_KV_TOKENIZER_HF_JSON, ???);
|
||||
// add_kv(LLM_KV_TOKENIZER_RWKV, ???);
|
||||
add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre()));
|
||||
add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf()));
|
||||
add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid()));
|
||||
add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad()));
|
||||
add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep()));
|
||||
add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep()));
|
||||
|
||||
// TODO: implement LoRA support
|
||||
// add_kv(LLM_KV_ADAPTER_TYPE, ???);
|
||||
// add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
|
||||
|
||||
// deprecated
|
||||
// add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
|
||||
// add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
|
||||
// add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
|
||||
}
|
||||
|
||||
void llama_model_saver::add_tensors_from_model() {
|
||||
if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
|
||||
add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
|
||||
}
|
||||
add_tensor(model.type_embd);
|
||||
add_tensor(model.pos_embd);
|
||||
add_tensor(model.tok_norm);
|
||||
add_tensor(model.tok_norm_b);
|
||||
add_tensor(model.output_norm);
|
||||
add_tensor(model.output_norm_b);
|
||||
add_tensor(model.output);
|
||||
add_tensor(model.output_b);
|
||||
add_tensor(model.output_norm_enc);
|
||||
add_tensor(model.cls);
|
||||
add_tensor(model.cls_b);
|
||||
add_tensor(model.cls_out);
|
||||
add_tensor(model.cls_out_b);
|
||||
|
||||
for (const struct llama_layer & layer : model.layers) {
|
||||
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
||||
add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_saver::save(const std::string & path_model) {
|
||||
gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
|
||||
}
|
||||
|
37
llama/llama.cpp/src/llama-model-saver.h
vendored
@ -1,37 +0,0 @@
#pragma once

#include "llama.h"
#include "llama-arch.h"

#include <vector>

struct llama_model_saver {
struct gguf_context * gguf_ctx = nullptr;
const struct llama_model & model;
const struct LLM_KV llm_kv;

llama_model_saver(const struct llama_model & model);
~llama_model_saver();

void add_kv(enum llm_kv key, uint32_t value);
void add_kv(enum llm_kv key, int32_t value);
void add_kv(enum llm_kv key, float value);
void add_kv(enum llm_kv key, bool value);
void add_kv(enum llm_kv key, const char * value);

[[noreturn]]
void add_kv(enum llm_kv key, char value); // needed to make the template below compile

template <typename Container>
void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);

void add_kv(enum llm_kv key, const std::vector<std::string> & value);

void add_tensor(const struct ggml_tensor * tensor);

void add_kv_from_model();

void add_tensors_from_model();

void save(const std::string & path_model);
};
460
llama/llama.cpp/src/llama-model.cpp
vendored
@ -40,7 +40,6 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_335M: return "335M";
|
||||
case LLM_TYPE_410M: return "410M";
|
||||
case LLM_TYPE_450M: return "450M";
|
||||
case LLM_TYPE_475M: return "475M";
|
||||
case LLM_TYPE_770M: return "770M";
|
||||
case LLM_TYPE_780M: return "780M";
|
||||
case LLM_TYPE_0_5B: return "0.5B";
|
||||
@ -80,7 +79,6 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_236B: return "236B";
|
||||
case LLM_TYPE_290B: return "290B";
|
||||
case LLM_TYPE_314B: return "314B";
|
||||
case LLM_TYPE_405B: return "405B";
|
||||
case LLM_TYPE_671B: return "671B";
|
||||
case LLM_TYPE_SMALL: return "0.1B";
|
||||
case LLM_TYPE_MEDIUM: return "0.4B";
|
||||
@ -117,10 +115,6 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
|
||||
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
|
||||
};
|
||||
|
||||
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
|
||||
return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
|
||||
}
|
||||
|
||||
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
|
||||
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
||||
if (kv.second == name) {
|
||||
@ -303,10 +297,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
||||
// add extra buffer types, only if no GPU device is present
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
|
||||
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
||||
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
||||
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
||||
@ -433,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
// get general kv
|
||||
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
|
||||
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
|
||||
|
||||
// everything past this point is not vocab-related
|
||||
if (hparams.vocab_only) {
|
||||
@ -444,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);
|
||||
|
||||
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||||
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
|
||||
@ -467,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
|
||||
|
||||
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
|
||||
ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||
@ -522,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||
|
||||
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
|
||||
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
|
||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||
}
|
||||
@ -585,13 +579,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
hparams.use_kq_norm = false;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_MLLAMA:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 40: type = LLM_TYPE_11B; break;
|
||||
case 100: type = LLM_TYPE_90B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 80: type = LLM_TYPE_70B; break;
|
||||
case 162: type = LLM_TYPE_405B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@ -718,11 +721,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
||||
|
||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||
if (arch == LLM_ARCH_NOMIC_BERT) {
|
||||
type = LLM_TYPE_137M;
|
||||
} else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
|
||||
type = LLM_TYPE_475M;
|
||||
}
|
||||
type = LLM_TYPE_137M;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_BLOOM:
|
||||
@ -783,7 +782,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
// fall through
|
||||
case LLM_ARCH_QWEN2:
|
||||
{
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
|
||||
@ -1507,9 +1505,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
|
||||
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
|
||||
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
||||
@ -1581,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_ff = hparams.n_ff();
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
const int64_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_vocab = hparams.n_vocab;
|
||||
const int64_t n_token_types = vocab.n_token_types();
|
||||
const int64_t n_rot = hparams.n_rot;
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
@ -1677,11 +1672,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
||||
std::regex pattern(overrides->pattern);
|
||||
if (std::regex_search(tensor_name, pattern)) {
|
||||
LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
|
||||
buft = overrides->buft;
|
||||
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
||||
tensor_name.c_str(),
|
||||
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
||||
ggml_backend_buft_name(buft));
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1698,9 +1690,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
auto * buft_dev = ggml_backend_buft_get_device(buft);
|
||||
if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error("no CPU backend found");
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
}
|
||||
|
||||
@ -1840,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_MLLAMA:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
|
||||
|
||||
// output
|
||||
{
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
if (hparams.cross_attention_layers(i)) {
|
||||
layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
|
||||
layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
|
||||
layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
|
||||
layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
|
||||
layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
|
||||
layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
} else {
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@ -1882,9 +1917,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
if (n_ff > 0) {
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
}
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
||||
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
@ -1894,11 +1927,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
}
|
||||
|
||||
if (n_ff > 0) {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
|
||||
// optional MLP bias
|
||||
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
||||
@ -3542,11 +3573,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
@ -4179,9 +4206,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
if (!dev) {
|
||||
// FIXME: workaround for CPU backend buft having a NULL device
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
}
|
||||
ggml_backend_dev_props props;
|
||||
ggml_backend_dev_get_props(dev, &props);
|
||||
@ -4311,7 +4335,7 @@ uint64_t llama_model::n_elements() const {
|
||||
}
|
||||
|
||||
void llama_model::print_info() const {
|
||||
const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
|
||||
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
||||
|
||||
auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
|
||||
bool is_var = false;
|
||||
@ -4372,7 +4396,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
|
||||
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
|
||||
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
|
||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
||||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
||||
@ -4519,19 +4543,6 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
|
||||
// choose long/short freq factors based on the context size
|
||||
if (layers[il].rope_freqs != nullptr) {
|
||||
return layers[il].rope_freqs;
|
||||
}
|
||||
|
||||
if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
|
||||
return layers[il].rope_long;
|
||||
}
|
||||
|
||||
return layers[il].rope_short;
|
||||
}
|
||||
|
||||
struct llm_build_llama : public llm_graph_context {
|
||||
llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@ -4572,7 +4583,7 @@ struct llm_build_llama : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@ -4756,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_build_mllama: public llm_graph_context {
|
||||
llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
ggml_tensor * inpCAS;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
inpCAS = build_inp_cross_attn_state();
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
if (hparams.cross_attention_layers(il)) {
|
||||
if (!ubatch.embd && !cparams.cross_attn) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// cross attention layer
|
||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur, * Vcur;
|
||||
if (ubatch.embd) {
|
||||
Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
|
||||
|
||||
Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
|
||||
} else {
|
||||
Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
|
||||
cb(Kcur, "Kcur (view)", il);
|
||||
|
||||
Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
|
||||
cb(Vcur, "Vcur (view)", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
// TODO: apply causal masks
|
||||
struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
||||
cb(kq_soft_max, "kq_soft_max", il);
|
||||
|
||||
Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
|
||||
cb(kqv, "kqv", il);
|
||||
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
||||
cb(kqv_merged, "kqv_merged", il);
|
||||
|
||||
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
|
||||
cb(cur, "kqv_merged_cont", il);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
|
||||
cb(cur, "cur", il);
|
||||
|
||||
// TODO: do this in place once?
|
||||
cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// TODO: do this inplace once?
|
||||
cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
} else {
|
||||
// self attention layer
|
||||
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
n_tokens = n_outputs;
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
};
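
For readers skimming the hunk above: each cross-attention layer in `llm_build_mllama` applies tanh-gated residual updates. Schematically (a summary of the code above, not text from the patch):

```latex
\begin{aligned}
h'  &= h  + \tanh(g_{\text{attn}})\,\odot\,\mathrm{XAttn}\!\left(\mathrm{RMSNorm}(h),\, E_{\text{img}}\right) \\
h'' &= h' + \tanh(g_{\text{mlp}})\,\odot\,\mathrm{FFN}\!\left(\mathrm{RMSNorm}(h')\right)
\end{aligned}
```

Here `E_img` is the cross-attention state built from the vision encoder output (`inpCAS`), and `g_attn`, `g_mlp` are the learned gates `cross_attn_attn_gate` and `cross_attn_mlp_gate`.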
|
||||
|
||||
struct llm_build_deci : public llm_graph_context {
|
||||
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@ -4778,7 +5029,6 @@ struct llm_build_deci : public llm_graph_context {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_head = hparams.n_head(il);
|
||||
const int64_t n_ff = hparams.n_ff(il);
|
||||
|
||||
if (n_head == 0) {
|
||||
// attention-free layer of Llama-3_1-Nemotron-51B
|
||||
@ -4798,7 +5048,7 @@ struct llm_build_deci : public llm_graph_context {
|
||||
} else if (n_head > 0) {
|
||||
// self-attention
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@ -4854,11 +5104,6 @@ struct llm_build_deci : public llm_graph_context {
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
// FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
|
||||
if (n_ff == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// For Granite architecture
|
||||
if (hparams.f_residual_scale) {
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
||||
@ -7285,7 +7530,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for 128k context
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
ggml_tensor* attn_norm_output = build_norm(inpL,
|
||||
model.layers[il].attn_norm,
|
||||
@ -8037,7 +8282,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
@ -8804,7 +9049,7 @@ struct llm_build_mamba : public llm_graph_context {
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
const auto kv_head = kv_self->head;
|
||||
|
||||
@ -9105,7 +9350,7 @@ struct llm_build_cohere2 : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for 128k context
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@ -10043,7 +10288,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@ -11407,7 +11652,7 @@ struct llm_build_exaone : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@ -11552,7 +11797,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
const auto n_tokens = ubatch.n_tokens;
|
||||
const auto n_seqs = ubatch.n_seqs;
|
||||
@ -11948,7 +12193,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
||||
ggml_tensor *& first_layer_value,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
||||
|
||||
const auto n_tokens = ubatch.n_tokens;
|
||||
const auto n_seqs = ubatch.n_seqs;
|
||||
@ -12496,7 +12741,7 @@ struct llm_build_solar : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@ -12947,7 +13192,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@ -13067,46 +13312,36 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
||||
llama_memory_i * llama_model::create_memory() const {
|
||||
llama_memory_i * res;
|
||||
|
||||
switch (arch) {
|
||||
case LLM_ARCH_BERT:
|
||||
case LLM_ARCH_JINA_BERT_V2:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
res = nullptr;
|
||||
} break;
|
||||
case LLM_ARCH_MAMBA:
|
||||
case LLM_ARCH_RWKV6:
|
||||
case LLM_ARCH_RWKV6QWEN2:
|
||||
case LLM_ARCH_RWKV7:
|
||||
case LLM_ARCH_ARWKV7:
|
||||
{
|
||||
res = new llama_kv_cache_recurrent(
|
||||
*this,
|
||||
GGML_TYPE_F32,
|
||||
GGML_TYPE_F32,
|
||||
cparams.offload_kqv,
|
||||
std::max((uint32_t) 1, cparams.n_seq_max));
|
||||
res = new llama_kv_cache_unified(hparams, {
|
||||
/*.get_rope_factors =*/ nullptr
|
||||
});
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
||||
res = new llama_kv_cache_unified(hparams, {
|
||||
/*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
|
||||
// choose long/short freq factors based on the context size
|
||||
if (layers[il].rope_freqs != nullptr) {
|
||||
return layers[il].rope_freqs;
|
||||
}
|
||||
|
||||
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
||||
if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
|
||||
return layers[il].rope_long;
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
||||
|
||||
res = new llama_kv_cache_unified(
|
||||
*this,
|
||||
params.type_k,
|
||||
params.type_v,
|
||||
!cparams.flash_attn,
|
||||
cparams.offload_kqv,
|
||||
cparams.n_ctx,
|
||||
padding);
|
||||
return layers[il].rope_short;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@ -13128,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_llama>(*this, params, gf);
|
||||
} break;
|
||||
case LLM_ARCH_MLLAMA:
|
||||
{
|
||||
llm = std::make_unique<llm_build_mllama>(*this, params, gf);
|
||||
} break;
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
llm = std::make_unique<llm_build_deci>(*this, params, gf);
|
||||
@ -13489,9 +13728,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_LLAMA4:
|
||||
case LLM_ARCH_MLLAMA:
|
||||
case LLM_ARCH_DECI:
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
case LLM_ARCH_STARCODER:
|
||||
case LLM_ARCH_PLAMO:
|
||||
case LLM_ARCH_ORION:
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_XVERSE:
|
||||
@ -13530,7 +13772,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_PHI2:
|
||||
case LLM_ARCH_PHI3:
|
||||
case LLM_ARCH_PHIMOE:
|
||||
case LLM_ARCH_PLAMO:
|
||||
case LLM_ARCH_GEMMA:
|
||||
case LLM_ARCH_GEMMA2:
|
||||
case LLM_ARCH_GEMMA3:
|
||||
@ -13538,7 +13779,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_OPENELM:
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
case LLM_ARCH_CODESHELL:
|
||||
case LLM_ARCH_ORION:
|
||||
case LLM_ARCH_NEMOTRON:
|
||||
case LLM_ARCH_EXAONE:
|
||||
case LLM_ARCH_MINICPM3:
|
||||
@ -13611,14 +13851,6 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
|
||||
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
||||
const auto & it = model->gguf_kv.find(key);
|
||||
if (it == model->gguf_kv.end()) {
|
||||
// one-off fix for very popular models (so we are not flooded with issues)
|
||||
// do not extend this list unless absolutely necessary
|
||||
// Mistral-Small-2503 does not have built-in chat template
|
||||
llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
|
||||
if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
|
||||
return "mistral-v7-tekken";
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
21
llama/llama.cpp/src/llama-model.h
vendored
@ -11,6 +11,7 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
struct llama_cparams;
|
||||
struct llama_ubatch;
|
||||
@ -36,7 +37,6 @@ enum llm_type {
|
||||
LLM_TYPE_335M,
|
||||
LLM_TYPE_410M,
|
||||
LLM_TYPE_450M,
|
||||
LLM_TYPE_475M,
|
||||
LLM_TYPE_770M,
|
||||
LLM_TYPE_780M,
|
||||
LLM_TYPE_0_5B,
|
||||
@ -74,10 +74,10 @@ enum llm_type {
|
||||
LLM_TYPE_40B,
|
||||
LLM_TYPE_65B,
|
||||
LLM_TYPE_70B,
|
||||
LLM_TYPE_90B,
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_314B,
|
||||
LLM_TYPE_405B,
|
||||
LLM_TYPE_671B,
|
||||
LLM_TYPE_SMALL,
|
||||
LLM_TYPE_MEDIUM,
|
||||
@ -97,8 +97,6 @@ enum llm_type {
|
||||
LLM_TYPE_235B_A22B,
|
||||
};
|
||||
|
||||
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
|
||||
|
||||
struct llama_layer_posnet {
|
||||
// resnet
|
||||
struct ggml_tensor * norm1 = nullptr;
|
||||
@ -318,6 +316,16 @@ struct llama_layer {
|
||||
|
||||
struct ggml_tensor * bskcn_tv = nullptr;
|
||||
|
||||
// cross attention
|
||||
struct ggml_tensor * cross_attn_k_norm = nullptr;
|
||||
struct ggml_tensor * cross_attn_k_proj = nullptr;
|
||||
struct ggml_tensor * cross_attn_o_proj = nullptr;
|
||||
struct ggml_tensor * cross_attn_q_norm = nullptr;
|
||||
struct ggml_tensor * cross_attn_q_proj = nullptr;
|
||||
struct ggml_tensor * cross_attn_v_proj = nullptr;
|
||||
struct ggml_tensor * cross_attn_attn_gate = nullptr;
|
||||
struct ggml_tensor * cross_attn_mlp_gate = nullptr;
|
||||
|
||||
struct llama_layer_posnet posnet;
|
||||
|
||||
struct llama_layer_convnext convnext;
|
||||
@ -401,11 +409,8 @@ struct llama_model {
|
||||
|
||||
const struct ggml_tensor * get_tensor(const char * name) const;
|
||||
|
||||
ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
|
||||
|
||||
// note: can mutate `cparams`
|
||||
// TODO: move this to new llm_arch_model_i interface
|
||||
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
|
||||
llama_memory_i * create_memory() const; // TODO: params
|
||||
|
||||
// TODO: move this to new llm_arch_model_i interface
|
||||
llm_graph_result_ptr build_graph(
8
llama/llama.cpp/src/llama-quant.cpp
vendored
@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
nthread = std::thread::hardware_concurrency();
|
||||
}
|
||||
|
||||
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||
// mmap consistently increases speed Linux, and also increases speed on Windows with
|
||||
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||
#if defined(__linux__) || defined(_WIN32)
|
||||
constexpr bool use_mmap = true;
|
||||
@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
|
||||
llama_model_kv_override * kv_overrides = nullptr;
|
||||
if (params->kv_overrides) {
|
||||
auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
||||
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
||||
kv_overrides = v->data();
|
||||
}
|
||||
|
||||
@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
if (llama_model_has_encoder(&model)) {
|
||||
n_attn_layer *= 3;
|
||||
}
|
||||
GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
|
||||
if (qs.n_attention_wv != n_attn_layer) {
|
||||
LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
|
||||
}
|
||||
}
|
||||
|
||||
size_t total_size_org = 0;
24
llama/llama.cpp/src/llama-sampling.cpp
vendored
@ -1750,35 +1750,23 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
|
||||
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
|
||||
|
||||
if (ctx->n <= 0.0f || cur_p->size <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
// find max logit and calculate mean
|
||||
float max = cur_p->data[0].logit;
|
||||
float logits_sum = 0;
|
||||
size_t valid_count = 0;
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
// Only count non-negative infinity values
|
||||
if (cur_p->data[i].logit != -INFINITY) {
|
||||
if (cur_p->data[i].logit > max) {
|
||||
max = cur_p->data[i].logit;
|
||||
}
|
||||
logits_sum += cur_p->data[i].logit;
|
||||
valid_count++;
|
||||
if (cur_p->data[i].logit > max) {
|
||||
max = cur_p->data[i].logit;
|
||||
}
|
||||
logits_sum += cur_p->data[i].logit;
|
||||
}
|
||||
float mean = valid_count > 0 ? logits_sum/valid_count : 0;
|
||||
float mean = logits_sum/cur_p->size;
|
||||
|
||||
// calculate standard deviation
|
||||
float acc = 0;
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
// Skip -infinity in std calculation
|
||||
if (cur_p->data[i].logit != -INFINITY) {
|
||||
acc += pow(cur_p->data[i].logit - mean, 2);
|
||||
}
|
||||
acc += pow(cur_p->data[i].logit - mean, 2);
|
||||
}
|
||||
float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
|
||||
float std = sqrt(acc/cur_p->size);
|
||||
|
||||
//apply mask
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
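
One way to read the top-n-sigma hunk above: both sides compute a mean and standard deviation of the logits (one over all entries, the other skipping `-INFINITY` entries) and then mask low-scoring tokens. Schematically (a summary, not text from the patch; the exact mask is applied in the loop that is cut off here):

```latex
\mu = \frac{1}{N}\sum_{j} \ell_j, \qquad
\sigma = \sqrt{\frac{1}{N}\sum_{j} (\ell_j - \mu)^2}, \qquad
\text{mask token } i \ \text{ if } \ \ell_i < \max_j \ell_j - n\,\sigma
```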
44
llama/llama.cpp/src/llama-vocab.cpp
vendored
@ -1,7 +1,5 @@
|
||||
#include "llama-vocab.h"
|
||||
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-model-loader.h"
|
||||
|
||||
@ -417,13 +415,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json
|
||||
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
@ -1236,9 +1227,6 @@ struct fragment_buffer_variant {
|
||||
struct llama_vocab::impl {
|
||||
uint32_t n_token_types = 0; // for BERT-style token types
|
||||
|
||||
std::string tokenizer_model;
|
||||
std::string tokenizer_pre;
|
||||
|
||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||
enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
|
||||
@ -1374,6 +1362,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
||||
// determine vocab type
|
||||
{
|
||||
std::string tokenizer_model;
|
||||
std::string tokenizer_pre;
|
||||
|
||||
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
|
||||
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
||||
|
||||
@ -1468,8 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
||||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||||
if (precompiled_charsmap_keyidx != -1) {
|
||||
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
||||
const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
||||
size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
||||
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||||
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||
#ifdef IS_BIG_ENDIAN
|
||||
@ -1635,10 +1625,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "bailingmoe") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "seed-coder") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
|
||||
clean_spaces = false;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
@ -2784,14 +2770,6 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pimpl->load(ml, kv);
|
||||
}
|
||||
|
||||
std::string llama_vocab::get_tokenizer_model() const {
|
||||
return pimpl->tokenizer_model;
|
||||
}
|
||||
|
||||
std::string llama_vocab::get_tokenizer_pre() const {
|
||||
return pimpl->tokenizer_pre;
|
||||
}
|
||||
|
||||
enum llama_vocab_type llama_vocab::get_type() const {
|
||||
return pimpl->type;
|
||||
}
|
||||
@ -3014,20 +2992,6 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<std::string> llama_vocab::get_bpe_merges() const {
|
||||
std::vector<std::string> result(pimpl->bpe_ranks.size());
|
||||
|
||||
for (const auto & pair : pimpl->bpe_ranks) {
|
||||
result[pair.second] = pair.first.first + " " + pair.first.second;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<char> llama_vocab::get_precompiled_charsmap() const {
|
||||
return pimpl->precompiled_charsmap;
|
||||
}
|
||||
|
||||
int32_t llama_vocab::tokenize(
|
||||
const char * text,
|
||||
int32_t text_len,
6
llama/llama.cpp/src/llama-vocab.h
vendored
@ -21,9 +21,6 @@ struct llama_vocab {
|
||||
|
||||
void load(llama_model_loader & ml, const LLM_KV & kv);
|
||||
|
||||
std::string get_tokenizer_model() const;
|
||||
std::string get_tokenizer_pre() const;
|
||||
|
||||
enum llama_vocab_type get_type() const;
|
||||
enum llama_vocab_pre_type get_pre_type() const;
|
||||
|
||||
@ -83,9 +80,6 @@ struct llama_vocab {
|
||||
int max_token_len() const;
|
||||
|
||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
|
||||
std::vector<std::string> get_bpe_merges() const;
|
||||
|
||||
std::vector<char> get_precompiled_charsmap() const;
|
||||
|
||||
int32_t tokenize(
|
||||
const char * text,
9
llama/llama.cpp/src/llama.cpp
vendored
@ -4,7 +4,6 @@
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-model-loader.h"
|
||||
#include "llama-model-saver.h"
|
||||
#include "llama-model.h"
|
||||
|
||||
#include "ggml.h"
|
||||
@ -254,13 +253,6 @@ struct llama_model * llama_model_load_from_splits(
|
||||
return llama_model_load_from_file_impl(splits.front(), splits, params);
|
||||
}
|
||||
|
||||
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
||||
llama_model_saver ms(*model);
|
||||
ms.add_kv_from_model();
|
||||
ms.add_tensors_from_model();
|
||||
ms.save(path_model);
|
||||
}
|
||||
|
||||
//
|
||||
// chat templates
|
||||
//
|
||||
@ -346,4 +338,3 @@ const char * llama_print_system_info(void) {
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
|
@ -6,7 +6,7 @@ package llama
|
||||
#cgo CXXFLAGS: -std=c++17
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/tools/mtmd
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include
|
||||
|
||||
@ -17,6 +17,7 @@ package llama
|
||||
#include "llava.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "mllama.h"
|
||||
#include "sampling_ext.h"
|
||||
|
||||
extern bool llamaProgressCallback(float progress, void *user_data);
|
||||
@ -39,8 +40,8 @@ import (
|
||||
"unsafe"
|
||||
|
||||
_ "github.com/ollama/ollama/llama/llama.cpp/common"
|
||||
_ "github.com/ollama/ollama/llama/llama.cpp/examples/llava"
|
||||
_ "github.com/ollama/ollama/llama/llama.cpp/src"
|
||||
_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
|
||||
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
|
||||
)
|
||||
|
||||
@ -509,6 +510,63 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
|
||||
return embed, nil
|
||||
}
|
||||
|
||||
type MllamaContext struct {
|
||||
c *C.struct_mllama_ctx
|
||||
}
|
||||
|
||||
func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
|
||||
mp := C.CString(modelPath)
|
||||
defer C.free(unsafe.Pointer(mp))
|
||||
c := C.mllama_model_load(mp, 1)
|
||||
if c == nil {
|
||||
return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
|
||||
}
|
||||
|
||||
projEmbedSize := int(C.mllama_n_embd(c))
|
||||
modelEmbedSize := llamaContext.Model().NEmbd()
|
||||
if projEmbedSize != modelEmbedSize {
|
||||
return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
|
||||
}
|
||||
|
||||
return &MllamaContext{c: c}, nil
|
||||
}
|
||||
|
||||
func (m *MllamaContext) Free() {
|
||||
C.mllama_free(m.c)
|
||||
}
|
||||
|
||||
func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
|
||||
img := C.mllama_image_init()
|
||||
defer C.mllama_image_free(img)
|
||||
|
||||
ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
|
||||
if !ok {
|
||||
return nil, errors.New("unable to load mllama image data")
|
||||
}
|
||||
|
||||
rows := make([]float32, m.EmbedSize(llamaContext))
|
||||
ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
|
||||
if !ok {
|
||||
return nil, errors.New("unable to make mllama embedding from image")
|
||||
}
|
||||
|
||||
embed := make([][]float32, 1)
|
||||
embed[0] = rows
|
||||
|
||||
return embed, nil
|
||||
}
|
||||
|
||||
func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
|
||||
numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
|
||||
numEmbed := llamaContext.Model().NEmbd()
|
||||
|
||||
return numTokens * numEmbed
|
||||
}
|
||||
|
||||
func (c *Context) SetCrossAttention(state bool) {
|
||||
C.llama_set_cross_attention(c.c, C.bool(state))
|
||||
}
|
||||
|
||||
func (c *Context) Synchronize() {
|
||||
C.llama_synchronize(c.c)
|
||||
}
|
||||
@ -529,6 +587,9 @@ type SamplingParams struct {
|
||||
PenaltyRepeat float32
|
||||
PenaltyFreq float32
|
||||
PenaltyPresent float32
|
||||
Mirostat int
|
||||
MirostatTau float32
|
||||
MirostatEta float32
|
||||
PenalizeNl bool
|
||||
Seed uint32
|
||||
Grammar string
|
||||
@ -545,6 +606,9 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
|
||||
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
|
||||
cparams.penalty_freq = C.float(params.PenaltyFreq)
|
||||
cparams.penalty_present = C.float(params.PenaltyFreq)
|
||||
cparams.mirostat = C.int32_t(params.Mirostat)
|
||||
cparams.mirostat_tau = C.float(params.MirostatTau)
|
||||
cparams.mirostat_eta = C.float(params.MirostatEta)
|
||||
cparams.seed = C.uint32_t(params.Seed)
|
||||
|
||||
grammar := C.CString(params.Grammar)
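
The two hunks above read together: `SamplingParams` gains mirostat fields and `NewSamplingContext` forwards them to the C sampling parameters. A hedged construction sketch, using placeholder values and only the fields visible in these hunks (the struct very likely has more):

```go
package example

import "github.com/ollama/ollama/llama"

// newSampler is a sketch only: the values are placeholders, and SamplingParams
// has additional fields (top-k/top-p/temperature, etc.) not visible in this hunk.
func newSampler(model *llama.Model) (*llama.SamplingContext, error) {
	return llama.NewSamplingContext(model, llama.SamplingParams{
		PenaltyRepeat:  1.1,
		PenaltyFreq:    0.0,
		PenaltyPresent: 0.0,
		Mirostat:       0, // 0 disables mirostat
		MirostatTau:    5.0,
		MirostatEta:    0.1,
		Seed:           42,
		Grammar:        "",
	})
}
```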
|
||||
@ -579,8 +643,8 @@ func SchemaToGrammar(schema []byte) []byte {
|
||||
cStr := C.CString(string(schema))
|
||||
defer C.free(unsafe.Pointer(cStr))
|
||||
|
||||
// Allocate buffer for grammar based on schema length but with upper bound
|
||||
maxLen := min(1024*1024, len(schema)*4)
|
||||
// Allocate buffer for grammar output with reasonable size
|
||||
const maxLen = 32768 // 32KB
|
||||
buf := make([]byte, maxLen)
|
||||
|
||||
// Call C function to convert schema to grammar
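
The change above swaps the schema-proportional buffer for a fixed 32KB output buffer. A small usage sketch; the emptiness check is an illustrative assumption, since the function's failure behavior is not shown in this hunk:

```go
package example

import (
	"errors"

	"github.com/ollama/ollama/llama"
)

// grammarForSchema is a sketch only. With the fixed 32KB buffer above, very
// large schemas could presumably produce truncated or empty output; the exact
// failure mode is an assumption, not something this hunk specifies.
func grammarForSchema(schema []byte) (string, error) {
	g := llama.SchemaToGrammar(schema)
	if len(g) == 0 {
		return "", errors.New("schema could not be converted to a grammar")
	}
	return string(g), nil
}
```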
887
llama/mllama.cpp
vendored
Normal file
@ -0,0 +1,887 @@
|
||||
// NOTE: This is modified from clip.cpp for Mllama only
|
||||
#include "mllama.h"
|
||||
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
#include "ggml-vulkan.h"
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdarg>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#define REQUIRE(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
throw std::runtime_error("REQUIRE failed: " #x); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#if __GLIBCXX__
|
||||
#include <cstdio>
|
||||
#include <ext/stdio_filebuf.h>
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
struct mllama_image {
|
||||
int width;
|
||||
int height;
|
||||
|
||||
int num_channels = 3;
|
||||
int num_tiles = 4;
|
||||
|
||||
int aspect_ratio_id;
|
||||
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
static std::string format(const char *fmt, ...) {
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
std::vector<char> b(128);
|
||||
int n = vsnprintf(b.data(), b.size(), fmt, args);
|
||||
REQUIRE(n >= 0 && n < b.size());
|
||||
va_end(args);
|
||||
return std::string(b.data(), b.size());
|
||||
}
|
||||
|
||||
//
|
||||
// utilities to get data from a gguf file
|
||||
//
|
||||
|
||||
static int get_key_index(const gguf_context *ctx, const char *key) {
|
||||
int key_index = gguf_find_key(ctx, key);
|
||||
REQUIRE(key_index != -1);
|
||||
return key_index;
|
||||
}
|
||||
|
||||
static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
|
||||
const int i = get_key_index(ctx, key.c_str());
|
||||
const int n = gguf_get_arr_n(ctx, i);
|
||||
const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i);
|
||||
|
||||
std::vector<uint32_t> s(n);
|
||||
for (size_t j = 0; j < s.size(); j++) {
|
||||
s[j] = data[j];
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
|
||||
return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
|
||||
}
|
||||
|
||||
static float get_f32(const gguf_context *ctx, const std::string &key) {
|
||||
return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
|
||||
}
|
||||
|
||||
static std::string get_ftype(int ftype) {
|
||||
return ggml_type_name(static_cast<ggml_type>(ftype));
|
||||
}
|
||||
|
||||
//
|
||||
// mllama layers
|
||||
//
|
||||
|
||||
struct mllama_hparams {
|
||||
uint32_t image_size;
|
||||
uint32_t patch_size;
|
||||
uint32_t hidden_size;
|
||||
uint32_t n_intermediate;
|
||||
uint32_t projection_dim;
|
||||
uint32_t n_head;
|
||||
uint32_t n_layer;
|
||||
uint32_t n_global_layer;
|
||||
uint32_t n_tiles;
|
||||
|
||||
float eps;
|
||||
|
||||
std::vector<bool> intermediate_layers;
|
||||
};
|
||||
|
||||
struct mllama_layer {
|
||||
// attention
|
||||
struct ggml_tensor *k_w;
|
||||
struct ggml_tensor *k_b;
|
||||
struct ggml_tensor *q_w;
|
||||
struct ggml_tensor *q_b;
|
||||
struct ggml_tensor *v_w;
|
||||
struct ggml_tensor *v_b;
|
||||
|
||||
struct ggml_tensor *o_w;
|
||||
struct ggml_tensor *o_b;
|
||||
|
||||
struct ggml_tensor *attn_gate;
|
||||
|
||||
// layernorm 1
|
||||
struct ggml_tensor *ln_1_w;
|
||||
struct ggml_tensor *ln_1_b;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor *ff_i_w;
|
||||
struct ggml_tensor *ff_i_b;
|
||||
|
||||
struct ggml_tensor *ff_o_w;
|
||||
struct ggml_tensor *ff_o_b;
|
||||
|
||||
struct ggml_tensor *ff_gate;
|
||||
|
||||
// layernorm 2
|
||||
struct ggml_tensor *ln_2_w;
|
||||
struct ggml_tensor *ln_2_b;
|
||||
};
|
||||
|
||||
struct mllama_vision_model {
|
||||
struct mllama_hparams hparams;
|
||||
|
||||
// embeddings
|
||||
struct ggml_tensor *class_embedding;
|
||||
struct ggml_tensor *patch_embeddings;
|
||||
struct ggml_tensor *position_embeddings;
|
||||
struct ggml_tensor *position_embeddings_gate;
|
||||
struct ggml_tensor *tile_position_embeddings;
|
||||
struct ggml_tensor *tile_position_embeddings_gate;
|
||||
struct ggml_tensor *pre_tile_position_embeddings;
|
||||
struct ggml_tensor *pre_tile_position_embeddings_gate;
|
||||
struct ggml_tensor *post_tile_position_embeddings;
|
||||
struct ggml_tensor *post_tile_position_embeddings_gate;
|
||||
|
||||
struct ggml_tensor *pre_ln_w;
|
||||
struct ggml_tensor *pre_ln_b;
|
||||
|
||||
std::vector<mllama_layer> layers;
|
||||
std::vector<mllama_layer> global_layers;
|
||||
|
||||
struct ggml_tensor *post_ln_w;
|
||||
struct ggml_tensor *post_ln_b;
|
||||
|
||||
struct ggml_tensor *mm_0_w;
|
||||
struct ggml_tensor *mm_0_b;
|
||||
};
|
||||
|
||||
struct mllama_ctx {
|
||||
struct mllama_vision_model vision_model;
|
||||
|
||||
uint32_t ftype = 1;
|
||||
|
||||
struct gguf_context *ctx_gguf;
|
||||
struct ggml_context *ctx_data;
|
||||
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
ggml_backend_buffer_t params_buffer = nullptr;
|
||||
|
||||
ggml_backend_t backend = nullptr;
|
||||
ggml_gallocr_t compute_alloc = nullptr;
|
||||
};
|
||||
|
||||
static ggml_tensor *mllama_image_build_encoder_layer(
|
||||
struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
|
||||
const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
|
||||
struct ggml_tensor *cur = embeddings;
|
||||
|
||||
{
|
||||
// layernorm1
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
|
||||
ggml_set_name(cur, format("%d pre layernorm", il).c_str());
|
||||
}
|
||||
|
||||
{
|
||||
// self-attention
|
||||
struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
if (layer.q_b != nullptr) {
|
||||
Q = ggml_add(ctx0, Q, layer.q_b);
|
||||
}
|
||||
|
||||
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
|
||||
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
||||
ggml_set_name(Q, format("%d query", il).c_str());
|
||||
|
||||
struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
if (layer.k_b != nullptr) {
|
||||
K = ggml_add(ctx0, K, layer.k_b);
|
||||
}
|
||||
|
||||
K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
|
||||
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||
ggml_set_name(K, format("%d key", il).c_str());
|
||||
|
||||
struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
if (layer.v_b != nullptr) {
|
||||
V = ggml_add(ctx0, V, layer.v_b);
|
||||
}
|
||||
|
||||
V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
|
||||
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||
ggml_set_name(V, format("%d value", il).c_str());
|
||||
|
||||
struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
|
||||
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
||||
ggml_set_name(KQ, format("%d KQ", il).c_str());
|
||||
|
||||
struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
|
||||
KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
|
||||
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
|
||||
ggml_set_name(KQV, format("%d KQV", il).c_str());
|
||||
|
||||
cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
|
||||
if (layer.o_b != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer.o_b);
|
||||
}
|
||||
ggml_set_name(cur, format("%d self attention", il).c_str());
|
||||
|
||||
if (layer.attn_gate != nullptr) {
|
||||
cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
|
||||
ggml_set_name(cur, format("%d self attention gate", il).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, embeddings);
|
||||
ggml_set_name(cur, format("%d residual", il).c_str());
|
||||
|
||||
embeddings = cur;
|
||||
|
||||
{
|
||||
// layernorm2
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
|
||||
ggml_set_name(cur, format("%d post layernorm", il).c_str());
|
||||
}
|
||||
|
||||
{
|
||||
// feed forward
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
|
||||
cur = ggml_gelu_inplace(ctx0, cur);
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
|
||||
ggml_set_name(cur, format("%d feed forward", il).c_str());
|
||||
|
||||
if (layer.ff_gate != nullptr) {
|
||||
cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
|
||||
ggml_set_name(cur, format("%d feed forward gate", il).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, cur, embeddings);
|
||||
ggml_set_name(cur, format("%d residual", il).c_str());
|
||||
|
||||
embeddings = cur;
|
||||
|
||||
return embeddings;
|
||||
}
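
The encoder layer built above is a standard pre-norm ViT block with optional learned gates on both sub-blocks; schematically (a summary of the code, not text from the patch), with the gates treated as 1 when the corresponding tensors are absent:

```latex
\begin{aligned}
h'  &= h  + g_{\text{attn}} \odot \mathrm{Attn}\big(\mathrm{LN}_1(h)\big),
\qquad
\mathrm{Attn}(x) = W_o\,\mathrm{softmax}\!\Big(\tfrac{QK^{\top}}{\sqrt{d_{\text{head}}}}\Big)V \\
h'' &= h' + g_{\text{ff}} \odot W^{\text{ff}}_{o}\,\mathrm{GELU}\big(W^{\text{ff}}_{i}\,\mathrm{LN}_2(h')\big)
\end{aligned}
```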
|
||||
|
||||
static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
|
||||
const auto &model = ctx->vision_model;
|
||||
const auto &hparams = model.hparams;
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
const int image_size_width = image_size;
|
||||
const int image_size_height = image_size;
|
||||
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
|
||||
const int batch_size = imgs->size;
|
||||
REQUIRE(batch_size == 1);
|
||||
|
||||
int num_tiles = 4;
|
||||
int num_channels = 3;
|
||||
if (imgs->data != nullptr) {
|
||||
num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
|
||||
num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
|
||||
}
|
||||
|
||||
struct ggml_init_params params = {
|
||||
ctx->buf_compute_meta.size(), // mem_size
|
||||
ctx->buf_compute_meta.data(), // mem_buffer
|
||||
true, // no_alloc
|
||||
};
|
||||
|
||||
struct ggml_context *ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph *gf = ggml_new_graph(ctx0);
|
||||
|
||||
struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
|
||||
ggml_set_name(inp_raw, "inp_raw");
|
||||
ggml_set_input(inp_raw);
|
||||
|
||||
struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
|
||||
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
|
||||
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
|
||||
|
||||
struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
|
||||
ggml_set_name(aspect_ratios, "aspect_ratios");
|
||||
ggml_set_input(aspect_ratios);
|
||||
|
||||
if (model.pre_tile_position_embeddings != nullptr) {
|
||||
struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
|
||||
ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");
|
||||
|
||||
pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
|
||||
if (model.pre_tile_position_embeddings_gate != nullptr) {
|
||||
pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
|
||||
}
|
||||
|
||||
inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
|
||||
}
|
||||
|
||||
struct ggml_tensor *embeddings = inp;
|
||||
|
||||
if (model.class_embedding != nullptr) {
|
||||
// concat class_embeddings and patch_embeddings
|
||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
|
||||
ggml_set_name(embeddings, "embeddings");
|
||||
ggml_set_input(embeddings);
|
||||
for (int i = 0; i < num_tiles; ++i) {
|
||||
// repeat class embeddings for each tile
|
||||
embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
|
||||
}
|
||||
|
||||
embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
}
|
||||
|
||||
struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
|
||||
if (model.position_embeddings_gate != nullptr) {
|
||||
position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
|
||||
}
|
||||
|
||||
embeddings = ggml_add(ctx0, embeddings, position_embd);
|
||||
|
||||
if (model.tile_position_embeddings != nullptr) {
|
||||
struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
|
||||
ggml_set_name(tile_position_embeddings, "tile_position_embeddings");
|
||||
|
||||
tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
|
||||
if (model.tile_position_embeddings_gate != nullptr) {
|
||||
tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
|
||||
}
|
||||
|
||||
embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
|
||||
}
|
||||
|
||||
// pre-layernorm
|
||||
if (model.pre_ln_w != nullptr) {
|
||||
embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
|
||||
if (model.pre_ln_b != nullptr) {
|
||||
embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
|
||||
}
|
||||
|
||||
ggml_set_name(embeddings, "pre layernorm");
|
||||
}
|
||||
|
||||
const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8;
|
||||
|
||||
embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
|
||||
embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);
|
||||
|
||||
std::vector<struct ggml_tensor *> intermediate_embeddings;
|
||||
|
||||
// encoder
|
||||
for (size_t il = 0; il < model.layers.size(); il++) {
|
||||
if (hparams.intermediate_layers[il]) {
|
||||
intermediate_embeddings.push_back(embeddings);
|
||||
}
|
||||
|
||||
embeddings = mllama_image_build_encoder_layer(
|
||||
ctx0, il, model.layers[il], embeddings,
|
||||
hparams.eps, hidden_size, batch_size, n_head, d_head);
|
||||
}
|
||||
|
||||
// post-layernorm
|
||||
if (model.post_ln_w != nullptr) {
|
||||
embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
|
||||
if (model.post_ln_b != nullptr) {
|
||||
embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
|
||||
}
|
||||
|
||||
ggml_set_name(embeddings, "post layernorm");
|
||||
}
|
||||
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
|
||||
|
||||
if (model.post_tile_position_embeddings != nullptr) {
|
||||
struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
|
||||
ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");
|
||||
|
||||
post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
|
||||
if (model.post_tile_position_embeddings_gate != nullptr) {
|
||||
post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
|
||||
}
|
||||
|
||||
embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
|
||||
}
|
||||
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);
|
||||
|
||||
// global encoder
|
||||
for (size_t il = 0; il < model.global_layers.size(); il++) {
|
||||
embeddings = mllama_image_build_encoder_layer(
|
||||
ctx0, il, model.global_layers[il], embeddings,
|
||||
hparams.eps, hidden_size, batch_size, n_head, d_head);
|
||||
}
|
||||
|
||||
struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
|
||||
for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
|
||||
stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
|
||||
}
|
||||
|
||||
stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
|
||||
stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);
|
||||
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
|
||||
embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
|
||||
embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);
|
||||
|
||||
// mllama projector
|
||||
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
|
||||
ggml_set_name(embeddings, "multi modal projector");
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return gf;
|
||||
}
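
To make the shape bookkeeping in the graph builder above concrete: with the 560x560 tiles used by the Go bindings in this change and an assumed 14-pixel patch size (the real patch size comes from the GGUF hyperparameters, not from this file), the per-tile sequence length works out as follows:

```go
package example

import "fmt"

// PatchCounts mirrors the arithmetic at the top of mllama_image_build_graph.
// The 560 tile size and 4 tiles appear elsewhere in this change; patchSize=14
// is an assumption for illustration only.
func PatchCounts() {
	const (
		imageSize         = 560
		patchSize         = 14 // assumed
		numTiles          = 4
		hasClassEmbedding = true
	)

	numPatches := (imageSize / patchSize) * (imageSize / patchSize) // 1600
	numPositions := numPatches
	if hasClassEmbedding {
		numPositions++ // 1601 when a class embedding is present
	}

	fmt.Println(numPatches, numPositions, numTiles)
}
```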
|
||||
|
||||
static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
|
||||
struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
|
||||
REQUIRE(cur != nullptr || optional);
|
||||
return cur;
|
||||
}
|
||||
|
||||
static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
|
||||
std::vector<struct mllama_layer> layers(n);
|
||||
for (size_t i = 0; i < layers.size(); i++) {
|
||||
auto &layer = layers[i];
|
||||
layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false);
|
||||
layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false);
|
||||
layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false);
|
||||
layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false);
|
||||
|
||||
layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false);
|
||||
layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true);
|
||||
layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false);
|
||||
layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true);
|
||||
layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false);
|
||||
layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true);
|
||||
layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false);
|
||||
layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true);
|
||||
|
||||
layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false);
|
||||
layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false);
|
||||
layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false);
|
||||
layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false);
|
||||
|
||||
layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true);
|
||||
layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true);
|
||||
}
|
||||
|
||||
return layers;
|
||||
}
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
|
||||
struct ggml_context *meta = nullptr;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
true, // no_alloc
|
||||
&meta, // ctx
|
||||
};
|
||||
|
||||
struct gguf_context *ctx = gguf_init_from_file(fname, params);
|
||||
REQUIRE(ctx != nullptr);
|
||||
|
||||
if (verbosity >= 1) {
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
|
||||
const int idx_desc = get_key_index(ctx, "general.description");
|
||||
const std::string description = gguf_get_val_str(ctx, idx_desc);
|
||||
const int idx_name = gguf_find_key(ctx, "general.name");
|
||||
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models are missing it due to a bug
|
||||
const std::string name = gguf_get_val_str(ctx, idx_name);
|
||||
LOG("model name: %s", name.c_str());
|
||||
}
|
||||
LOG("description: %s", description.c_str());
|
||||
LOG("GGUF version: %d", gguf_get_version(ctx));
|
||||
LOG("alignment: %zu", gguf_get_alignment(ctx));
|
||||
LOG("n_tensors: %d", n_tensors);
|
||||
LOG("n_kv: %d", n_kv);
|
||||
LOG("ftype: %s", ftype.c_str());
|
||||
LOG("");
|
||||
}
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
mllama_ctx *new_mllama = new mllama_ctx{};
|
||||
|
||||
ggml_backend_t backend = ggml_backend_init_best();
|
||||
if (backend == nullptr) {
|
||||
LOG("%s: failed to initialize backend\n", __func__);
|
||||
mllama_free(new_mllama);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
|
||||
new_mllama->backend = backend;
|
||||
|
||||
// load tensors
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
struct ggml_init_params params = {
|
||||
(n_tensors + 1) * ggml_tensor_overhead(), // mem_size
|
||||
nullptr, // mem_buffer
|
||||
true, // no_alloc
|
||||
};
|
||||
|
||||
new_mllama->ctx_data = ggml_init(params);
|
||||
if (!new_mllama->ctx_data) {
|
||||
LOG("ggml_init() failed");
|
||||
mllama_free(new_mllama);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
|
||||
if (!wlen) {
|
||||
return NULL;
|
||||
}
|
||||
wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
|
||||
wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
|
||||
if (!wlen) {
|
||||
free(wbuf);
|
||||
return NULL;
|
||||
}
|
||||
#if __GLIBCXX__
|
||||
int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
|
||||
__gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
|
||||
std::istream fin(&buffer);
|
||||
#else // MSVC
|
||||
// unused in our current build
|
||||
auto fin = std::ifstream(wbuf, std::ios::binary);
|
||||
#endif
|
||||
free(wbuf);
|
||||
#else
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
#endif
|
||||
if (!fin) {
|
||||
LOG("cannot open model file for loading tensors\n");
|
||||
mllama_free(new_mllama);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// add tensors to context
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char *name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor *t = ggml_get_tensor(meta, name);
|
||||
struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
|
||||
ggml_set_name(cur, name);
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char *name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
|
||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
LOG("failed to seek for tensor %s\n", name);
|
||||
mllama_free(new_mllama);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
int num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
// read into a temporary buffer first, then copy to device memory
|
||||
read_buf.resize(num_bytes);
|
||||
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_WIN32) && defined(__GLIBCXX__)
|
||||
close(fd);
|
||||
#else
|
||||
fin.close();
|
||||
#endif
|
||||
}
|
||||
|
||||
// vision model
|
||||
// load vision model
|
||||
auto &vision_model = new_mllama->vision_model;
|
||||
auto &hparams = vision_model.hparams;
|
||||
hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
|
||||
hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
|
||||
hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
|
||||
hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
|
||||
hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
|
||||
hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
|
||||
hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
|
||||
hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
|
||||
hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
|
||||
hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");
|
||||
|
||||
std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
|
||||
hparams.intermediate_layers.resize(hparams.n_layer);
|
||||
for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
|
||||
hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
|
||||
}
|
||||
|
||||
if (verbosity >= 2) {
|
||||
LOG("");
|
||||
LOG("vision model hparams");
|
||||
LOG("image_size %d", hparams.image_size);
|
||||
LOG("patch_size %d", hparams.patch_size);
|
||||
LOG("v_hidden_size %d", hparams.hidden_size);
|
||||
LOG("v_n_intermediate %d", hparams.n_intermediate);
|
||||
LOG("v_projection_dim %d", hparams.projection_dim);
|
||||
LOG("v_n_head %d", hparams.n_head);
|
||||
LOG("v_n_layer %d", hparams.n_layer);
|
||||
LOG("v_n_global_layer %d", hparams.n_global_layer);
|
||||
LOG("v_eps %f", hparams.eps);
|
||||
}
|
||||
|
||||
vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
|
||||
vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);
|
||||
|
||||
vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
|
||||
vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);
|
||||
|
||||
vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
|
||||
vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
|
||||
vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
|
||||
vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);
|
||||
|
||||
vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
|
||||
vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);
|
||||
|
||||
vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
|
||||
vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);
|
||||
|
||||
vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
|
||||
vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);
|
||||
|
||||
vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
|
||||
vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);
|
||||
|
||||
vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
|
||||
vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);
|
||||
|
||||
ggml_free(meta);
|
||||
|
||||
new_mllama->ctx_gguf = ctx;
|
||||
|
||||
{
|
||||
// measure mem requirement and allocate
|
||||
new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
|
||||
struct mllama_image_batch batch;
|
||||
batch.size = 1;
|
||||
ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
|
||||
ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
|
||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
|
||||
LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
return new_mllama;
|
||||
}
|
||||
|
||||
struct mllama_image *mllama_image_init() {
|
||||
return new mllama_image();
|
||||
}
|
||||
|
||||
void mllama_image_free(struct mllama_image *img) { delete img; }
|
||||
void mllama_image_batch_free(struct mllama_image_batch *batch) {
|
||||
if (batch->size > 0) {
|
||||
delete[] batch->data;
|
||||
batch->size = 0;
|
||||
}
|
||||
}
|
||||
|
||||
bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
|
||||
img->width = width;
|
||||
img->height = height;
|
||||
img->num_channels = num_channels;
|
||||
img->num_tiles = num_tiles;
|
||||
img->aspect_ratio_id = aspect_ratio_id;
|
||||
img->data.resize(n);
|
||||
|
||||
memcpy(img->data.data(), data, n);
|
||||
return true;
|
||||
}
|
||||
|
||||
// clamp x to the inclusive range [lower, upper]
inline int mllama(int x, int lower, int upper) {
    return std::max(lower, std::min(x, upper));
}
|
||||
|
||||
void mllama_free(mllama_ctx *ctx) {
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
ggml_backend_buffer_free(ctx->params_buffer);
|
||||
ggml_backend_free(ctx->backend);
|
||||
ggml_gallocr_free(ctx->compute_alloc);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
|
||||
mllama_image_batch imgs{};
|
||||
imgs.size = 1;
|
||||
imgs.data = img;
|
||||
return mllama_image_batch_encode(ctx, n_threads, &imgs, vec);
|
||||
}
|
||||
|
||||
bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
|
||||
int batch_size = imgs->size;
|
||||
REQUIRE(batch_size == 1);
|
||||
|
||||
// build the inference graph
|
||||
ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
|
||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
|
||||
// set inputs
|
||||
const auto &model = ctx->vision_model;
|
||||
const auto &hparams = model.hparams;
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
int image_size_width = image_size;
|
||||
int image_size_height = image_size;
|
||||
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
|
||||
|
||||
{
|
||||
struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
|
||||
ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
|
||||
}
|
||||
|
||||
{
|
||||
struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
|
||||
if (embeddings != nullptr) {
|
||||
void *zeros = malloc(ggml_nbytes(embeddings));
|
||||
memset(zeros, 0, ggml_nbytes(embeddings));
|
||||
ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
|
||||
free(zeros);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
|
||||
if (positions != nullptr) {
|
||||
int *positions_data = (int *)malloc(ggml_nbytes(positions));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
positions_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
|
||||
if (aspect_ratios != nullptr) {
|
||||
int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
|
||||
aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
|
||||
ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
|
||||
free(aspect_ratios_data);
|
||||
}
|
||||
}
|
||||
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||
}
|
||||
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t mllama_image_size(const struct mllama_ctx *ctx) {
|
||||
return ctx->vision_model.hparams.image_size;
|
||||
}
|
||||
|
||||
int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
|
||||
return ctx->vision_model.hparams.patch_size;
|
||||
}
|
||||
|
||||
int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
|
||||
return ctx->vision_model.hparams.hidden_size;
|
||||
}
|
||||
|
||||
int mllama_n_patches(const struct mllama_ctx *ctx) {
|
||||
const auto &hparams = ctx->vision_model.hparams;
|
||||
return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
|
||||
}
|
||||
|
||||
int mllama_n_positions(const struct mllama_ctx *ctx) {
|
||||
return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1);
|
||||
}
|
||||
|
||||
int mllama_n_tiles(const struct mllama_ctx *ctx) {
|
||||
return ctx->vision_model.hparams.n_tiles;
|
||||
}
|
||||
|
||||
int mllama_n_embd(const struct mllama_ctx *ctx) {
|
||||
return ctx->vision_model.hparams.projection_dim;
|
||||
}
|
||||
|
||||
size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
|
||||
return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float);
|
||||
}
|
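For a sense of scale (illustrative numbers only, assuming Llama 3.2 Vision-style hyperparameters rather than values read from this diff): an image size of 560 with patch size 14 gives 40 * 40 = 1600 patches plus one class token, i.e. 1601 positions; with projection_dim = 4096 and 4 tiles, mllama_n_embd_bytes returns 1601 * 4096 * 4 * 4 bytes ≈ 100 MiB per image, which is the buffer size a caller of mllama_image_encode must provide.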
61 llama/mllama.h vendored Normal file
@ -0,0 +1,61 @@
#ifndef MLLAMA_H
#define MLLAMA_H

#include <stddef.h>
#include <stdint.h>

#ifdef LLAMA_SHARED
#if defined(_WIN32) && !defined(__MINGW32__)
#ifdef LLAMA_BUILD
#define MLLAMA_API __declspec(dllexport)
#else
#define MLLAMA_API __declspec(dllimport)
#endif
#else
#define MLLAMA_API __attribute__((visibility("default")))
#endif
#else
#define MLLAMA_API
#endif

#ifdef __cplusplus
extern "C" {
#endif

struct mllama_ctx;

struct mllama_image_batch {
    struct mllama_image *data;
    size_t size;
};

MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity);
MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity);

MLLAMA_API void mllama_free(struct mllama_ctx *ctx);

MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx);

MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx);
MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx);

MLLAMA_API struct mllama_image *mllama_image_init();

MLLAMA_API void mllama_image_free(struct mllama_image *img);
MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch);

MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img);

MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec);
MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec);

#ifdef __cplusplus
}
#endif

#endif // MLLAMA_H
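A minimal caller sketch for this API (hypothetical and not part of the change: the function name encode_one_image, the thread count, the error handling, and the aspect_ratio_id value are all illustrative assumptions; pixels is assumed to be already-preprocessed, tiled pixel data in the layout the vision graph expects):

#include "mllama.h"
#include <cstdlib>

int encode_one_image(const char *model_path, const void *pixels, int n_bytes,
                     int width, int height, int channels, int num_tiles, int aspect_ratio_id) {
    struct mllama_ctx *ctx = mllama_model_load(model_path, /*verbosity=*/1);
    if (ctx == nullptr) {
        return 1;
    }

    struct mllama_image *img = mllama_image_init();
    mllama_image_load_from_data(pixels, n_bytes, width, height, channels, num_tiles, aspect_ratio_id, img);

    // output buffer: n_positions * n_embd * n_tiles floats
    float *embd = (float *) malloc(mllama_n_embd_bytes(ctx));
    bool ok = mllama_image_encode(ctx, /*n_threads=*/4, img, embd);

    // ... consume embd ...

    free(embd);
    mllama_image_free(img);
    mllama_free(ctx);
    return ok ? 0 : 1;
}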
@ -24,7 +24,7 @@ problem.
|
||||
9 files changed, 21 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index b30b4cb3..0ce73a99 100644
|
||||
index 273075f4..dd11f304 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
@ -43,7 +43,7 @@ index b30b4cb3..0ce73a99 100644
|
||||
}
|
||||
|
||||
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_aligned_free(buffer->context, buffer->size);
|
||||
@ -55,7 +55,7 @@ index b30b4cb3..0ce73a99 100644
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
};
|
||||
|
||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||
@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
|
||||
|
||||
/**
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index b4b85abc..cb0d8528 100644
|
||||
index 9fb2134f..04ce764e 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
|
||||
@ -96,7 +96,7 @@ index b4b85abc..cb0d8528 100644
|
||||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
@@ -790,6 +791,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@ -104,7 +104,7 @@ index b4b85abc..cb0d8528 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
|
||||
@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||
@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
|
||||
|
||||
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 576f9581..1b56f858 100644
|
||||
index d92392ed..425524d0 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
}
|
||||
|
||||
free(ctx);
|
||||
@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
index 4f0abb5a..de1ec184 100644
|
||||
index 140a775f..e33c4ba0 100644
|
||||
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
@@ -483,6 +483,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
delete ctx;
|
||||
@ -161,10 +161,10 @@ index 4f0abb5a..de1ec184 100644
|
||||
|
||||
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
index 0ea72994..ae3a3c33 100644
|
||||
index 66b6f2cc..e3e6deae 100644
|
||||
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
|
||||
delete ctx;
|
||||
@ -172,7 +172,7 @@ index 0ea72994..ae3a3c33 100644
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@ -180,7 +180,7 @@ index 0ea72994..ae3a3c33 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_sycl_host_free(buffer->context);
|
||||
@ -189,10 +189,10 @@ index 0ea72994..ae3a3c33 100644
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index e2b357fd..68768029 100644
|
||||
index c0bdb9e1..03d03064 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
@ -200,7 +200,7 @@ index e2b357fd..68768029 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
||||
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
||||
|
@ -10,10 +10,10 @@ logs instead of throwing an error
|
||||
1 file changed, 3 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 9389ca80..806c1b3d 100644
|
||||
index 50ded286..a9ee9f03 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
if (type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
add_space_prefix = false;
|
||||
clean_spaces = true;
|
||||
@ -31,8 +31,8 @@ index 9389ca80..806c1b3d 100644
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -1651,7 +1642,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
|
||||
@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
||||
clean_spaces = false;
|
||||
} else {
|
||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
|
@ -11,10 +11,10 @@ instead of forcing one or the error
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 62246c10..dca22d8b 100644
|
||||
index 5a2eef9b..9c1fe93f 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
int64_t n_outputs_all = 0;
|
||||
|
||||
// count outputs
|
||||
@ -23,7 +23,7 @@ index 62246c10..dca22d8b 100644
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
n_outputs_all += batch.logits[i] != 0;
|
||||
}
|
||||
@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
@ -32,7 +32,7 @@ index 62246c10..dca22d8b 100644
|
||||
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
|
||||
|
||||
if (t_embd && res->get_embd_pooled()) {
|
||||
@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
|
@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
|
||||
fixes loading vision models in llama.cpp on windows
|
||||
filesystems for paths that include wide characters
|
||||
---
|
||||
tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
|
||||
examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 39 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index 41ba45a7..cdd8ca44 100644
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -31,6 +31,19 @@
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index ad3e7df1..b3218c78 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -30,6 +30,19 @@
|
||||
#include <array>
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
|
||||
+#if defined(_WIN32)
|
||||
+#define WIN32_LEAN_AND_MEAN
|
||||
@ -32,8 +32,8 @@ index 41ba45a7..cdd8ca44 100644
|
||||
+
|
||||
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
|
||||
|
||||
enum ffn_op_type {
|
||||
@@ -2190,7 +2203,29 @@ struct clip_model_loader {
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
@@ -1971,7 +1984,29 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
@ -63,7 +63,7 @@ index 41ba45a7..cdd8ca44 100644
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -2217,7 +2252,11 @@ struct clip_model_loader {
|
||||
@@ -1998,7 +2033,11 @@ struct clip_model_loader {
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
|
||||
};
|
||||
|
||||
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
|
||||
index 4cce5166..7f6617fa 100644
|
||||
index ea73a8a7..a012aeae 100644
|
||||
--- a/src/llama-model-loader.cpp
|
||||
+++ b/src/llama-model-loader.cpp
|
||||
@@ -439,6 +439,7 @@ namespace GGUFMeta {
|
||||
@ -150,10 +150,10 @@ index 4cce5166..7f6617fa 100644
|
||||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index 3a4e72a3..831b68c0 100644
|
||||
index 822e2bb2..572378c9 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@ -175,7 +175,7 @@ index 3a4e72a3..831b68c0 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
@ -210,7 +210,7 @@ index 3a4e72a3..831b68c0 100644
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context {
|
||||
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644
|
||||
+ // self-attention
|
||||
+ {
|
||||
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
||||
+
|
||||
+ // compute Q and K and RoPE them
|
||||
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
@ -376,7 +376,7 @@ index 3a4e72a3..831b68c0 100644
|
||||
struct llm_build_wavtokenizer_dec : public llm_graph_context {
|
||||
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
ggml_tensor * cur;
|
||||
@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
|
||||
} break;
|
||||
@ -387,7 +387,7 @@ index 3a4e72a3..831b68c0 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
|
||||
@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
@ -396,10 +396,10 @@ index 3a4e72a3..831b68c0 100644
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index 6bdec263..43746c7d 100644
|
||||
index 95eca002..856e6042 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -65,6 +65,7 @@ enum llm_type {
|
||||
@@ -64,6 +64,7 @@ enum llm_type {
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
@ -407,7 +407,7 @@ index 6bdec263..43746c7d 100644
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
@@ -315,6 +316,8 @@ struct llama_layer {
|
||||
@@ -311,6 +312,8 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_up_scale = nullptr;
|
||||
struct ggml_tensor * ffn_down_scale = nullptr;
|
||||
|
||||
|
1010 llama/patches/0006-add-mllama-support.patch Normal file
File diff suppressed because it is too large
419 llama/patches/0007-add-unpad-operator.patch Normal file
@ -0,0 +1,419 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: jmorganca <jmorganca@gmail.com>
|
||||
Date: Sun, 13 Apr 2025 22:10:06 -0400
|
||||
Subject: [PATCH] add unpad operator
|
||||
|
||||
adds the unpad operator to GGML
|
||||
---
|
||||
ggml/include/ggml.h | 10 +++++
|
||||
ggml/src/ggml-cpu/ggml-cpu.c | 5 +++
|
||||
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cpu/ops.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
|
||||
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/pad.cuh | 1 +
|
||||
ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
|
||||
ggml/src/ggml.c | 25 ++++++++++++-
|
||||
10 files changed, 223 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index 1b8603e7..53ef31b2 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -489,6 +489,7 @@ extern "C" {
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
GGML_OP_PAD,
|
||||
GGML_OP_PAD_REFLECT_1D,
|
||||
+ GGML_OP_UNPAD,
|
||||
GGML_OP_ARANGE,
|
||||
GGML_OP_TIMESTEP_EMBEDDING,
|
||||
GGML_OP_ARGSORT,
|
||||
@@ -1777,6 +1778,15 @@ extern "C" {
|
||||
int p0,
|
||||
int p1);
|
||||
|
||||
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
|
||||
+ GGML_API struct ggml_tensor * ggml_unpad(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * a,
|
||||
+ int p0,
|
||||
+ int p1,
|
||||
+ int p2,
|
||||
+ int p3);
|
||||
+
|
||||
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
||||
// timesteps: [N,]
|
||||
// return: [N, dim]
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 64405449..34624cca 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_pad_reflect_1d(params, tensor);
|
||||
} break;
|
||||
+ case GGML_OP_UNPAD:
|
||||
+ {
|
||||
+ ggml_compute_forward_unpad(params, tensor);
|
||||
+ } break;
|
||||
case GGML_OP_ARANGE:
|
||||
{
|
||||
ggml_compute_forward_arange(params, tensor);
|
||||
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
+ case GGML_OP_UNPAD:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 7413192b..becdae07 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||
}
|
||||
}
|
||||
|
||||
+// ggml_compute_forward_unpad
|
||||
+
|
||||
+static void ggml_compute_forward_unpad_f32(
|
||||
+ const struct ggml_compute_params *params,
|
||||
+ struct ggml_tensor *dst) {
|
||||
+
|
||||
+ const struct ggml_tensor * src0 = dst->src[0];
|
||||
+
|
||||
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
|
||||
+
|
||||
+ const int ith = params->ith;
|
||||
+ const int nth = params->nth;
|
||||
+
|
||||
+ GGML_TENSOR_UNARY_OP_LOCALS
|
||||
+
|
||||
+ float * dst_ptr = (float *) dst->data;
|
||||
+
|
||||
+ // TODO: optimize
|
||||
+
|
||||
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
||||
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
|
||||
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
||||
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
||||
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
|
||||
+
|
||||
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
+
|
||||
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
+ dst_ptr[dst_idx] = *src_ptr;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void ggml_compute_forward_unpad(
|
||||
+ const struct ggml_compute_params * params,
|
||||
+ struct ggml_tensor * dst) {
|
||||
+
|
||||
+ const struct ggml_tensor * src0 = dst->src[0];
|
||||
+
|
||||
+ switch (src0->type) {
|
||||
+ case GGML_TYPE_F32:
|
||||
+ {
|
||||
+ ggml_compute_forward_unpad_f32(params, dst);
|
||||
+ } break;
|
||||
+ default:
|
||||
+ {
|
||||
+ GGML_ABORT("fatal error");
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
// ggml_compute_forward_arange
|
||||
|
||||
static void ggml_compute_forward_arange_f32(
|
||||
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
|
||||
index dc081b9e..a7125555 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.h
|
||||
+++ b/ggml/src/ggml-cpu/ops.h
|
||||
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
|
||||
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 04ce764e..491acccb 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_PAD:
|
||||
ggml_cuda_op_pad(ctx, dst);
|
||||
break;
|
||||
+ case GGML_OP_UNPAD:
|
||||
+ ggml_cuda_op_unpad(ctx, dst);
|
||||
+ break;
|
||||
case GGML_OP_ARANGE:
|
||||
ggml_cuda_op_arange(ctx, dst);
|
||||
break;
|
||||
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||
case GGML_OP_PAD:
|
||||
+ case GGML_OP_UNPAD:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
|
||||
index 77432b04..7d45a7e1 100644
|
||||
--- a/ggml/src/ggml-cuda/pad.cu
|
||||
+++ b/ggml/src/ggml-cuda/pad.cu
|
||||
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
|
||||
}
|
||||
+
|
||||
+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
|
||||
+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
|
||||
+ // blockIdx.y: idx of ne1
|
||||
+ // blockIdx.x: idx of ne0 / BLOCK_SIZE
|
||||
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
+ if (nidx >= ne0) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ // operation
|
||||
+ int offset_dst =
|
||||
+ nidx +
|
||||
+ blockIdx.y * ne0 +
|
||||
+ blockIdx.z * ne0 * gridDim.y;
|
||||
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
|
||||
+ int offset_src =
|
||||
+ nidx +
|
||||
+ blockIdx.y * ne00 +
|
||||
+ blockIdx.z * ne00 * ne01;
|
||||
+ dst[offset_dst] = x[offset_src];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void unpad_f32_cuda(const float * x, float * dst,
|
||||
+ const int ne00, const int ne01, const int ne02, const int ne03,
|
||||
+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
|
||||
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
|
||||
+ dim3 gridDim(num_blocks, ne1, ne2*ne3);
|
||||
+ unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
|
||||
+}
|
||||
+
|
||||
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
+ const ggml_tensor * src0 = dst->src[0];
|
||||
+ const float * src0_d = (const float *)src0->data;
|
||||
+ float * dst_d = (float *)dst->data;
|
||||
+ cudaStream_t stream = ctx.stream();
|
||||
+
|
||||
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||
+
|
||||
+ unpad_f32_cuda(src0_d, dst_d,
|
||||
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
|
||||
+}
|
||||
\ No newline at end of file
|
||||
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
|
||||
index 8fd386b0..e2ededc3 100644
|
||||
--- a/ggml/src/ggml-cuda/pad.cuh
|
||||
+++ b/ggml/src/ggml-cuda/pad.cuh
|
||||
@@ -3,3 +3,4 @@
|
||||
#define CUDA_PAD_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 425524d0..112abef6 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
||||
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
|
||||
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
||||
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
+ case GGML_OP_UNPAD:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
+ } break;
|
||||
+ case GGML_OP_UNPAD:
|
||||
+ {
|
||||
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
+
|
||||
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
|
||||
+
|
||||
+ [encoder setComputePipelineState:pipeline];
|
||||
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
+
|
||||
+ const int nth = MIN(1024, ne0);
|
||||
+
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ARANGE:
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 9f4147e9..6ceb3cef 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
||||
}
|
||||
}
|
||||
|
||||
+kernel void kernel_unpad_f32(
|
||||
+ device const char * src0,
|
||||
+ device char * dst,
|
||||
+ constant int64_t & ne00,
|
||||
+ constant int64_t & ne01,
|
||||
+ constant int64_t & ne02,
|
||||
+ constant int64_t & ne03,
|
||||
+ constant uint64_t & nb00,
|
||||
+ constant uint64_t & nb01,
|
||||
+ constant uint64_t & nb02,
|
||||
+ constant uint64_t & nb03,
|
||||
+ constant int64_t & ne0,
|
||||
+ constant int64_t & ne1,
|
||||
+ constant int64_t & ne2,
|
||||
+ constant int64_t & ne3,
|
||||
+ constant uint64_t & nb0,
|
||||
+ constant uint64_t & nb1,
|
||||
+ constant uint64_t & nb2,
|
||||
+ constant uint64_t & nb3,
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
+ uint3 ntg[[threads_per_threadgroup]]) {
|
||||
+
|
||||
+ const int64_t i3 = tgpig.z;
|
||||
+ const int64_t i2 = tgpig.y;
|
||||
+ const int64_t i1 = tgpig.x;
|
||||
+
|
||||
+ const int64_t i03 = i3;
|
||||
+ const int64_t i02 = i2;
|
||||
+ const int64_t i01 = i1;
|
||||
+
|
||||
+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
+
|
||||
+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
+ if (i0 < ne00) {
|
||||
+ dst_ptr[i0] = src0_ptr[i0];
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
kernel void kernel_arange_f32(
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_arange & args,
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index 7654ae17..3c57aff8 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"UPSCALE",
|
||||
"PAD",
|
||||
"PAD_REFLECT_1D",
|
||||
+ "UNPAD",
|
||||
"ARANGE",
|
||||
"TIMESTEP_EMBEDDING",
|
||||
"ARGSORT",
|
||||
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"OPT_STEP_ADAMW",
|
||||
};
|
||||
|
||||
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"upscale(x)",
|
||||
"pad(x)",
|
||||
"pad_reflect_1d(x)",
|
||||
+ "unpad(x)",
|
||||
"arange(start, stop, step)",
|
||||
"timestep_embedding(timesteps, dim, max_period)",
|
||||
"argsort(x)",
|
||||
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"adamw(x)",
|
||||
};
|
||||
|
||||
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||
return result;
|
||||
}
|
||||
|
||||
+// ggml_unpad
|
||||
+
|
||||
+struct ggml_tensor * ggml_unpad(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * a,
|
||||
+ int p0, int p1, int p2, int p3) {
|
||||
+
|
||||
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
|
||||
+ a->ne[0] - p0,
|
||||
+ a->ne[1] - p1,
|
||||
+ a->ne[2] - p2,
|
||||
+ a->ne[3] - p3);
|
||||
+
|
||||
+ result->op = GGML_OP_UNPAD;
|
||||
+ result->src[0] = a;
|
||||
+
|
||||
+ return result;
|
||||
+}
|
||||
+
|
||||
// ggml_arange
|
||||
|
||||
struct ggml_tensor * ggml_arange(
|
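To illustrate how the new operator is meant to be used, a short sketch (assumptions only: the tensor names and the multiple-of-8 padding are modeled on the mllama vision graph above, not code from this patch). ggml_unpad is the inverse of ggml_pad: it trims p0..p3 elements from the end of each dimension.

// pad the position dimension up to a multiple of 8, run ops that
// prefer the padded shape, then trim the padding back off
const int64_t n_pos     = cur->ne[1];
const int64_t n_padded  = GGML_PAD(n_pos, 8);              // round up to a multiple of 8
const int     n_padding = (int)(n_padded - n_pos);

cur = ggml_pad(ctx0, cur, 0, n_padding, 0, 0);             // [ne0, n_padded, ne2, ne3]
// ... attention / other ops on the padded tensor ...
cur = ggml_unpad(ctx0, cur, 0, n_padding, 0, 0);           // back to [ne0, n_pos, ne2, ne3]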
@ -1,352 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: jmorganca <jmorganca@gmail.com>
|
||||
Date: Tue, 15 Apr 2025 14:27:40 -0400
|
||||
Subject: [PATCH] ensure KV cache is fully defragmented
|
||||
|
||||
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.

In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit,
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
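As a rough worked example of the budget involved (assumed numbers, not taken from this patch): each move costs 6*n_layer graph nodes (a source view, a destination view and a copy, for both K and V), so with a graph budget of 8192 nodes and a 32-layer model, max_moves = (8192 - 2*32) / (6*32) = 8128 / 192 = 42 moves per defrag graph. A badly fragmented cache can therefore need several such graphs, which is what the chunked loop in the diff below provides.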
|
||||
---
|
||||
src/llama-context.cpp | 18 ++++---
|
||||
src/llama-context.h | 1 +
|
||||
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
|
||||
src/llama-kv-cache.h | 12 ++++-
|
||||
4 files changed, 59 insertions(+), 79 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index c22687e4..c5948e8f 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
// find KV slot
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
-
|
||||
- return 1;
|
||||
+ kv_self->defrag_sched(-1.0f);
|
||||
+ kv_self->update(*this);
|
||||
+ if (!kv_self->find_slot(ubatch)) {
|
||||
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
+ return 1;
|
||||
+ }
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
|
||||
|
||||
// TODO: not sure if this is needed
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
-
|
||||
- GGML_ABORT("TODO: handle this error");
|
||||
+ kv_self->defrag_sched(-1.0f);
|
||||
+ kv_self->update(*this);
|
||||
+ if (!kv_self->find_slot(ubatch)) {
|
||||
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
+ GGML_ABORT("TODO: handle this error");
|
||||
+ }
|
||||
}
|
||||
|
||||
auto * gf = graph_init();
|
||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||
index c0ceacb1..0264e937 100644
|
||||
--- a/src/llama-context.h
|
||||
+++ b/src/llama-context.h
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-adapter.h"
|
||||
+#include "llama-kv-cache.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
#include "ggml-opt.h"
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 3dcad65b..60e67b03 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() {
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
- bool need_reserve = false;
|
||||
-
|
||||
auto * sched = lctx.get_sched();
|
||||
|
||||
if (has_shift) {
|
||||
@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
lctx.graph_compute(gf, false);
|
||||
-
|
||||
- need_reserve = true;
|
||||
}
|
||||
|
||||
{
|
||||
@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
|
||||
if (do_defrag) {
|
||||
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
|
||||
+ const uint32_t n_max_nodes = lctx.graph_max_nodes();
|
||||
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
|
||||
+ if (!defrag_prepare(n_max_nodes)) {
|
||||
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
|
||||
+ std::vector<struct llama_kv_defrag_move> chunk;
|
||||
+ auto end = std::min(i + max_moves, defrag_info.moves.size());
|
||||
+ chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
|
||||
|
||||
- if (defrag_prepare(lctx.graph_max_nodes())) {
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
auto * gf = lctx.graph_init();
|
||||
|
||||
- auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
|
||||
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched, gf);
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
lctx.graph_compute(gf, false);
|
||||
-
|
||||
- need_reserve = true;
|
||||
}
|
||||
|
||||
do_defrag = false;
|
||||
}
|
||||
|
||||
- return need_reserve;
|
||||
+ // we never need to reserve a worst case graph
|
||||
+ return false;
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::defrag_sched(float thold) {
|
||||
@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
- ggml_cgraph * gf) const {
|
||||
+ ggml_cgraph * gf,
|
||||
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
|
||||
- const auto & ids = defrag_info.ids;
|
||||
-
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
- for (uint32_t i = 0; i < ids.size(); ++i) {
|
||||
- const uint32_t id = ids[i];
|
||||
-
|
||||
- if (i == id || id == ids.size()) {
|
||||
- continue;
|
||||
- }
|
||||
-
|
||||
- uint32_t nm = 1;
|
||||
-
|
||||
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
||||
- nm++;
|
||||
- }
|
||||
-
|
||||
+ for (const auto & move : moves) {
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
|
||||
- n_embd_k_gqa, nm,
|
||||
+ n_embd_k_gqa, move.len,
|
||||
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
|
||||
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
|
||||
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));
|
||||
|
||||
ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
|
||||
- n_embd_k_gqa, nm,
|
||||
+ n_embd_k_gqa, move.len,
|
||||
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
|
||||
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
|
||||
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));
|
||||
|
||||
ggml_tensor * view_v_src;
|
||||
ggml_tensor * view_v_dst;
|
||||
@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
if (cparams.flash_attn) {
|
||||
// NOTE: the V cache is not transposed when using flash attention
|
||||
view_v_src = ggml_view_2d(ctx, v_l[il],
|
||||
- n_embd_v_gqa, nm,
|
||||
+ n_embd_v_gqa, move.len,
|
||||
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
|
||||
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
|
||||
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx, v_l[il],
|
||||
- n_embd_v_gqa, nm,
|
||||
+ move.len, n_embd_v_gqa,
|
||||
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
|
||||
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
|
||||
+ ggml_row_size(v_l[il]->type, move.src));
|
||||
} else {
|
||||
view_v_src = ggml_view_2d(ctx, v_l[il],
|
||||
- nm, n_embd_v_gqa,
|
||||
+ move.len, n_embd_v_gqa,
|
||||
ggml_row_size(v_l[il]->type, size),
|
||||
- ggml_row_size(v_l[il]->type, i));
|
||||
+ ggml_row_size(v_l[il]->type, move.src));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx, v_l[il],
|
||||
- nm, n_embd_v_gqa,
|
||||
+ move.len, n_embd_v_gqa,
|
||||
ggml_row_size(v_l[il]->type, size),
|
||||
- ggml_row_size(v_l[il]->type, id));
|
||||
+ ggml_row_size(v_l[il]->type, move.dst));
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
|
||||
}
|
||||
-
|
||||
- i += nm - 1;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
||||
@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
|
||||
assert(n_used <= n_kv);
|
||||
|
||||
- //const int64_t t_start = ggml_time_us();
|
||||
-
|
||||
- // number of cells moved
|
||||
- uint32_t n_moves = 0;
|
||||
-
|
||||
- // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
|
||||
- // - source view, destination view, copy operation
|
||||
- // - x2 for keys and values
|
||||
- //const uint32_t max_moves = max_nodes()/(6*n_layer);
|
||||
- // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
|
||||
- const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
|
||||
+ defrag_info.moves.clear();
|
||||
|
||||
// determine which KV cells to move where
|
||||
//
|
||||
@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
//
|
||||
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
|
||||
//
|
||||
- auto & ids = defrag_info.ids;
|
||||
-
|
||||
- ids.clear();
|
||||
- ids.resize(n_kv, n_kv);
|
||||
+ std::vector<uint32_t> ids(n_kv, n_kv);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
||||
const auto & cell0 = cells[i0];
|
||||
@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
// are we moving a continuous block of memory?
|
||||
bool cont = false;
|
||||
|
||||
- // should we stop searching for the next move?
|
||||
- bool stop = false;
|
||||
-
|
||||
// go back and move the nf cells to the hole
|
||||
for (; i1 < n_kv; ++i1) {
|
||||
auto & cell1 = cells[i1];
|
||||
|
||||
if (cell1.is_empty() || ids[i1] != n_kv) {
|
||||
- if (n_moves == max_moves) {
|
||||
- stop = true;
|
||||
- break;
|
||||
- }
|
||||
-
|
||||
cont = false;
|
||||
continue;
|
||||
}
|
||||
@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
head = n_used;
|
||||
|
||||
if (!cont) {
|
||||
- n_moves++;
|
||||
+ defrag_info.moves.push_back({i1, i0 + nf, 1});
|
||||
cont = true;
|
||||
+ } else {
|
||||
+ defrag_info.moves.back().len++;
|
||||
}
|
||||
|
||||
nf++;
|
||||
@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
}
|
||||
}
|
||||
|
||||
- if (stop || n_moves == max_moves) {
|
||||
- break;
|
||||
- }
|
||||
-
|
||||
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
||||
|
||||
i0 += nh - 1;
|
||||
}
|
||||
|
||||
- if (n_moves == 0) {
|
||||
+ if (defrag_info.moves.size() == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
- LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
|
||||
-
|
||||
- LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
|
||||
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
||||
|
||||
return true;
|
||||
}
|
||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
||||
index bf3b4b6a..928b9712 100644
|
||||
--- a/src/llama-kv-cache.h
|
||||
+++ b/src/llama-kv-cache.h
|
||||
@@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
|
||||
private:
|
||||
llama_kv_cache * kv;
|
||||
};
|
||||
+
|
||||
+// block of KV slots to move when defragging
|
||||
+struct llama_kv_defrag_move {
|
||||
+ uint32_t src;
|
||||
+ uint32_t dst;
|
||||
+ uint32_t len;
|
||||
+};
|
||||
|
||||
//
|
||||
// llama_kv_cache_unified
|
||||
@@ -207,7 +214,7 @@ private:
|
||||
|
||||
// defrag
|
||||
struct {
|
||||
- std::vector<uint32_t> ids;
|
||||
+ std::vector<llama_kv_defrag_move> moves;
|
||||
} defrag_info;
|
||||
|
||||
// return true if cells have been moved
|
||||
@@ -249,7 +256,8 @@ private:
|
||||
llm_graph_result_ptr build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
- ggml_cgraph * gf) const;
|
||||
+ ggml_cgraph * gf,
|
||||
+ const std::vector<llama_kv_defrag_move> & moves) const;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
|
@ -12,10 +12,10 @@ regex
|
||||
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 806c1b3d..10f34d33 100644
|
||||
index a9ee9f03..1306864e 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
||||
regex_exprs = {
|
||||
"[\r\n]",
|
361
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
Normal file
@ -0,0 +1,361 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 15 Apr 2025 14:27:40 -0400
Subject: [PATCH] ensure KV cache is fully defragmented

Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.

In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit,
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
|
||||
---
|
||||
src/llama-context.cpp | 105 +++++++++++++----------------------------
|
||||
src/llama-context.h | 4 +-
|
||||
src/llama-kv-cache.cpp | 39 +++------------
|
||||
src/llama-kv-cache.h | 9 +++-
|
||||
4 files changed, 51 insertions(+), 106 deletions(-)
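(Editorial note, not part of the patch: the chunking strategy described above can be sketched in isolation as below. The struct mirrors the llama_kv_defrag_move type this patch introduces; run_chunk(), the layer count, and the node budget are placeholder stand-ins for the scheduler/graph calls in llama-context.cpp, while the max_moves formula is the one used in the patch.)

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// mirrors the struct added by this patch: a contiguous block of KV slots to move
struct llama_kv_defrag_move {
    uint32_t src;
    uint32_t dst;
    uint32_t len;
};

// placeholder for: reset the scheduler, build the defrag graph for `chunk`, compute it
static void run_chunk(const std::vector<llama_kv_defrag_move> & chunk) {
    std::printf("defrag: processing %zu move(s)\n", chunk.size());
}

int main() {
    const uint32_t n_layer     = 32;   // placeholder model depth
    const uint32_t n_max_nodes = 8192; // placeholder graph node budget

    // each move needs ~6 tensors per layer (src view, dst view, copy; x2 for K and V)
    const uint32_t max_moves = (n_max_nodes - 2*n_layer) / (6*n_layer);

    std::vector<llama_kv_defrag_move> moves = {
        {100, 0, 40}, {200, 40, 40}, {300, 80, 40}, {400, 120, 40}, {500, 160, 40},
    };

    // instead of stopping at max_moves, split the work across multiple graphs
    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
        const std::size_t end = std::min<std::size_t>(i + max_moves, moves.size());
        std::vector<llama_kv_defrag_move> chunk(moves.begin() + i, moves.begin() + end);
        run_chunk(chunk);
    }
    return 0;
}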
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index cd06ad91..77177c5e 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
|
||||
llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
- ggml_cgraph * gf) const {
|
||||
+ ggml_cgraph * gf,
|
||||
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
- const auto & ids = kv_self->defrag_info.ids;
|
||||
-
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
- for (uint32_t i = 0; i < ids.size(); ++i) {
|
||||
- const uint32_t id = ids[i];
|
||||
-
|
||||
- if (i == id || id == ids.size()) {
|
||||
- continue;
|
||||
- }
|
||||
-
|
||||
- uint32_t nm = 1;
|
||||
-
|
||||
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
||||
- nm++;
|
||||
- }
|
||||
-
|
||||
+ for (const auto & move : moves) {
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
|
||||
- n_embd_k_gqa, nm,
|
||||
+ n_embd_k_gqa, move.len,
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
|
||||
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
|
||||
|
||||
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
|
||||
- n_embd_k_gqa, nm,
|
||||
+ n_embd_k_gqa, move.len,
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
|
||||
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
|
||||
|
||||
ggml_tensor * view_v_src;
|
||||
ggml_tensor * view_v_dst;
|
||||
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
if (cparams.flash_attn) {
|
||||
// NOTE: the V cache is not transposed when using flash attention
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
- n_embd_v_gqa, nm,
|
||||
+ n_embd_v_gqa, move.len,
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
|
||||
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
|
||||
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
- n_embd_v_gqa, nm,
|
||||
+ n_embd_v_gqa, move.len,
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
|
||||
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
|
||||
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
|
||||
} else {
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
- nm, n_embd_v_gqa,
|
||||
+ move.len, n_embd_v_gqa,
|
||||
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
|
||||
- ggml_row_size(kv_self->v_l[il]->type, i));
|
||||
+ ggml_row_size(kv_self->v_l[il]->type, move.src));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
- nm, n_embd_v_gqa,
|
||||
+ move.len, n_embd_v_gqa,
|
||||
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
|
||||
- ggml_row_size(kv_self->v_l[il]->type, id));
|
||||
+ ggml_row_size(kv_self->v_l[il]->type, move.dst));
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
||||
}
|
||||
-
|
||||
- i += nm - 1;
|
||||
}
|
||||
-
|
||||
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
||||
#endif
|
||||
|
||||
return res;
|
||||
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
void llama_context::kv_self_update() {
|
||||
auto & kv = kv_self;
|
||||
|
||||
- bool need_reserve = false;
|
||||
-
|
||||
if (kv->has_shift) {
|
||||
if (!kv->get_can_shift()) {
|
||||
GGML_ABORT("The current context does not support K-shift");
|
||||
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
graph_compute(gf, false);
|
||||
-
|
||||
- need_reserve = true;
|
||||
}
|
||||
|
||||
{
|
||||
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
|
||||
// defragment the KV cache if needed
|
||||
if (kv->do_defrag) {
|
||||
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
|
||||
+ const uint32_t n_max_nodes = graph_max_nodes();
|
||||
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
|
||||
+ if (!kv->defrag_prepare(n_max_nodes)) {
|
||||
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
|
||||
+ return;
|
||||
+ }
|
||||
|
||||
- if (kv->defrag_prepare(graph_max_nodes())) {
|
||||
- ggml_backend_sched_reset(sched.get());
|
||||
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
|
||||
+ std::vector<struct llama_kv_defrag_move> chunk;
|
||||
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
|
||||
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
|
||||
|
||||
+ ggml_backend_sched_reset(sched.get());
|
||||
auto * gf = graph_init();
|
||||
-
|
||||
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
|
||||
-
|
||||
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
-
|
||||
res->set_inputs(nullptr);
|
||||
-
|
||||
graph_compute(gf, false);
|
||||
-
|
||||
- need_reserve = true;
|
||||
}
|
||||
|
||||
kv->do_defrag = false;
|
||||
}
|
||||
-
|
||||
- // reserve a worst case graph if needed
|
||||
- if (need_reserve) {
|
||||
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
|
||||
-
|
||||
- // build worst-case graph
|
||||
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
|
||||
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
-
|
||||
- // simulate full KV cache
|
||||
- kv_self->n = kv_self->size;
|
||||
-
|
||||
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
|
||||
-
|
||||
- auto * gf = graph_init();
|
||||
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
|
||||
-
|
||||
- // initialize scheduler with the worst-case graph
|
||||
- ggml_backend_sched_reset(sched.get());
|
||||
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
||||
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||
- }
|
||||
- }
|
||||
}
|
||||
|
||||
enum llama_pooling_type llama_context::pooling_type() const {
|
||||
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// find KV slot
|
||||
{
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
-
|
||||
- return 1;
|
||||
+ kv_self->defrag();
|
||||
+ kv_self_update();
|
||||
+ if (!kv_self->find_slot(ubatch)) {
|
||||
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
+ return 1;
|
||||
+ }
|
||||
}
|
||||
|
||||
if (!kv_self->recurrent) {
|
||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||
index a50c4afa..30f84bfd 100644
|
||||
--- a/src/llama-context.h
|
||||
+++ b/src/llama-context.h
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-adapter.h"
|
||||
+#include "llama-kv-cache.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
@@ -179,7 +180,8 @@ private:
|
||||
|
||||
llm_graph_result_ptr build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
- ggml_cgraph * gf) const;
|
||||
+ ggml_cgraph * gf,
|
||||
+ const std::vector<struct llama_kv_defrag_move> & moves) const;
|
||||
|
||||
// TODO: read/write lora adapters and cvec
|
||||
size_t state_write_data(llama_io_write_i & io);
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 69f8d35a..35a750d3 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
|
||||
assert(n_used <= n_kv);
|
||||
|
||||
- //const int64_t t_start = ggml_time_us();
|
||||
-
|
||||
- // number of cells moved
|
||||
- uint32_t n_moves = 0;
|
||||
-
|
||||
- // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
|
||||
- // - source view, destination view, copy operation
|
||||
- // - x2 for keys and values
|
||||
- //const uint32_t max_moves = max_nodes()/(6*n_layer);
|
||||
- // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
|
||||
- const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
|
||||
+ defrag_info.moves.clear();
|
||||
|
||||
// determine which KV cells to move where
|
||||
//
|
||||
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
//
|
||||
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
|
||||
//
|
||||
- auto & ids = defrag_info.ids;
|
||||
-
|
||||
- ids.clear();
|
||||
- ids.resize(n_kv, n_kv);
|
||||
+ std::vector<uint32_t> ids(n_kv, n_kv);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
||||
const auto & cell0 = cells[i0];
|
||||
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
// are we moving a continuous block of memory?
|
||||
bool cont = false;
|
||||
|
||||
- // should we stop searching for the next move?
|
||||
- bool stop = false;
|
||||
-
|
||||
// go back and move the nf cells to the hole
|
||||
for (; i1 < n_kv; ++i1) {
|
||||
auto & cell1 = cells[i1];
|
||||
|
||||
if (cell1.is_empty() || ids[i1] != n_kv) {
|
||||
- if (n_moves == max_moves) {
|
||||
- stop = true;
|
||||
- break;
|
||||
- }
|
||||
-
|
||||
cont = false;
|
||||
continue;
|
||||
}
|
||||
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
head = n_used;
|
||||
|
||||
if (!cont) {
|
||||
- n_moves++;
|
||||
+ defrag_info.moves.push_back({i1, i0 + nf, 1});
|
||||
cont = true;
|
||||
+ } else {
|
||||
+ defrag_info.moves.back().len++;
|
||||
}
|
||||
|
||||
nf++;
|
||||
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
}
|
||||
}
|
||||
|
||||
- if (stop || n_moves == max_moves) {
|
||||
- break;
|
||||
- }
|
||||
-
|
||||
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
||||
|
||||
i0 += nh - 1;
|
||||
}
|
||||
|
||||
- if (n_moves == 0) {
|
||||
+ if (defrag_info.moves.size() == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
||||
-
|
||||
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
|
||||
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
||||
|
||||
return true;
|
||||
}
|
||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
||||
index 56c74035..25cbcb56 100644
|
||||
--- a/src/llama-kv-cache.h
|
||||
+++ b/src/llama-kv-cache.h
|
||||
@@ -43,6 +43,13 @@ private:
|
||||
llama_kv_cache * kv;
|
||||
};
|
||||
|
||||
+// block of KV slots to move when defragging
|
||||
+struct llama_kv_defrag_move {
|
||||
+ uint32_t src;
|
||||
+ uint32_t dst;
|
||||
+ uint32_t len;
|
||||
+};
|
||||
+
|
||||
struct llama_kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
@@ -131,7 +138,7 @@ public:
|
||||
// defrag
|
||||
|
||||
struct {
|
||||
- std::vector<uint32_t> ids;
|
||||
+ std::vector<llama_kv_defrag_move> moves;
|
||||
} defrag_info;
|
||||
|
||||
// return true if cells have been moved
|
@ -11,7 +11,7 @@ with the fastest acceleration is loaded
|
||||
1 file changed, 13 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 405d8e31..4e67d243 100644
|
||||
index 82ae1b5b..1487f322 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {
|
@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index ddea5ad3..45918bf6 100644
|
||||
index 43d9fc4f..4c0d3824 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems
|
||||
1 file changed, 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index 45918bf6..0beaed86 100644
|
||||
index 4c0d3824..79c26312 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
@ -9,8 +9,8 @@ such as vocab fields
|
||||
---
|
||||
ggml/include/gguf.h | 1 +
|
||||
ggml/src/gguf.cpp | 7 +++++--
|
||||
src/llama-vocab.cpp | 4 +---
|
||||
3 files changed, 7 insertions(+), 5 deletions(-)
|
||||
src/llama-vocab.cpp | 2 +-
|
||||
3 files changed, 7 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
|
||||
index 79ee2020..3efb22f0 100644
|
||||
@ -53,17 +53,15 @@ index 381a9c7d..e45b453d 100644
|
||||
}
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 10f34d33..9f5fd57b 100644
|
||||
index 1306864e..d6515ff6 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
||||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||||
if (precompiled_charsmap_keyidx != -1) {
|
||||
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
||||
- GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
|
||||
-
|
||||
- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
|
||||
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
||||
- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
|
||||
+ size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
||||
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||||
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||
#ifdef IS_BIG_ENDIAN
|
@ -1,277 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <git@mxy.ng>
|
||||
Date: Thu, 1 May 2025 13:45:12 -0700
|
||||
Subject: [PATCH] add argsort and cuda copy for i32
|
||||
|
||||
---
|
||||
ggml/src/ggml-cpu/ops.cpp | 43 ++++++++++++++
|
||||
ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/cpy.cu | 49 ++++++++++++++++
|
||||
3 files changed, 192 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index becdae07..7a44b6cf 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
}
|
||||
}
|
||||
|
||||
+static void ggml_compute_forward_argsort_i32(
|
||||
+ const ggml_compute_params * params,
|
||||
+ ggml_tensor * dst) {
|
||||
+
|
||||
+ const ggml_tensor * src0 = dst->src[0];
|
||||
+
|
||||
+ GGML_TENSOR_UNARY_OP_LOCALS
|
||||
+
|
||||
+ GGML_ASSERT(nb0 == sizeof(int32_t));
|
||||
+
|
||||
+ const int ith = params->ith;
|
||||
+ const int nth = params->nth;
|
||||
+
|
||||
+ const int64_t nr = ggml_nrows(src0);
|
||||
+
|
||||
+ ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
||||
+
|
||||
+ for (int64_t i = ith; i < nr; i += nth) {
|
||||
+ int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
||||
+ const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
|
||||
+
|
||||
+ for (int64_t j = 0; j < ne0; j++) {
|
||||
+ dst_data[j] = j;
|
||||
+ }
|
||||
+
|
||||
+ // C doesn't have a functional sort, so we do a bubble sort instead
|
||||
+ for (int64_t j = 0; j < ne0; j++) {
|
||||
+ for (int64_t k = j + 1; k < ne0; k++) {
|
||||
+ if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
||||
+ (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
||||
+ int32_t tmp = dst_data[j];
|
||||
+ dst_data[j] = dst_data[k];
|
||||
+ dst_data[k] = tmp;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void ggml_compute_forward_argsort(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
|
||||
{
|
||||
ggml_compute_forward_argsort_f32(params, dst);
|
||||
} break;
|
||||
+ case GGML_TYPE_I32:
|
||||
+ {
|
||||
+ ggml_compute_forward_argsort_i32(params, dst);
|
||||
+ } break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
|
||||
index 607ded85..53b02634 100644
|
||||
--- a/ggml/src/ggml-cuda/argsort.cu
|
||||
+++ b/ggml/src/ggml-cuda/argsort.cu
|
||||
@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
|
||||
}
|
||||
}
|
||||
|
||||
+
|
||||
+template<ggml_sort_order order>
|
||||
+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
|
||||
+ extern __shared__ int shared_mem[];
|
||||
+ int * indices = shared_mem;
|
||||
+
|
||||
+ const int tid = threadIdx.x;
|
||||
+ const int row = blockIdx.y;
|
||||
+
|
||||
+ // Initialize all indices, handling the case where threads < ncols_pad
|
||||
+ for (int i = tid; i < ncols_pad; i += blockDim.x) {
|
||||
+ indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+
|
||||
+ // Bitonic sort
|
||||
+ for (int k = 2; k <= ncols_pad; k *= 2) {
|
||||
+ for (int j = k/2; j > 0; j /= 2) {
|
||||
+ for (int i = tid; i < ncols_pad; i += blockDim.x) {
|
||||
+ const int ij = i ^ j;
|
||||
+ if (ij > i) {
|
||||
+ // Only compare values within the actual data range
|
||||
+ if (i < ncols && ij < ncols) {
|
||||
+ if ((i & k) == 0) {
|
||||
+ if (order == GGML_SORT_ORDER_ASC) {
|
||||
+ if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
|
||||
+ int tmp = indices[i];
|
||||
+ indices[i] = indices[ij];
|
||||
+ indices[ij] = tmp;
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
|
||||
+ int tmp = indices[i];
|
||||
+ indices[i] = indices[ij];
|
||||
+ indices[ij] = tmp;
|
||||
+ }
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (order == GGML_SORT_ORDER_ASC) {
|
||||
+ if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
|
||||
+ int tmp = indices[i];
|
||||
+ indices[i] = indices[ij];
|
||||
+ indices[ij] = tmp;
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
|
||||
+ int tmp = indices[i];
|
||||
+ indices[i] = indices[ij];
|
||||
+ indices[ij] = tmp;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ // Write sorted indices to output, only threads handling valid data
|
||||
+ for (int i = tid; i < ncols; i += blockDim.x) {
|
||||
+ dst[row * ncols + i] = indices[i];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
|
||||
+ // Bitonic sort requires ncols to be power of 2
|
||||
+ const int ncols_pad = next_power_of_2(ncols);
|
||||
+
|
||||
+ // Ensure thread count doesn't exceed maximum (typically 1024)
|
||||
+ const int max_threads = 1024; // This is the typical max for most GPUs
|
||||
+ const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
|
||||
+
|
||||
+ const dim3 block_dims(threads_per_block, 1, 1);
|
||||
+ const dim3 block_nums(1, nrows, 1);
|
||||
+ const size_t shared_mem = ncols_pad * sizeof(int);
|
||||
+
|
||||
+ // Check if shared memory size is within limits
|
||||
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
|
||||
+
|
||||
+ // Instead of logging an error, use GGML_ASSERT with a descriptive message
|
||||
+ GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
|
||||
+
|
||||
+ // Launch kernels with the updated thread configuration
|
||||
+ if (order == GGML_SORT_ORDER_ASC) {
|
||||
+ k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
|
||||
+ } else if (order == GGML_SORT_ORDER_DESC) {
|
||||
+ k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
|
||||
+ } else {
|
||||
+ GGML_ABORT("fatal error");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+
|
||||
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
|
||||
|
||||
- argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
|
||||
+ if (src0->type == GGML_TYPE_I32) {
|
||||
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
|
||||
+ } else {
|
||||
+ argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
|
||||
+ }
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
|
||||
index 2d46176e..47383486 100644
|
||||
--- a/ggml/src/ggml-cuda/cpy.cu
|
||||
+++ b/ggml/src/ggml-cuda/cpy.cu
|
||||
@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||
+ const int32_t * xi = (const int32_t *) cxi;
|
||||
+ int32_t * dsti = (int32_t *) cdsti;
|
||||
+
|
||||
+ *dsti = *xi;
|
||||
+}
|
||||
+
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
+// First, add this template function after the other template functions
|
||||
+template <cpy_kernel_t cpy_1>
|
||||
+static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
|
||||
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
+ const int nb12, const int nb13) {
|
||||
+ const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
+
|
||||
+ if (i >= ne) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
|
||||
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
+ const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
+
|
||||
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
|
||||
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
+ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
+
|
||||
+ cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
+}
|
||||
+
|
||||
+// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
|
||||
+static void ggml_cpy_i32_i32_cuda(
|
||||
+ const char * cx, char * cdst, const int ne,
|
||||
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
|
||||
+
|
||||
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
+ cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
+}
|
||||
+
|
||||
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q8_0 * dsti = (block_q8_0 *) cdsti;
|
||||
@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
+ ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
||||
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
+ return (void*) cpy_i32_i32<cpy_1_i32_i32>;
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index a30e67f2..2462d2b8 100644
|
||||
index 34624cca..59bd3c62 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -15,6 +15,8 @@
|
||||
@ -20,7 +20,7 @@ index a30e67f2..2462d2b8 100644
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||
@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
||||
index 804b11e0..15a10ca8 100644
|
||||
index c0a5f934..75731053 100644
|
||||
--- a/src/llama-sampling.cpp
|
||||
+++ b/src/llama-sampling.cpp
|
||||
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
@ -0,0 +1,38 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jesse Gross <jesse@kernel.org>
|
||||
Date: Thu, 1 May 2025 13:46:10 -0700
|
||||
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
|
||||
|
||||
The following scenario will cause an assertion failure in the graph
|
||||
allocator:
|
||||
- Build and allocate a graph containing a tensor with a non-NULL data
|
||||
pointer
|
||||
- Build and allocate a new graph where that data is NULL
|
||||
|
||||
Result:
|
||||
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
|
||||
|
||||
This happens during revalidation because we think that memory should
|
||||
have been previously allocated based on the current graph but in
|
||||
reality the previous graph was different. In this situation, we
|
||||
should do a full reallocation pass.
|
||||
---
|
||||
ggml/src/ggml-alloc.c | 5 ++++-
|
||||
1 file changed, 4 insertions(+), 1 deletion(-)
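(Editorial note, not part of the patch: a simplified restatement of the check being changed, with placeholder names; the real logic lives in ggml_gallocr_node_needs_realloc in ggml-alloc.c, shown in the hunk below.)

#include <cstddef>
#include <cstdio>

// placeholder mirror of the per-node bookkeeping kept by the allocator
struct tensor_alloc_sketch {
    int    buffer_id; // < 0 means no buffer was recorded for this node
    size_t size_max;  // size reserved for the node in the previous pass
};

// returns true when the previous allocation still covers this node;
// returning false makes the caller fall back to a full reallocation pass
static bool node_fits_previous_alloc(const tensor_alloc_sketch & talloc,
                                     bool has_data_or_view,
                                     size_t node_size_if_unallocated) {
    size_t node_size = 0;
    if (!has_data_or_view) {
        // before this patch: GGML_ASSERT(talloc.buffer_id >= 0) aborted here;
        // after: a missing buffer simply requests a full realloc instead
        if (talloc.buffer_id < 0) {
            return false;
        }
        node_size = node_size_if_unallocated;
    }
    return talloc.size_max >= node_size;
}

int main() {
    tensor_alloc_sketch t{ -1, 0 };
    // a node that lost its data and has no recorded buffer -> full realloc needed
    std::printf("fits previous alloc: %d\n", node_fits_previous_alloc(t, false, 128));
    return 0;
}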
|
||||
|
||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
||||
index a3d3f690..5fd379f6 100644
|
||||
--- a/ggml/src/ggml-alloc.c
|
||||
+++ b/ggml/src/ggml-alloc.c
|
||||
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
||||
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
|
||||
size_t node_size = 0;
|
||||
if (!node->data && !node->view_src) {
|
||||
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
|
||||
+ // If we previously had data but don't now then reallocate
|
||||
+ if (talloc->buffer_id < 0) {
|
||||
+ return false;
|
||||
+ }
|
||||
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
|
||||
}
|
||||
return talloc->size_max >= node_size;
|
6
llama/sampling_ext.cpp
vendored
@ -19,6 +19,9 @@ struct common_sampler *common_sampler_cinit(const struct llama_model *model, str
|
||||
sparams.penalty_repeat = params->penalty_repeat;
|
||||
sparams.penalty_freq = params->penalty_freq;
|
||||
sparams.penalty_present = params->penalty_present;
|
||||
sparams.mirostat = params->mirostat;
|
||||
sparams.mirostat_tau = params->mirostat_tau;
|
||||
sparams.mirostat_eta = params->mirostat_eta;
|
||||
sparams.seed = params->seed;
|
||||
sparams.grammar = params->grammar;
|
||||
sparams.xtc_probability = 0.0;
|
||||
@ -114,9 +117,6 @@ void grammar_free(struct llama_grammar *g) {
|
||||
if (g->vocab != nullptr) {
|
||||
delete g->vocab;
|
||||
}
|
||||
if (g->o_vocab != nullptr) {
|
||||
delete g->o_vocab;
|
||||
}
|
||||
llama_grammar_free_impl(g);
|
||||
}
|
||||
}
|
||||
|
3
llama/sampling_ext.h
vendored
@ -20,6 +20,9 @@ extern "C"
|
||||
float penalty_repeat;
|
||||
float penalty_freq;
|
||||
float penalty_present;
|
||||
int32_t mirostat;
|
||||
float mirostat_tau;
|
||||
float mirostat_eta;
|
||||
uint32_t seed;
|
||||
char *grammar;
|
||||
};
|
||||
|
@ -1,12 +1,9 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"maps"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@ -111,8 +108,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
|
||||
|
||||
for _, projector := range projectors {
|
||||
weight := projectorMemoryRequirements(projector)
|
||||
weight, graph := projectorMemoryRequirements(projector)
|
||||
projectorWeights += weight
|
||||
projectorGraph += graph
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
@ -122,10 +120,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
}
|
||||
|
||||
layers := f.Tensors().GroupLayers()
|
||||
// add one layer (chosing the max layer) worth of memory as a buffer
|
||||
layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
|
||||
return cmp.Compare(a.Size(), b.Size())
|
||||
}).Size()
|
||||
// add one layer worth of memory as a buffer
|
||||
if blk0, ok := layers["blk.0"]; ok {
|
||||
layerSize = blk0.Size()
|
||||
} else {
|
||||
slog.Warn("model missing blk.0 layer size")
|
||||
}
|
||||
|
||||
var kvct string
|
||||
if envconfig.FlashAttention() &&
|
||||
@ -219,7 +219,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
}
|
||||
|
||||
// For all the layers, find where they can fit on the GPU(s)
|
||||
for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
|
||||
for i := range int(f.KV().BlockCount()) {
|
||||
// Some models have inconsistent layer sizes
|
||||
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
|
||||
layerSize = blk.Size()
|
||||
@ -229,7 +229,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
|
||||
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
|
||||
// Stop allocating on GPU(s) once we hit the users target NumGPU
|
||||
overflow += layerSize
|
||||
continue
|
||||
}
|
||||
|
||||
@ -246,13 +245,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
|
||||
}
|
||||
}
|
||||
|
||||
if len(gpusWithSpace) == 0 {
|
||||
overflow += layerSize
|
||||
}
|
||||
}
|
||||
if layerCount >= int(f.KV().BlockCount()) {
|
||||
fullyLoaded = true
|
||||
} else {
|
||||
for i := layerCount; i < int(f.KV().BlockCount()); i++ {
|
||||
overflow += layerSize
|
||||
}
|
||||
}
|
||||
|
||||
// Determine if we need to consider output then find where it fits
|
||||
@ -408,21 +407,51 @@ func (m MemoryEstimate) LogValue() slog.Value {
|
||||
return slog.GroupValue(attrs...)
|
||||
}
|
||||
|
||||
func projectorMemoryRequirements(filename string) (weights uint64) {
|
||||
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return 0
|
||||
return 0, 0
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ggml, _, err := ggml.Decode(file, 1024)
|
||||
if err != nil {
|
||||
return 0
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
for _, layer := range ggml.Tensors().GroupLayers() {
|
||||
weights += layer.Size()
|
||||
}
|
||||
|
||||
return weights
|
||||
switch arch := ggml.KV().Architecture(); arch {
|
||||
case "mllama":
|
||||
kv := func(n string) uint64 {
|
||||
if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
|
||||
return uint64(v)
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
imageSize := kv("image_size")
|
||||
|
||||
maxNumTiles := kv("max_num_tiles")
|
||||
embeddingLength := kv("embedding_length")
|
||||
headCount := kv("attention.head_count")
|
||||
|
||||
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
|
||||
if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
|
||||
numPatches++
|
||||
}
|
||||
|
||||
numPaddedPatches := numPatches + 8 - (numPatches%8)%8
|
||||
|
||||
graphSize = 4 * (8 +
|
||||
imageSize*imageSize*kv("num_channels")*maxNumTiles +
|
||||
embeddingLength*numPatches*maxNumTiles +
|
||||
9*embeddingLength*numPaddedPatches*maxNumTiles +
|
||||
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
|
||||
}
|
||||
|
||||
return weights, graphSize
|
||||
}
|
||||
|
@ -17,7 +17,6 @@ import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
@ -31,37 +30,9 @@ import (
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llama"
|
||||
"github.com/ollama/ollama/logutil"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type filteredEnv []string
|
||||
|
||||
func (e filteredEnv) LogValue() slog.Value {
|
||||
var attrs []slog.Attr
|
||||
for _, env := range e {
|
||||
if key, value, ok := strings.Cut(env, "="); ok {
|
||||
switch {
|
||||
case strings.HasPrefix(key, "OLLAMA_"),
|
||||
strings.HasPrefix(key, "CUDA_"),
|
||||
strings.HasPrefix(key, "ROCR_"),
|
||||
strings.HasPrefix(key, "ROCM_"),
|
||||
strings.HasPrefix(key, "HIP_"),
|
||||
strings.HasPrefix(key, "GPU_"),
|
||||
strings.HasPrefix(key, "HSA_"),
|
||||
strings.HasPrefix(key, "GGML_"),
|
||||
slices.Contains([]string{
|
||||
"PATH",
|
||||
"LD_LIBRARY_PATH",
|
||||
"DYLD_LIBRARY_PATH",
|
||||
}, key):
|
||||
attrs = append(attrs, slog.String(key, value))
|
||||
}
|
||||
}
|
||||
}
|
||||
return slog.GroupValue(attrs...)
|
||||
}
|
||||
|
||||
type LlamaServer interface {
|
||||
Ping(ctx context.Context) error
|
||||
WaitUntilRunning(ctx context.Context) error
|
||||
@ -177,6 +148,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||
params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU))
|
||||
}
|
||||
|
||||
if envconfig.Debug() {
|
||||
params = append(params, "--verbose")
|
||||
}
|
||||
|
||||
if opts.MainGPU > 0 {
|
||||
params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU))
|
||||
}
|
||||
@ -311,7 +286,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||
params = append(params, "--mmproj", projectors[0])
|
||||
}
|
||||
|
||||
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
|
||||
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
|
||||
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
|
||||
// without any LD_LIBRARY_PATH flags
|
||||
for {
|
||||
@ -429,7 +404,26 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||
}
|
||||
|
||||
slog.Info("starting llama server", "cmd", s.cmd)
|
||||
slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
|
||||
if envconfig.Debug() {
|
||||
filteredEnv := []string{}
|
||||
for _, ev := range s.cmd.Env {
|
||||
if strings.HasPrefix(ev, "OLLAMA_") ||
|
||||
strings.HasPrefix(ev, "CUDA_") ||
|
||||
strings.HasPrefix(ev, "ROCR_") ||
|
||||
strings.HasPrefix(ev, "ROCM_") ||
|
||||
strings.HasPrefix(ev, "HIP_") ||
|
||||
strings.HasPrefix(ev, "GPU_") ||
|
||||
strings.HasPrefix(ev, "HSA_") ||
|
||||
strings.HasPrefix(ev, "GGML_") ||
|
||||
strings.HasPrefix(ev, "PATH=") ||
|
||||
strings.HasPrefix(ev, "LD_LIBRARY_PATH=") ||
|
||||
strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") {
|
||||
filteredEnv = append(filteredEnv, ev)
|
||||
}
|
||||
}
|
||||
// Log at debug as the environment is inherited and might contain sensitive information
|
||||
slog.Debug("subprocess", "environment", filteredEnv)
|
||||
}
|
||||
|
||||
if err = s.cmd.Start(); err != nil {
|
||||
var msg string
|
||||
@ -679,8 +673,9 @@ ws ::= ([ \t\n] ws)?
|
||||
const maxBufferSize = 512 * format.KiloByte
|
||||
|
||||
type ImageData struct {
|
||||
Data []byte `json:"data"`
|
||||
ID int `json:"id"`
|
||||
Data []byte `json:"data"`
|
||||
ID int `json:"id"`
|
||||
AspectRatioID int `json:"aspect_ratio_id"`
|
||||
}
|
||||
|
||||
type CompletionRequest struct {
|
||||
@ -726,9 +721,6 @@ type CompletionResponse struct {
|
||||
}
|
||||
|
||||
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
|
||||
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
|
||||
|
||||
if len(req.Format) > 0 {
|
||||
switch string(req.Format) {
|
||||
case `null`, `""`:
|
||||
@ -892,8 +884,6 @@ type EmbeddingResponse struct {
|
||||
}
|
||||
|
||||
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
|
||||
slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
|
||||
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
slog.Info("aborting embedding request due to client closing the connection")
|
||||
|
@ -16,7 +16,7 @@ func TestLLMServerCompletionFormat(t *testing.T) {
|
||||
// of a mess, and but it's good enough, until we can refactoring the
|
||||
// Completion method to be more testable.
|
||||
|
||||
ctx, cancel := context.WithCancel(t.Context())
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
s := &llmServer{
|
||||
sem: semaphore.NewWeighted(1), // required to prevent nil panic
|
||||
}
|
||||
|
@ -1,29 +0,0 @@
|
||||
package logutil
|
||||
|
||||
import (
|
||||
"io"
|
||||
"log/slog"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
const LevelTrace slog.Level = -8
|
||||
|
||||
func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
|
||||
return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{
|
||||
Level: level,
|
||||
AddSource: true,
|
||||
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
|
||||
switch attr.Key {
|
||||
case slog.LevelKey:
|
||||
switch attr.Value.Any().(slog.Level) {
|
||||
case LevelTrace:
|
||||
attr.Value = slog.StringValue("TRACE")
|
||||
}
|
||||
case slog.SourceKey:
|
||||
source := attr.Value.Any().(*slog.Source)
|
||||
source.File = filepath.Base(source.File)
|
||||
}
|
||||
return attr
|
||||
},
|
||||
}))
|
||||
}
|
@ -5,7 +5,6 @@ import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
@ -119,21 +118,6 @@ type Context interface {
|
||||
Layer(int) Context
|
||||
}
|
||||
|
||||
// RopeOptions contains optional parameters for RoPE function
|
||||
type RopeOptions struct {
|
||||
OriginalContextLen uint32
|
||||
}
|
||||
|
||||
// RopeOption defines a function that modifies RopeOpts
|
||||
type RopeOption func(*RopeOptions)
|
||||
|
||||
// WithContextLen sets a custom context length
|
||||
func WithContextLen(len uint32) RopeOption {
|
||||
return func(opts *RopeOptions) {
|
||||
opts.OriginalContextLen = len
|
||||
}
|
||||
}
|
||||
|
||||
type Tensor interface {
|
||||
Dim(n int) int
|
||||
Stride(n int) int
|
||||
@ -159,7 +143,7 @@ type Tensor interface {
|
||||
AvgPool2D(ctx Context, k, s int, p float32) Tensor
|
||||
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||
|
||||
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
|
||||
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
|
||||
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||
|
||||
Sin(ctx Context) Tensor
|
||||
@ -176,6 +160,7 @@ type Tensor interface {
|
||||
Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor
|
||||
|
||||
Pad(ctx Context, shape ...int) Tensor
|
||||
Unpad(ctx Context, shape ...int) Tensor
|
||||
|
||||
Stack(ctx Context, dim int, s ...Tensor) Tensor
|
||||
|
||||
@ -187,7 +172,6 @@ type Tensor interface {
|
||||
Duplicate(ctx Context) Tensor
|
||||
|
||||
TopK(ctx Context, k int) Tensor
|
||||
Argsort(ctx Context) Tensor
|
||||
}
|
||||
|
||||
// ScaledDotProductAttention implements a fused attention
|
||||
@ -230,58 +214,35 @@ func mul[T number](s ...T) T {
|
||||
return p
|
||||
}
|
||||
|
||||
type DumpOptions func(*dumpOptions)
|
||||
type DumpOptions struct {
|
||||
// Items is the number of elements to print at the beginning and end of each dimension.
|
||||
Items int
|
||||
|
||||
// DumpWithPrecision sets the number of decimal places to print. Applies to float32 and float64.
|
||||
func DumpWithPrecision(n int) DumpOptions {
|
||||
return func(opts *dumpOptions) {
|
||||
opts.Precision = n
|
||||
}
|
||||
// Precision is the number of decimal places to print. Applies to float32 and float64.
|
||||
Precision int
|
||||
}
|
||||
|
||||
// DumpWithThreshold sets the threshold for printing the entire tensor. If the number of elements
|
||||
// is less than or equal to this value, the entire tensor will be printed. Otherwise, only the
|
||||
// beginning and end of each dimension will be printed.
|
||||
func DumpWithThreshold(n int) DumpOptions {
|
||||
return func(opts *dumpOptions) {
|
||||
opts.Threshold = n
|
||||
}
|
||||
}
|
||||
|
||||
// DumpWithEdgeItems sets the number of elements to print at the beginning and end of each dimension.
|
||||
func DumpWithEdgeItems(n int) DumpOptions {
|
||||
return func(opts *dumpOptions) {
|
||||
opts.EdgeItems = n
|
||||
}
|
||||
}
|
||||
|
||||
type dumpOptions struct {
|
||||
Precision, Threshold, EdgeItems int
|
||||
}
|
||||
|
||||
func Dump(ctx Context, t Tensor, optsFuncs ...DumpOptions) string {
|
||||
opts := dumpOptions{Precision: 4, Threshold: 1000, EdgeItems: 3}
|
||||
for _, optsFunc := range optsFuncs {
|
||||
optsFunc(&opts)
|
||||
}
|
||||
|
||||
if mul(t.Shape()...) <= opts.Threshold {
|
||||
opts.EdgeItems = math.MaxInt
|
||||
func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
|
||||
if len(opts) < 1 {
|
||||
opts = append(opts, DumpOptions{
|
||||
Items: 3,
|
||||
Precision: 4,
|
||||
})
|
||||
}
|
||||
|
||||
switch t.DType() {
|
||||
case DTypeF32:
|
||||
return dump[[]float32](ctx, t, opts.EdgeItems, func(f float32) string {
|
||||
return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
|
||||
return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
|
||||
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
|
||||
})
|
||||
case DTypeF16, DTypeQ80, DTypeQ40:
|
||||
f32 := ctx.Input().Empty(DTypeF32, t.Shape()...)
|
||||
f32 = t.Copy(ctx, f32)
|
||||
return dump[[]float32](ctx, f32, opts.EdgeItems, func(f float32) string {
|
||||
return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
|
||||
return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
|
||||
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
|
||||
})
|
||||
case DTypeI32:
|
||||
return dump[[]int32](ctx, t, opts.EdgeItems, func(i int32) string {
|
||||
return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string {
|
||||
return strconv.FormatInt(int64(i), 10)
|
||||
})
|
||||
default:
|
||||
|
Some files were not shown because too many files have changed in this diff.