Compare commits
42 Commits
mxyng/next
...
brucemacd/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5f62064e2f | ||
|
|
e3f3043f5b | ||
|
|
b5fc84c930 | ||
|
|
827b6b5d16 | ||
|
|
0d15036d82 | ||
|
|
d2eb226c91 | ||
|
|
e13e7c8d94 | ||
|
|
78f403ff45 | ||
|
|
08a299e1d0 | ||
|
|
7b5d916a9a | ||
|
|
33ad61b112 | ||
|
|
716e365615 | ||
|
|
3b4424ff98 | ||
|
|
f9c7ead160 | ||
|
|
5930aaeb1a | ||
|
|
faf67db089 | ||
|
|
0667baddc6 | ||
|
|
d006e1e09b | ||
|
|
df2680b4b9 | ||
|
|
010313bb63 | ||
|
|
5296f487a8 | ||
|
|
f05774b04c | ||
|
|
6600bd7d91 | ||
|
|
ed443a0393 | ||
|
|
6945617af5 | ||
|
|
7916f55009 | ||
|
|
d650ad398f | ||
|
|
d223f3b697 | ||
|
|
60830695c2 | ||
|
|
01d9a46854 | ||
|
|
d773b7d671 | ||
|
|
4d4463b2bd | ||
|
|
0e38297f87 | ||
|
|
7e13f568dc | ||
|
|
58245413f4 | ||
|
|
8cf16063a5 | ||
|
|
3a4449e2f1 | ||
|
|
10d59d5f90 | ||
|
|
a4f69a0191 | ||
|
|
82658c3eec | ||
|
|
378d6e1e6a | ||
|
|
afa55bc70c |
4
.github/workflows/release.yaml
vendored
4
.github/workflows/release.yaml
vendored
@@ -329,7 +329,9 @@ jobs:
|
||||
done
|
||||
working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
|
||||
- run: |
|
||||
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
|
||||
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
|
||||
tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
|
||||
done
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
|
||||
|
||||
@@ -24,7 +24,7 @@ set(GGML_LLAMAFILE ON)
|
||||
set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
|
||||
set(GGML_CUDA_GRAPHS ON)
|
||||
|
||||
if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
|
||||
if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
|
||||
OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
|
||||
set(GGML_CPU_ALL_VARIANTS ON)
|
||||
endif()
|
||||
@@ -104,6 +104,10 @@ if(CMAKE_HIP_COMPILER)
|
||||
if(AMDGPU_TARGETS)
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
|
||||
|
||||
if (WIN32)
|
||||
target_compile_definitions(ggml-hip PRIVATE GGML_CUDA_NO_PEER_COPY=1)
|
||||
endif()
|
||||
|
||||
set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
|
||||
install(TARGETS ggml-hip
|
||||
RUNTIME_DEPENDENCIES
|
||||
|
||||
@@ -380,6 +380,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
|
||||
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
|
||||
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
|
||||
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
|
||||
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
|
||||
|
||||
### Cloud
|
||||
|
||||
@@ -437,9 +439,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
|
||||
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
|
||||
- [Gentoo](https://github.com/gentoo/guru/tree/master/app-misc/ollama)
|
||||
- [Homebrew](https://formulae.brew.sh/formula/ollama)
|
||||
- [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
|
||||
- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
|
||||
- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
|
||||
- [Nix package](https://search.nixos.org/packages?show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
|
||||
- [Flox](https://flox.dev/blog/ollama-part-one)
|
||||
|
||||
### Libraries
|
||||
@@ -494,7 +497,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
|
||||
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
|
||||
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
|
||||
- [Abso](https://github.com/lunary-ai/abso/blob/main/README.md#ollama) (OpenAI-compatible TypeScript SDK for any LLM provider)
|
||||
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
|
||||
|
||||
### Mobile
|
||||
|
||||
@@ -546,6 +549,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
|
||||
- [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
|
||||
- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
|
||||
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
|
||||
|
||||
### Supported backends
|
||||
|
||||
|
||||
@@ -35,9 +35,9 @@ import (
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/llama"
|
||||
"github.com/ollama/ollama/llama/runner"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/progress"
|
||||
"github.com/ollama/ollama/runner"
|
||||
"github.com/ollama/ollama/server"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
"github.com/ollama/ollama/version"
|
||||
@@ -338,7 +338,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
opts.MultiModal = len(info.ProjectorInfo) != 0
|
||||
// TODO(jessegross): We should either find another way to know if this is
|
||||
// a vision model or remove the logic. Also consider that other modalities will
|
||||
// need different behavior anyways.
|
||||
opts.MultiModal = len(info.ProjectorInfo) != 0 || envconfig.NewEngine()
|
||||
opts.ParentModel = info.Details.ParentModel
|
||||
|
||||
if interactive {
|
||||
|
||||
@@ -4,7 +4,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/ollama/ollama/llama/runner"
|
||||
"github.com/ollama/ollama/runner"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"log/slog"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type ModelParameters struct {
|
||||
@@ -27,8 +27,8 @@ type AdapterParameters struct {
|
||||
} `json:"lora_parameters"`
|
||||
}
|
||||
|
||||
func (ModelParameters) KV(t *Tokenizer) llm.KV {
|
||||
kv := llm.KV{
|
||||
func (ModelParameters) KV(t *Tokenizer) ggml.KV {
|
||||
kv := ggml.KV{
|
||||
"general.file_type": uint32(1),
|
||||
"general.quantization_version": uint32(2),
|
||||
"tokenizer.ggml.pre": t.Pre,
|
||||
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p AdapterParameters) KV() llm.KV {
|
||||
func (p AdapterParameters) KV() ggml.KV {
|
||||
var alpha float32
|
||||
if p.LoraParameters.Alpha == 0 {
|
||||
alpha = float32(p.Alpha)
|
||||
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
|
||||
alpha = p.LoraParameters.Alpha
|
||||
}
|
||||
|
||||
kv := llm.KV{
|
||||
kv := ggml.KV{
|
||||
"adapter.lora.alpha": alpha,
|
||||
"adapter.type": "lora",
|
||||
"general.file_type": uint32(1),
|
||||
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
|
||||
}
|
||||
}
|
||||
|
||||
func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
|
||||
return llm.WriteGGUF(ws, kv, ts)
|
||||
func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
|
||||
return ggml.WriteGGUF(ws, kv, ts)
|
||||
}
|
||||
|
||||
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
|
||||
return llm.WriteGGUF(ws, kv, ts)
|
||||
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
|
||||
return ggml.WriteGGUF(ws, kv, ts)
|
||||
}
|
||||
|
||||
type ModelConverter interface {
|
||||
// KV maps parameters to LLM key-values
|
||||
KV(*Tokenizer) llm.KV
|
||||
KV(*Tokenizer) ggml.KV
|
||||
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
|
||||
Tensors([]Tensor) []llm.Tensor
|
||||
Tensors([]Tensor) []ggml.Tensor
|
||||
// Replacements returns a list of string pairs to replace in tensor names.
|
||||
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
||||
Replacements() []string
|
||||
@@ -99,7 +99,7 @@ type ModelConverter interface {
|
||||
// specialTokenTypes returns any special token types the model uses
|
||||
specialTokenTypes() []string
|
||||
// writeFile writes the model to the provided io.WriteSeeker
|
||||
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
|
||||
writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
|
||||
}
|
||||
|
||||
type moreParser interface {
|
||||
@@ -108,17 +108,17 @@ type moreParser interface {
|
||||
|
||||
type AdapterConverter interface {
|
||||
// KV maps parameters to LLM key-values
|
||||
KV(llm.KV) llm.KV
|
||||
KV(ggml.KV) ggml.KV
|
||||
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
|
||||
Tensors([]Tensor) []llm.Tensor
|
||||
Tensors([]Tensor) []ggml.Tensor
|
||||
// Replacements returns a list of string pairs to replace in tensor names.
|
||||
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
||||
Replacements() []string
|
||||
|
||||
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
|
||||
writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
|
||||
}
|
||||
|
||||
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
|
||||
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
|
||||
bts, err := fs.ReadFile(fsys, "adapter_config.json")
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type bertModel struct {
|
||||
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *bertModel) KV(t *Tokenizer) llm.KV {
|
||||
func (p *bertModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "bert"
|
||||
kv["bert.attention.causal"] = false
|
||||
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
for _, t := range ts {
|
||||
if slices.Contains([]string{
|
||||
"embeddings.position_ids",
|
||||
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
continue
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
||||
@@ -3,7 +3,7 @@ package convert
|
||||
import (
|
||||
"cmp"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type commandrModel struct {
|
||||
@@ -24,7 +24,7 @@ type commandrModel struct {
|
||||
|
||||
var _ ModelConverter = (*commandrModel)(nil)
|
||||
|
||||
func (p *commandrModel) KV(t *Tokenizer) llm.KV {
|
||||
func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "command-r"
|
||||
kv["general.name"] = "command-r"
|
||||
@@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *commandrModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
for _, t := range ts {
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type gemmaModel struct {
|
||||
@@ -23,7 +23,7 @@ type gemmaModel struct {
|
||||
|
||||
var _ ModelConverter = (*gemmaModel)(nil)
|
||||
|
||||
func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
|
||||
func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "gemma"
|
||||
kv["gemma.context_length"] = p.MaxPositionEmbeddings
|
||||
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
for _, t := range ts {
|
||||
if strings.HasSuffix(t.Name(), "_norm.weight") {
|
||||
t.SetRepacker(p.addOne)
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
import "github.com/ollama/ollama/fs/ggml"
|
||||
|
||||
type gemma2Model struct {
|
||||
gemmaModel
|
||||
@@ -11,7 +9,7 @@ type gemma2Model struct {
|
||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||
}
|
||||
|
||||
func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
|
||||
func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "gemma2"
|
||||
kv["gemma2.context_length"] = p.MaxPositionEmbeddings
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type gemma2Adapter struct {
|
||||
@@ -15,14 +15,14 @@ type gemma2Adapter struct {
|
||||
|
||||
var _ AdapterConverter = (*gemma2Adapter)(nil)
|
||||
|
||||
func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
|
||||
func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
|
||||
kv := p.AdapterParameters.KV()
|
||||
kv["general.architecture"] = "gemma2"
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
for _, t := range ts {
|
||||
shape := t.Shape()
|
||||
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
|
||||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type llamaModel struct {
|
||||
@@ -46,7 +46,7 @@ type llamaModel struct {
|
||||
|
||||
var _ ModelConverter = (*llamaModel)(nil)
|
||||
|
||||
func (p *llamaModel) KV(t *Tokenizer) llm.KV {
|
||||
func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "llama"
|
||||
kv["llama.vocab_size"] = p.VocabSize
|
||||
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
|
||||
if p.RopeScaling.factors != nil {
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: "rope_freqs.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
|
||||
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type llamaAdapter struct {
|
||||
@@ -18,7 +18,7 @@ type llamaAdapter struct {
|
||||
|
||||
var _ AdapterConverter = (*llamaAdapter)(nil)
|
||||
|
||||
func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
|
||||
func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
|
||||
kv := p.AdapterParameters.KV()
|
||||
kv["general.architecture"] = "llama"
|
||||
kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
|
||||
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
for _, t := range ts {
|
||||
shape := t.Shape()
|
||||
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
|
||||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: shape,
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type mixtralModel struct {
|
||||
@@ -15,7 +15,7 @@ type mixtralModel struct {
|
||||
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
|
||||
}
|
||||
|
||||
func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
|
||||
func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.llamaModel.KV(t)
|
||||
|
||||
if p.NumLocalExperts > 0 {
|
||||
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
oldnew := []string{
|
||||
"model.layers", "blk",
|
||||
"w1", "ffn_gate_exps",
|
||||
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
return true
|
||||
})
|
||||
|
||||
var out []llm.Tensor
|
||||
var out []ggml.Tensor
|
||||
for n, e := range experts {
|
||||
// TODO(mxyng): sanity check experts
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: n,
|
||||
Kind: e[0].Kind(),
|
||||
Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type phi3Model struct {
|
||||
@@ -37,7 +37,7 @@ type phi3Model struct {
|
||||
|
||||
var _ ModelConverter = (*phi3Model)(nil)
|
||||
|
||||
func (p *phi3Model) KV(t *Tokenizer) llm.KV {
|
||||
func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "phi3"
|
||||
kv["phi3.context_length"] = p.MaxPositionEmbeddings
|
||||
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
|
||||
func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var addRopeFactors sync.Once
|
||||
|
||||
out := make([]llm.Tensor, 0, len(ts)+2)
|
||||
out := make([]ggml.Tensor, 0, len(ts)+2)
|
||||
for _, t := range ts {
|
||||
if strings.HasPrefix(t.Name(), "blk.0.") {
|
||||
addRopeFactors.Do(func() {
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: "rope_factors_long.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))},
|
||||
WriterTo: p.RopeScaling.LongFactor,
|
||||
}, llm.Tensor{
|
||||
}, ggml.Tensor{
|
||||
Name: "rope_factors_short.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))},
|
||||
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
|
||||
})
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
package convert
|
||||
|
||||
import "github.com/ollama/ollama/llm"
|
||||
import "github.com/ollama/ollama/fs/ggml"
|
||||
|
||||
type qwen2Model struct {
|
||||
ModelParameters
|
||||
@@ -21,7 +21,7 @@ type qwen2Model struct {
|
||||
|
||||
var _ ModelConverter = (*qwen2Model)(nil)
|
||||
|
||||
func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
|
||||
func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
|
||||
kv := q.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "qwen2"
|
||||
kv["qwen2.block_count"] = q.HiddenLayers
|
||||
@@ -45,10 +45,10 @@ func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (q *qwen2Model) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
for _, t := range ts {
|
||||
out = append(out, llm.Tensor{
|
||||
out = append(out, ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
||||
@@ -20,7 +20,7 @@ import (
|
||||
|
||||
"golang.org/x/exp/maps"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type tensorData struct {
|
||||
@@ -29,7 +29,7 @@ type tensorData struct {
|
||||
Shape []int `json:"shape"`
|
||||
}
|
||||
|
||||
func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
|
||||
func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
|
||||
t.Helper()
|
||||
|
||||
f, err := os.CreateTemp(t.TempDir(), "f16")
|
||||
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
|
||||
}
|
||||
t.Cleanup(func() { r.Close() })
|
||||
|
||||
m, _, err := llm.DecodeGGML(r, math.MaxInt)
|
||||
m, _, err := ggml.Decode(r, math.MaxInt)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
|
||||
return r, m.KV(), m.Tensors()
|
||||
}
|
||||
|
||||
func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
|
||||
func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
|
||||
actual := make(map[string]string)
|
||||
for k, v := range kv {
|
||||
if s, ok := v.(json.Marshaler); !ok {
|
||||
@@ -75,7 +75,7 @@ func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tenso
|
||||
}
|
||||
}
|
||||
|
||||
for _, tensor := range tensors.Items {
|
||||
for _, tensor := range tensors.Items() {
|
||||
sha256sum := sha256.New()
|
||||
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
|
||||
if _, err := io.Copy(sha256sum, sr); err != nil {
|
||||
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
m, _, err := llm.DecodeGGML(r, math.MaxInt)
|
||||
m, _, err := ggml.Decode(r, math.MaxInt)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -19,9 +19,8 @@ var LibOllamaPath string = func() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
exe, err = filepath.EvalSymlinks(exe)
|
||||
if err != nil {
|
||||
return ""
|
||||
if eval, err := filepath.EvalSymlinks(exe); err == nil {
|
||||
exe = eval
|
||||
}
|
||||
|
||||
var libPath string
|
||||
|
||||
338
docs/add-a-model.md
Normal file
338
docs/add-a-model.md
Normal file
@@ -0,0 +1,338 @@
|
||||
# Guide: Implementing Models in Ollama's Go Inference Engine
|
||||
|
||||
> **Note**: This guide and the Go inference engine are in early development and will be updated as implementation details evolve.
|
||||
|
||||
This guide outlines the process of implementing a new model in Ollama's inference engine. It covers everything from initial setup to publishing your model to ollama.com.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
Below is a diagram showing Ollama's inference engine architecture layers and how they interact:
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph Models["Model Layer: LLM Implementations"]
|
||||
direction TB
|
||||
llama["model/models/llama"]
|
||||
mllama["model/models/mllama"]
|
||||
qwen["model/models/qwen2"]
|
||||
etc["...etc"]
|
||||
|
||||
note1[" Each model implements a<br>specific architecture:<br>- Defines model parameters<br>- Implements forward pass"]
|
||||
end
|
||||
|
||||
subgraph ML_Ops["Neural Network Operations"]
|
||||
direction TB
|
||||
nn_ops[" nn/<br>linear.go: Matrix multiplication<br>embedding.go: Token embedding lookups<br>normalization.go: Layer norm operations<br>convolution.go: Convolutional operations "]
|
||||
|
||||
backend[" ml/backend.go<br>Hardware Abstraction Layer:<br>- Defines tensor operations<br>- Manages computation graphs<br>- Handles memory allocation "]
|
||||
|
||||
note2[" Common neural net operations:<br>- Abstracts hardware details<br>- Provides unified API<br>- Manages computation flow "]
|
||||
end
|
||||
|
||||
subgraph Hardware["Backend Execution Layer"]
|
||||
direction TB
|
||||
backend_impl[" The backend package provides:<br>- Unified computation interface<br>- Automatic hardware selection<br>- Optimized kernels<br>- Efficient memory management "]
|
||||
|
||||
subgraph Backends["Backend Implementations"]
|
||||
direction LR
|
||||
cpu["backend/cpu<br>- Pure Go implementation<br>- Fallback for all platforms"]
|
||||
|
||||
metal["backend/metal<br>- Apple Silicon (M1/M2/M3)<br>- MLX integration<br>- Leverages Apple Neural Engine"]
|
||||
|
||||
onnx["backend/onnx<br>- Cross-platform compatibility<br>- ONNX Runtime integration<br>- Pre-compiled graph execution"]
|
||||
|
||||
ggml["backend/ggml<br>- CPU/GPU quantized compute<br>- Low-precision operations<br>- Memory-efficient inferencing"]
|
||||
end
|
||||
end
|
||||
|
||||
Models --> |" Makes high-level calls<br>(e.g., self-attention) "| ML_Ops
|
||||
ML_Ops --> |" Translates to tensor operations<br>(e.g., matmul, softmax) "| Hardware
|
||||
backend_impl --> Backends
|
||||
```
|
||||
|
||||
When implementing a new model, you'll primarily work in the model layer, interfacing with the neural network operations layer.
|
||||
|
||||
## Implementation Process Overview
|
||||
|
||||
Here's the high-level process for implementing a new model in Ollama:
|
||||
|
||||
1. **Environment Setup**: Clone the repository and set up your development environment
|
||||
2. **Research Implementation**: Understand the original model architecture
|
||||
3. **Project Structure Setup**: Set up the necessary file structure
|
||||
4. **Create Basic Modelfile**: Create a simple Modelfile for testing
|
||||
5. **Implement Weight Conversion**: Map from original format to GGUF
|
||||
6. **Open a Draft PR**: Create a draft pull request to establish communication with maintainers
|
||||
7. **Implement Model Logic**: Create the model architecture and forward pass
|
||||
8. **Quality Check and Final Steps**: Create a Modelfile, add tests and ensure functionality
|
||||
10. **Finalize PR and Publish**: Complete the PR and publish to ollama.com
|
||||
|
||||
## Implementation Steps in Detail
|
||||
|
||||
### 1. Environment Setup
|
||||
|
||||
First, clone the Ollama repository and get it running locally. Follow the development setup guide at:
|
||||
https://github.com/ollama/ollama/blob/main/docs/development.md
|
||||
|
||||
### 2. Research Implementation
|
||||
|
||||
Get the original model implementation running. This typically involves:
|
||||
- Cloning the research code repository (usually Python-based)
|
||||
- Setting up the required environment
|
||||
- Running inference with sample inputs
|
||||
- Understanding the model architecture and forward pass
|
||||
|
||||
### 3. Project Structure Setup
|
||||
|
||||
Create the necessary file structure by referencing previous model implementations. You'll need:
|
||||
|
||||
```
|
||||
convert/
|
||||
└── convert_your-model.go # Weight conversion logic (PyTorch/SafeTensors to GGML)
|
||||
model/
|
||||
└── your-model/
|
||||
└── model.go # Architecture and forward pass implementation
|
||||
```
|
||||
|
||||
Add your model to the main paths in [model/models/models.go](https://github.com/ollama/ollama/blob/main/model/models/models.go):
|
||||
|
||||
```
|
||||
package models
|
||||
|
||||
import (
|
||||
_ "github.com/ollama/ollama/model/models/llama"
|
||||
_ "github.com/ollama/ollama/model/models/mllama"
|
||||
_ "github.com/ollama/ollama/model/models/your-model" // Add your model here
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Create a Basic Modelfile
|
||||
|
||||
Create a simple Modelfile early in the process to facilitate testing:
|
||||
|
||||
```
|
||||
FROM /path/to/model
|
||||
TEMPLATE "{{.Prompt}}" # Use a static prompt format for initial testing
|
||||
```
|
||||
|
||||
This allows you to test your implementation with consistent inputs before finalizing the proper prompt template.
|
||||
|
||||
### 5. Implement Weight Conversion
|
||||
|
||||
- Work on `convert/convert_your-model.go`
|
||||
- Reference existing conversion implementations
|
||||
- Conversion involves mapping from PyTorch/SafeTensors naming to GGUF naming as you see fit
|
||||
- Understand typical GGUF layout and structure:
|
||||
|
||||
**Typical GGUF Layout:**
|
||||
```
|
||||
GGUF
|
||||
├── Metadata Section
|
||||
│ ├── Model Parameters
|
||||
│ │ ├── General architecture parameters
|
||||
│ │ │ ├── "{arch}.vocab_size" (e.g., "llama.vocab_size")
|
||||
│ │ │ ├── "{arch}.context_length" (e.g., "llama.context_length")
|
||||
│ │ │ ├── "{arch}.embedding_length" (e.g., "llama.embedding_length")
|
||||
│ │ │ └── "{arch}.block_count" (e.g., "llama.block_count")
|
||||
│ │ │
|
||||
│ │ └── Architecture-specific parameters
|
||||
│ │ ├── "{arch}.attention.head_count" (e.g., "llama.attention.head_count")
|
||||
│ │ ├── "{arch}.attention.head_count_kv" (e.g., "llama.attention.head_count_kv")
|
||||
│ │ ├── "{arch}.rope.dimension_count" (e.g., "llama.rope.dimension_count")
|
||||
│ │ └── "{arch}.attention.layer_norm_rms_epsilon" (e.g., "llama.attention.layer_norm_rms_epsilon")
|
||||
│ │
|
||||
│ ├── Tokenizer parameters
|
||||
│ │ ├── "tokenizer.ggml.model" (e.g., "llama")
|
||||
│ │ ├── "tokenizer.ggml.tokens" (vocabulary tokens)
|
||||
│ │ ├── "tokenizer.ggml.bos_id" (beginning of sequence token ID)
|
||||
│ │ └── "tokenizer.ggml.eos_id" (end of sequence token ID)
|
||||
│ │
|
||||
│ └── General metadata
|
||||
│ └── "general.architecture" (e.g., "llama", "qwen2", "phi")
|
||||
│
|
||||
└── Tensor Data Section
|
||||
├── Common tensors:
|
||||
│ ├── "token_embd.weight" (token embedding matrix)
|
||||
│ ├── "rope_freqs.weight" (RoPE frequency weights)
|
||||
│ ├── "output_norm.weight" (final layer normalization)
|
||||
│ └── "output.weight" (output projection)
|
||||
│
|
||||
└── Layer-specific tensors:
|
||||
├── "blk.{i}.attn_q.weight" (query projection)
|
||||
├── "blk.{i}.attn_k.weight" (key projection)
|
||||
├── "blk.{i}.attn_v.weight" (value projection)
|
||||
├── "blk.{i}.attn_output.weight" (attention output)
|
||||
├── "blk.{i}.attn_norm.weight" (attention normalization)
|
||||
├── "blk.{i}.ffn_norm.weight" (feed-forward normalization)
|
||||
├── "blk.{i}.ffn_up.weight" (FFN up projection)
|
||||
├── "blk.{i}.ffn_down.weight" (FFN down projection)
|
||||
└── "blk.{i}.ffn_gate.weight" (FFN gate projection)
|
||||
```
|
||||
|
||||
- Key conversion details include:
|
||||
- Linear weight matrices (sometimes need transposition)
|
||||
- Layer normalization weights (might need reshaping)
|
||||
- **Note: In GGML, FFN values are for the MLP (Multi-Layer Perceptron) part of the architecture**
|
||||
|
||||
- Test conversion:
|
||||
```bash
|
||||
go run . create <my-model> -f /path/to/Modelfile
|
||||
```
|
||||
|
||||
### 6. Open a Draft PR
|
||||
|
||||
After implementing the initial weight conversion, creating a draft pull request is recommended as it:
|
||||
- Establishes a communication channel with Ollama maintainers
|
||||
- Allows for early feedback on your approach
|
||||
- Makes it easier to track progress and changes
|
||||
|
||||
To open a draft PR:
|
||||
1. Fork the repository
|
||||
2. Create a new branch for your model implementation
|
||||
3. Make initial commits with your weight conversion implementation
|
||||
4. Open a PR in the `ollama/ollama` repository and mark it as draft
|
||||
5. Include a clear description of the model you're implementing
|
||||
|
||||
### 7. Implement Model Logic
|
||||
|
||||
- Reference existing model implementations
|
||||
- Implement `New()` and `Forward()` functions in `model.go`:
|
||||
|
||||
**The `New()` function:**
|
||||
- Creates and initializes your model structure
|
||||
- Loads configuration parameters (embedding size, attention heads, etc.)
|
||||
- Sets up the tokenizer with vocabulary and special tokens
|
||||
- Initializes all model layers and weights
|
||||
- **Important**: Sets up the KV cache for efficient inference
|
||||
- Example:
|
||||
```go
|
||||
func New(c ml.Config) (model.Model, error) {
|
||||
m := &Model{
|
||||
// Initialize tokenizer
|
||||
BytePairEncoding: model.NewBytePairEncoding(...),
|
||||
// Create layer arrays
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
// Set model parameters
|
||||
Options: &Options{...},
|
||||
}
|
||||
// Initialize KV cache for efficient inference
|
||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
||||
return m, nil
|
||||
}
|
||||
```
|
||||
|
||||
**The `Forward()` function:**
|
||||
- **What it does**: Defines the computational graph of your model
|
||||
- **Important**: The graph is NOT executed immediately - it's built first, then executed later when predictions are needed
|
||||
- Takes input tokens and converts them to embeddings
|
||||
- Processes inputs through transformer layers (attention and feed-forward networks)
|
||||
- Creates the path for data flow through your model's components
|
||||
- Example:
|
||||
```go
|
||||
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
|
||||
// Convert inputs to tensors
|
||||
inputTensor, _ := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
|
||||
positionsTensor, _ := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
|
||||
|
||||
// Initial token embedding
|
||||
hiddenStates := m.TokenEmbedding.Forward(ctx, inputTensor)
|
||||
|
||||
// Process through transformer layers
|
||||
for i, layer := range m.Layers {
|
||||
m.Cache.SetLayer(i)
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, positionsTensor, m.Cache, m.Options)
|
||||
}
|
||||
|
||||
// Final processing and output
|
||||
normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)
|
||||
logits := m.Output.Forward(ctx, normalizedOutput)
|
||||
|
||||
// Return logits for requested positions
|
||||
outputsTensor, _ := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
|
||||
return logits.Rows(ctx, outputsTensor), nil
|
||||
}
|
||||
```
|
||||
|
||||
**Key Components to Implement:**
|
||||
|
||||
1. **KV Cache**:
|
||||
- Improves inference performance for text generation
|
||||
- How it works: Stores previously computed key and value tensors from self-attention, avoiding redundant computations
|
||||
- Implementation: Use the `kvcache.NewCausalCache()` for autoregressive models
|
||||
- Important: Must implement the `Shift()` function to handle rotary position embeddings with the cache
|
||||
|
||||
2. **Self-Attention**:
|
||||
- Core component that learns contextual relationships between tokens
|
||||
- Implements query, key, value projections and their interactions
|
||||
- Must handle positional encoding (usually Rotary Position Embeddings)
|
||||
- Uses the KV cache to make generation efficient
|
||||
|
||||
3. **Normalization Layers**:
|
||||
- Purpose: Stabilizes training and maintains consistent activation distributions
|
||||
- Types: RMSNorm, LayerNorm, etc. depending on model architecture
|
||||
- Implementation: Apply before attention and feed-forward networks
|
||||
- Example: `normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)`
|
||||
|
||||
4. **Activation Functions**:
|
||||
- Purpose: Introduces non-linearity into the model
|
||||
- Common types: SILU (Sigmoid Linear Unit), GELU, ReLU
|
||||
- Found in feed-forward/MLP blocks
|
||||
- Example:
|
||||
```go
|
||||
// SwiGLU activation in MLP
|
||||
gateActivation := mlp.Gate.Forward(ctx, hiddenState).SILU(ctx)
|
||||
upProjection := mlp.Up.Forward(ctx, hiddenState)
|
||||
intermediateStates := gateActivation.Mul(ctx, upProjection)
|
||||
```
|
||||
- Run your forward pass:
|
||||
```bash
|
||||
# in the root of the ollama directory
|
||||
go build .
|
||||
OLLAMA_DEBUG=1 ./ollama serve
|
||||
OLLAMA_DEBUG=1 ./ollama run <my-model>
|
||||
```
|
||||
- Compare output with research implementation
|
||||
|
||||
### 8. Quality Check and Final Steps
|
||||
|
||||
1. Add comprehensive tests to:
|
||||
- `model_test.go`
|
||||
- `convert_test.go`
|
||||
|
||||
2. Ensure tests cover:
|
||||
- Weight conversion
|
||||
- Model initialization
|
||||
- Text generation
|
||||
|
||||
3. **Create Final Modelfile**
|
||||
- Replace the static prompt with the proper Go template for your model:
|
||||
```
|
||||
FROM <converted-gguf>
|
||||
TEMPLATE <prompt-template> # Add the proper Go template for your model, including tools if needed
|
||||
LICENSE <license-info> # Add appropriate license information
|
||||
# Add additional parameters if needed
|
||||
```
|
||||
|
||||
4. **End-to-end Testing**
|
||||
- Run your model with your local Ollama build to ensure that it functions as expected
|
||||
|
||||
5. Benchmark
|
||||
- Run performance benchmarks on your model implementation
|
||||
```go
|
||||
# from the root of the Ollama directory, while a server is running locally
|
||||
go build .
|
||||
OLLAMA_DEBUG=1 ./ollama serve
|
||||
go test -bench=. -m <your-model-name> ./...
|
||||
```
|
||||
|
||||
### 9. Finalize PR and Publish to ollama.com
|
||||
|
||||
1. **Finalize Pull Request**
|
||||
- Move PR out of draft state
|
||||
- Address reviewer feedback
|
||||
|
||||
2. **Publish to ollama.com**
|
||||
- Push to ollama.com:
|
||||
```bash
|
||||
ollama create <your-namespace>/<your-model> -f /path/to/Modelfile
|
||||
ollama push <your-namespace>/<your-model>
|
||||
```
|
||||
@@ -7,7 +7,7 @@ Check your compute compatibility to see if your card is supported:
|
||||
|
||||
| Compute Capability | Family | Cards |
|
||||
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| 9.0 | NVIDIA | `H100` |
|
||||
| 9.0 | NVIDIA | `H200` `H100` |
|
||||
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
|
||||
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
|
||||
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050` |
|
||||
|
||||
@@ -55,7 +55,7 @@ Here's a quick example showing API access from `powershell`
|
||||
## Troubleshooting
|
||||
|
||||
Ollama on Windows stores files in a few different locations. You can view them in
|
||||
the explorer window by hitting `<cmd>+R` and type in:
|
||||
the explorer window by hitting `<Ctrl>+R` and type in:
|
||||
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
|
||||
- *app.log* contains most resent logs from the GUI application
|
||||
- *server.log* contains the most recent server logs
|
||||
|
||||
@@ -165,6 +165,8 @@ var (
|
||||
IntelGPU = Bool("OLLAMA_INTEL_GPU")
|
||||
// MultiUserCache optimizes prompt caching for multi-user scenarios
|
||||
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
|
||||
// Enable the new Ollama engine
|
||||
NewEngine = Bool("OLLAMA_NEW_ENGINE")
|
||||
)
|
||||
|
||||
func String(s string) func() string {
|
||||
@@ -250,6 +252,7 @@ func AsMap() map[string]EnvVar {
|
||||
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
|
||||
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
|
||||
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
|
||||
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
|
||||
|
||||
// Informational
|
||||
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
|
||||
|
||||
@@ -12,6 +12,9 @@ func TestHumanNumber(t *testing.T) {
|
||||
|
||||
testCases := []testCase{
|
||||
{0, "0"},
|
||||
{999, "999"},
|
||||
{1000, "1K"},
|
||||
{1001, "1K"},
|
||||
{1000000, "1M"},
|
||||
{125000000, "125M"},
|
||||
{500500000, "500.50M"},
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
package llm
|
||||
package ggml
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/ollama/ollama/util/bufioutil"
|
||||
"github.com/ollama/ollama/fs/util/bufioutil"
|
||||
)
|
||||
|
||||
type GGML struct {
|
||||
@@ -19,145 +19,166 @@ type GGML struct {
|
||||
|
||||
type model interface {
|
||||
KV() KV
|
||||
Tensors() *Tensors
|
||||
Tensors() Tensors
|
||||
}
|
||||
|
||||
type KV map[string]any
|
||||
|
||||
func (kv KV) u64(key string) uint64 {
|
||||
switch v := kv[key].(type) {
|
||||
case uint64:
|
||||
return v
|
||||
case uint32:
|
||||
return uint64(v)
|
||||
case float64:
|
||||
return uint64(v)
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func (kv KV) Architecture() string {
|
||||
if s, ok := kv["general.architecture"].(string); ok {
|
||||
return s
|
||||
}
|
||||
|
||||
return "unknown"
|
||||
return kv.String("general.architecture", "unknown")
|
||||
}
|
||||
|
||||
func (kv KV) Kind() string {
|
||||
if s, ok := kv["general.type"].(string); ok {
|
||||
return s
|
||||
}
|
||||
|
||||
return "unknown"
|
||||
return kv.String("general.type", "unknown")
|
||||
}
|
||||
|
||||
func (kv KV) ParameterCount() uint64 {
|
||||
return kv.u64("general.parameter_count")
|
||||
return keyValue[uint64](kv, "general.parameter_count")
|
||||
}
|
||||
|
||||
func (kv KV) FileType() fileType {
|
||||
if u64 := kv.u64("general.file_type"); u64 > 0 {
|
||||
return fileType(uint32(u64))
|
||||
if t := kv.Uint("general.file_type"); t > 0 {
|
||||
return fileType(t)
|
||||
}
|
||||
|
||||
return fileTypeUnknown
|
||||
}
|
||||
|
||||
func (kv KV) BlockCount() uint64 {
|
||||
return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
|
||||
return uint64(kv.Uint("block_count"))
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingLength() uint64 {
|
||||
return uint64(kv.Uint("embedding_length"))
|
||||
}
|
||||
|
||||
func (kv KV) HeadCount() uint64 {
|
||||
return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
|
||||
return uint64(kv.Uint("attention.head_count"))
|
||||
}
|
||||
|
||||
func (kv KV) HeadCountKV() uint64 {
|
||||
if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
|
||||
return headCountKV
|
||||
}
|
||||
|
||||
return 1
|
||||
return uint64(kv.Uint("attention.head_count_kv", 1))
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCount() uint64 {
|
||||
if heads := kv.HeadCount(); heads > 0 {
|
||||
return kv.EmbeddingLength() / kv.HeadCount()
|
||||
return kv.EmbeddingLength() / heads
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountK() uint64 {
|
||||
if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
|
||||
return k
|
||||
}
|
||||
|
||||
return kv.EmbeddingHeadCount()
|
||||
return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountV() uint64 {
|
||||
if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
|
||||
return v
|
||||
}
|
||||
|
||||
return kv.EmbeddingHeadCount()
|
||||
return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
|
||||
}
|
||||
|
||||
func (kv KV) GQA() uint64 {
|
||||
return kv.HeadCount() / kv.HeadCountKV()
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingLength() uint64 {
|
||||
return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
|
||||
}
|
||||
|
||||
func (kv KV) ContextLength() uint64 {
|
||||
return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
|
||||
return uint64(kv.Uint("context_length"))
|
||||
}
|
||||
|
||||
func (kv KV) ChatTemplate() string {
|
||||
s, _ := kv["tokenizer.chat_template"].(string)
|
||||
return kv.String("tokenizer.chat_template")
|
||||
}
|
||||
|
||||
func (kv KV) String(key string, defaultValue ...string) string {
|
||||
return keyValue(kv, key, append(defaultValue, "")...)
|
||||
}
|
||||
|
||||
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
|
||||
return keyValue(kv, key, append(defaultValue, 0)...)
|
||||
}
|
||||
|
||||
func (kv KV) Float(key string, defaultValue ...float32) float32 {
|
||||
return keyValue(kv, key, append(defaultValue, 0)...)
|
||||
}
|
||||
|
||||
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
|
||||
r := keyValue(kv, key, &array{})
|
||||
s := make([]string, r.size)
|
||||
for i := range r.size {
|
||||
s[i] = r.values[i].(string)
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
type Tensors struct {
|
||||
Items []*Tensor
|
||||
Offset uint64
|
||||
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
|
||||
r := keyValue(kv, key, &array{})
|
||||
s := make([]uint32, r.size)
|
||||
for i := range r.size {
|
||||
s[i] = uint32(r.values[i].(int32))
|
||||
}
|
||||
|
||||
layers map[string]Layer
|
||||
layersOnce sync.Once
|
||||
return s
|
||||
}
|
||||
|
||||
func (ts *Tensors) Layers() map[string]Layer {
|
||||
ts.layersOnce.Do(func() {
|
||||
ts.layers = make(map[string]Layer)
|
||||
for _, t := range ts.Items {
|
||||
parts := strings.Split(t.Name, ".")
|
||||
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
|
||||
if len(parts) > index+2 {
|
||||
// blk and mm should have a number after them, join it
|
||||
parts = append(
|
||||
[]string{strings.Join(parts[:index+2], ".")},
|
||||
parts[index+2:]...)
|
||||
}
|
||||
}
|
||||
func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
|
||||
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
|
||||
key = kv.Architecture() + "." + key
|
||||
}
|
||||
|
||||
if _, ok := ts.layers[parts[0]]; !ok {
|
||||
ts.layers[parts[0]] = make(Layer)
|
||||
}
|
||||
if val, ok := kv[key]; ok {
|
||||
return val.(T)
|
||||
}
|
||||
|
||||
ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
|
||||
slog.Warn("key not found", "key", key, "default", defaultValue[0])
|
||||
return defaultValue[0]
|
||||
}
|
||||
|
||||
type Tensors struct {
|
||||
items []*Tensor
|
||||
Offset uint64
|
||||
}
|
||||
|
||||
func (s Tensors) Items(prefix ...string) []*Tensor {
|
||||
if len(prefix) == 0 {
|
||||
return s.items
|
||||
}
|
||||
|
||||
var items []*Tensor
|
||||
for _, t := range s.items {
|
||||
if strings.HasPrefix(t.Name, prefix[0]) {
|
||||
items = append(items, t)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
return ts.layers
|
||||
return items
|
||||
}
|
||||
|
||||
func (ts Tensors) GroupLayers() map[string]Layer {
|
||||
layers := make(map[string]Layer)
|
||||
for _, t := range ts.items {
|
||||
parts := strings.Split(t.Name, ".")
|
||||
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
|
||||
if len(parts) > index+2 {
|
||||
// blk and mm should have a number after them, join it
|
||||
parts = append(
|
||||
[]string{strings.Join(parts[:index+2], ".")},
|
||||
parts[index+2:]...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := layers[parts[0]]; !ok {
|
||||
layers[parts[0]] = make(Layer)
|
||||
}
|
||||
|
||||
layers[parts[0]][strings.Join(parts[1:], ".")] = t
|
||||
}
|
||||
|
||||
return layers
|
||||
}
|
||||
|
||||
type Layer map[string]*Tensor
|
||||
|
||||
func (l Layer) size() (size uint64) {
|
||||
func (l Layer) Size() (size uint64) {
|
||||
for _, t := range l {
|
||||
size += t.Size()
|
||||
}
|
||||
@@ -255,8 +276,6 @@ func (t Tensor) typeSize() uint64 {
|
||||
return 8
|
||||
case 29: // IQ1_M
|
||||
return blockSize/8 + blockSize/16 + blockSize/32
|
||||
case 30: // BF16
|
||||
return 2
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
@@ -295,7 +314,7 @@ const (
|
||||
|
||||
var ErrUnsupportedFormat = errors.New("unsupported model format")
|
||||
|
||||
func DetectGGMLType(b []byte) string {
|
||||
func DetectContentType(b []byte) string {
|
||||
switch binary.LittleEndian.Uint32(b[:4]) {
|
||||
case FILE_MAGIC_GGML:
|
||||
return "ggml"
|
||||
@@ -312,12 +331,12 @@ func DetectGGMLType(b []byte) string {
|
||||
}
|
||||
}
|
||||
|
||||
// DecodeGGML decodes a GGML model from the given reader.
|
||||
// Decode decodes a GGML model from the given reader.
|
||||
//
|
||||
// It collects array values for arrays with a size less than or equal to
|
||||
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
||||
// the maxArraySize is negative, all arrays are collected.
|
||||
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||
if maxArraySize == 0 {
|
||||
maxArraySize = 1024
|
||||
}
|
||||
@@ -331,10 +350,6 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||
|
||||
var c container
|
||||
switch magic {
|
||||
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
|
||||
return nil, 0, ErrUnsupportedFormat
|
||||
case FILE_MAGIC_GGLA:
|
||||
c = &containerGGLA{}
|
||||
case FILE_MAGIC_GGUF_LE:
|
||||
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
|
||||
case FILE_MAGIC_GGUF_BE:
|
||||
@@ -360,22 +375,22 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||
}, offset, nil
|
||||
}
|
||||
|
||||
func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
|
||||
embedding := llm.KV().EmbeddingLength()
|
||||
heads := llm.KV().HeadCount()
|
||||
headsKV := llm.KV().HeadCountKV()
|
||||
vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
|
||||
func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
|
||||
embedding := f.KV().EmbeddingLength()
|
||||
heads := f.KV().HeadCount()
|
||||
headsKV := f.KV().HeadCountKV()
|
||||
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
|
||||
|
||||
embeddingHeads := llm.KV().EmbeddingHeadCount()
|
||||
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
|
||||
embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
|
||||
embeddingHeads := f.KV().EmbeddingHeadCount()
|
||||
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
|
||||
embeddingHeadsV := f.KV().EmbeddingHeadCountV()
|
||||
|
||||
layers := llm.Tensors().Layers()
|
||||
layers := f.Tensors().GroupLayers()
|
||||
|
||||
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
|
||||
kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
|
||||
kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
|
||||
|
||||
switch llm.KV().Architecture() {
|
||||
switch f.KV().Architecture() {
|
||||
case "llama":
|
||||
fullOffload = max(
|
||||
4*batch*(1+4*embedding+context*(1+heads)),
|
||||
@@ -390,7 +405,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
|
||||
|
||||
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
|
||||
// mixtral 8x22b
|
||||
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
|
||||
ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
|
||||
partialOffload = max(
|
||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
|
||||
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
|
||||
@@ -407,11 +422,11 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
|
||||
case "mllama":
|
||||
var visionTokens, tiles uint64 = 1601, 4
|
||||
|
||||
if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
|
||||
if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
|
||||
kv = headsKV *
|
||||
(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
|
||||
(2* // sizeof(float16)
|
||||
(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
|
||||
(f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
|
||||
context +
|
||||
4* // sizeof(float32)
|
||||
uint64(crossAttentionLayers.size)* // num cross attention layers
|
||||
@@ -426,7 +441,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
|
||||
)
|
||||
|
||||
var ropeFreqsCount uint64
|
||||
if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
|
||||
if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
|
||||
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
|
||||
ropeFreqsCount = ropeFreqsWeights.parameters()
|
||||
}
|
||||
@@ -530,21 +545,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
|
||||
}
|
||||
|
||||
// SupportsKVCacheType checks if the requested cache type is supported
|
||||
func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
|
||||
validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
|
||||
return slices.Contains(validKVCacheTypes, cacheType)
|
||||
func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
||||
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
|
||||
}
|
||||
|
||||
// SupportsFlashAttention checks if the model supports flash attention
|
||||
func (ggml GGML) SupportsFlashAttention() bool {
|
||||
_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
|
||||
func (f GGML) SupportsFlashAttention() bool {
|
||||
_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
|
||||
if isEmbedding {
|
||||
return false
|
||||
}
|
||||
|
||||
// Check head counts match and are non-zero
|
||||
headCountK := ggml.KV().EmbeddingHeadCountK()
|
||||
headCountV := ggml.KV().EmbeddingHeadCountV()
|
||||
headCountK := f.KV().EmbeddingHeadCountK()
|
||||
headCountV := f.KV().EmbeddingHeadCountV()
|
||||
return headCountK != 0 && headCountV != 0 && headCountK == headCountV
|
||||
}
|
||||
|
||||
159
fs/ggml/ggml_test.go
Normal file
159
fs/ggml/ggml_test.go
Normal file
@@ -0,0 +1,159 @@
|
||||
package ggml
|
||||
|
||||
import (
|
||||
"maps"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestTensorLayers(t *testing.T) {
|
||||
tensors := make(map[string]*Tensor)
|
||||
for _, name := range []string{
|
||||
"token_embd.weight",
|
||||
"blk.0.attn_k.weight",
|
||||
"blk.0.attn_output.weight",
|
||||
"blk.0.attn_q.weight",
|
||||
"blk.0.attn_v.weight",
|
||||
"blk.0.attn_norm.weight",
|
||||
"blk.0.ffn_down.weight",
|
||||
"blk.0.ffn_gate.weight",
|
||||
"blk.0.ffn_up.weight",
|
||||
"blk.0.ffn_norm.weight",
|
||||
"output_norm.weight",
|
||||
"mm.0.bias",
|
||||
"mm.0.weight",
|
||||
"v.blk.0.attn_k.weight",
|
||||
"v.blk.0.attn_output.weight",
|
||||
"v.blk.0.attn_q.weight",
|
||||
"v.blk.0.attn_v.weight",
|
||||
"v.blk.0.attn_norm.weight",
|
||||
"v.blk.0.ffn_down.weight",
|
||||
"v.blk.0.ffn_gate.weight",
|
||||
"v.blk.0.ffn_up.weight",
|
||||
"v.blk.0.ffn_norm.weight",
|
||||
"v.patch_embd.weight",
|
||||
"v.position_embd.gate",
|
||||
"v.position_embd.weight",
|
||||
} {
|
||||
tensors[name] = &Tensor{Name: name}
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
items []*Tensor
|
||||
want map[string]Layer
|
||||
}{
|
||||
{
|
||||
name: "text",
|
||||
items: slices.Collect(func(yield func(*Tensor) bool) {
|
||||
for k, v := range tensors {
|
||||
if !strings.HasPrefix(k, "mm.") && !strings.HasPrefix(k, "v.") {
|
||||
if !yield(v) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}),
|
||||
want: map[string]Layer{
|
||||
"blk.0": {
|
||||
"attn_k.weight": tensors["blk.0.attn_k.weight"],
|
||||
"attn_q.weight": tensors["blk.0.attn_q.weight"],
|
||||
"attn_v.weight": tensors["blk.0.attn_v.weight"],
|
||||
"attn_output.weight": tensors["blk.0.attn_output.weight"],
|
||||
"attn_norm.weight": tensors["blk.0.attn_norm.weight"],
|
||||
"ffn_down.weight": tensors["blk.0.ffn_down.weight"],
|
||||
"ffn_gate.weight": tensors["blk.0.ffn_gate.weight"],
|
||||
"ffn_up.weight": tensors["blk.0.ffn_up.weight"],
|
||||
"ffn_norm.weight": tensors["blk.0.ffn_norm.weight"],
|
||||
},
|
||||
"token_embd": {"weight": tensors["token_embd.weight"]},
|
||||
"output_norm": {"weight": tensors["output_norm.weight"]},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "vision",
|
||||
items: slices.Collect(func(yield func(*Tensor) bool) {
|
||||
for k, v := range tensors {
|
||||
if strings.HasPrefix(k, "mm.") || strings.HasPrefix(k, "v.") {
|
||||
if !yield(v) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}),
|
||||
want: map[string]Layer{
|
||||
"mm.0": {
|
||||
"bias": tensors["mm.0.bias"],
|
||||
"weight": tensors["mm.0.weight"],
|
||||
},
|
||||
"v.blk.0": {
|
||||
"attn_k.weight": tensors["v.blk.0.attn_k.weight"],
|
||||
"attn_q.weight": tensors["v.blk.0.attn_q.weight"],
|
||||
"attn_v.weight": tensors["v.blk.0.attn_v.weight"],
|
||||
"attn_output.weight": tensors["v.blk.0.attn_output.weight"],
|
||||
"attn_norm.weight": tensors["v.blk.0.attn_norm.weight"],
|
||||
"ffn_down.weight": tensors["v.blk.0.ffn_down.weight"],
|
||||
"ffn_gate.weight": tensors["v.blk.0.ffn_gate.weight"],
|
||||
"ffn_up.weight": tensors["v.blk.0.ffn_up.weight"],
|
||||
"ffn_norm.weight": tensors["v.blk.0.ffn_norm.weight"],
|
||||
},
|
||||
"v": {
|
||||
"patch_embd.weight": tensors["v.patch_embd.weight"],
|
||||
"position_embd.gate": tensors["v.position_embd.gate"],
|
||||
"position_embd.weight": tensors["v.position_embd.weight"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "vision and text",
|
||||
items: slices.Collect(maps.Values(tensors)),
|
||||
want: map[string]Layer{
|
||||
"blk.0": {
|
||||
"attn_k.weight": tensors["blk.0.attn_k.weight"],
|
||||
"attn_q.weight": tensors["blk.0.attn_q.weight"],
|
||||
"attn_v.weight": tensors["blk.0.attn_v.weight"],
|
||||
"attn_output.weight": tensors["blk.0.attn_output.weight"],
|
||||
"attn_norm.weight": tensors["blk.0.attn_norm.weight"],
|
||||
"ffn_down.weight": tensors["blk.0.ffn_down.weight"],
|
||||
"ffn_gate.weight": tensors["blk.0.ffn_gate.weight"],
|
||||
"ffn_up.weight": tensors["blk.0.ffn_up.weight"],
|
||||
"ffn_norm.weight": tensors["blk.0.ffn_norm.weight"],
|
||||
},
|
||||
"token_embd": {"weight": tensors["token_embd.weight"]},
|
||||
"output_norm": {"weight": tensors["output_norm.weight"]},
|
||||
"mm.0": {
|
||||
"bias": tensors["mm.0.bias"],
|
||||
"weight": tensors["mm.0.weight"],
|
||||
},
|
||||
"v.blk.0": {
|
||||
"attn_k.weight": tensors["v.blk.0.attn_k.weight"],
|
||||
"attn_q.weight": tensors["v.blk.0.attn_q.weight"],
|
||||
"attn_v.weight": tensors["v.blk.0.attn_v.weight"],
|
||||
"attn_output.weight": tensors["v.blk.0.attn_output.weight"],
|
||||
"attn_norm.weight": tensors["v.blk.0.attn_norm.weight"],
|
||||
"ffn_down.weight": tensors["v.blk.0.ffn_down.weight"],
|
||||
"ffn_gate.weight": tensors["v.blk.0.ffn_gate.weight"],
|
||||
"ffn_up.weight": tensors["v.blk.0.ffn_up.weight"],
|
||||
"ffn_norm.weight": tensors["v.blk.0.ffn_norm.weight"],
|
||||
},
|
||||
"v": {
|
||||
"patch_embd.weight": tensors["v.patch_embd.weight"],
|
||||
"position_embd.gate": tensors["v.position_embd.gate"],
|
||||
"position_embd.weight": tensors["v.position_embd.weight"],
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := Tensors{items: tt.items}.GroupLayers()
|
||||
if diff := cmp.Diff(got, tt.want); diff != "" {
|
||||
t.Errorf("unexpected layers (-got +want):\n%s", diff)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package llm
|
||||
package ggml
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
@@ -8,10 +8,9 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"maps"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/maps"
|
||||
)
|
||||
|
||||
type containerGGUF struct {
|
||||
@@ -110,9 +109,9 @@ func (llm *gguf) KV() KV {
|
||||
return llm.kv
|
||||
}
|
||||
|
||||
func (llm *gguf) Tensors() *Tensors {
|
||||
return &Tensors{
|
||||
Items: llm.tensors,
|
||||
func (llm *gguf) Tensors() Tensors {
|
||||
return Tensors{
|
||||
items: llm.tensors,
|
||||
Offset: llm.tensorOffset,
|
||||
}
|
||||
}
|
||||
@@ -523,7 +522,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
|
||||
return err
|
||||
}
|
||||
|
||||
keys := maps.Keys(kv)
|
||||
keys := slices.Collect(maps.Keys(kv))
|
||||
slices.Sort(keys)
|
||||
|
||||
for _, key := range keys {
|
||||
@@ -1,4 +1,4 @@
|
||||
package llm
|
||||
package ggml
|
||||
|
||||
import "fmt"
|
||||
|
||||
@@ -98,10 +98,10 @@ func ParseFileType(s string) (fileType, error) {
|
||||
return fileTypeIQ3_M, nil
|
||||
case "IQ2_S":
|
||||
return fileTypeIQ2_S, nil
|
||||
case "IQ4_XS":
|
||||
return fileTypeIQ4_XS, nil
|
||||
case "IQ2_M":
|
||||
return fileTypeIQ2_M, nil
|
||||
case "IQ4_XS":
|
||||
return fileTypeIQ4_XS, nil
|
||||
case "IQ1_M":
|
||||
return fileTypeIQ1_M, nil
|
||||
case "BF16":
|
||||
54
kvcache/cache.go
Normal file
54
kvcache/cache.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package kvcache
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrKvCacheFull = errors.New("could not find a kv cache slot")
|
||||
ErrNotSupported = errors.New("model does not support operation")
|
||||
)
|
||||
|
||||
type Cache interface {
|
||||
// ** used by model implementations **
|
||||
|
||||
// SetLayer sets the active layer of the cache
|
||||
SetLayer(layer int)
|
||||
|
||||
// Get returns the history of key and value tensors plus a mask
|
||||
//
|
||||
// The shape of the tensors is documented in the specific
|
||||
// cache implementation used.
|
||||
Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor)
|
||||
|
||||
// Put stores a batch of key and value in the cache
|
||||
//
|
||||
// The shape of the tensors is documented in the specific
|
||||
// cache implementation used.
|
||||
Put(ctx ml.Context, key, value ml.Tensor)
|
||||
|
||||
// ** cache management **
|
||||
|
||||
// Init sets up runtime parameters
|
||||
Init(backend ml.Backend, dtype ml.DType, capacity int32)
|
||||
|
||||
// Close closes the cache and frees resources associated with it
|
||||
Close()
|
||||
|
||||
// StartForward is called before the start of the model's forward pass.
|
||||
// For each token in the coming batch, there must be a corresponding
|
||||
// entry in positions and seqs.
|
||||
StartForward(ctx ml.Context, positions []int32, seqs []int) error
|
||||
|
||||
// CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq
|
||||
CopyPrefix(srcSeq, dstSeq int, len int32)
|
||||
|
||||
// Remove deletes tokens in the range [beginIndex, endIndex) from seq. Set
|
||||
// endIndex to math.MaxInt32 to remove everything starting at beginIndex.
|
||||
//
|
||||
// If an error occurs, the entire context for the sequence should be
|
||||
// removed by calling Remove(seq, 0, math.MaxInt32)
|
||||
Remove(seq int, beginIndex, endIndex int32) error
|
||||
}
|
||||
455
kvcache/causal.go
Normal file
455
kvcache/causal.go
Normal file
@@ -0,0 +1,455 @@
|
||||
package kvcache
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)
|
||||
|
||||
// Causal cache stores K and V tensors according to their position in the
|
||||
// sequence. Returns the history and a mask for attending to past tokens
|
||||
//
|
||||
// The tensors are of shape embed dim, kv heads, batch size
|
||||
// The mask is of shape history size, batch size
|
||||
type Causal struct {
|
||||
DType ml.DType
|
||||
Capacity int32
|
||||
windowSize int32
|
||||
|
||||
// ** current forward pass **
|
||||
|
||||
// the active layer for Get and Put
|
||||
curLayer int
|
||||
|
||||
// starting location for data storage for this batch
|
||||
curLoc int
|
||||
|
||||
// size of the current batch
|
||||
curBatchSize int
|
||||
|
||||
// mask of the cache as used by this batch
|
||||
curMask ml.Tensor
|
||||
|
||||
// locations in the cache that are needed for this batch
|
||||
curCellRange cellRange
|
||||
|
||||
// ** cache metadata **
|
||||
|
||||
// for each possible location in the cache, stores the position and set of sequences
|
||||
// that reference the data there
|
||||
cells []cacheCell
|
||||
|
||||
// maps from sequence to the range of locations where it is stored in the cache
|
||||
cellRanges map[int]cellRange
|
||||
|
||||
// ** cache data storage **
|
||||
|
||||
shiftFn shiftFn
|
||||
backend ml.Backend
|
||||
cacheCtx ml.Context
|
||||
keys, values []ml.Tensor
|
||||
}
|
||||
|
||||
type cacheCell struct {
|
||||
pos int32
|
||||
sequences []int
|
||||
}
|
||||
|
||||
type cellRange struct {
|
||||
min int
|
||||
max int
|
||||
}
|
||||
|
||||
func NewCausalCache(shift shiftFn) *Causal {
|
||||
return &Causal{windowSize: math.MaxInt32, shiftFn: shift}
|
||||
}
|
||||
|
||||
func NewSWACache(windowSize int32, shift shiftFn) *Causal {
|
||||
return &Causal{windowSize: windowSize, shiftFn: shift}
|
||||
}
|
||||
|
||||
func (c *Causal) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
|
||||
c.DType = dtype
|
||||
c.Capacity = capacity
|
||||
c.cells = make([]cacheCell, capacity)
|
||||
c.cellRanges = make(map[int]cellRange)
|
||||
c.backend = backend
|
||||
c.cacheCtx = backend.NewContext()
|
||||
}
|
||||
|
||||
func (c *Causal) Close() {
|
||||
c.cacheCtx.Close()
|
||||
}
|
||||
|
||||
func (c *Causal) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
|
||||
c.curBatchSize = len(positions)
|
||||
|
||||
var err error
|
||||
c.curLoc, err = c.findStartLoc()
|
||||
if errors.Is(err, ErrKvCacheFull) {
|
||||
c.defrag()
|
||||
c.curLoc, err = c.findStartLoc()
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
c.curCellRange = newRange()
|
||||
for i, pos := range positions {
|
||||
seq := seqs[i]
|
||||
|
||||
c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
|
||||
|
||||
seqRange, ok := c.cellRanges[seq]
|
||||
if !ok {
|
||||
seqRange = newRange()
|
||||
}
|
||||
|
||||
if c.curLoc+i > seqRange.max {
|
||||
seqRange.max = c.curLoc + i
|
||||
}
|
||||
if seqRange.max > c.curCellRange.max {
|
||||
c.curCellRange.max = seqRange.max
|
||||
}
|
||||
|
||||
if c.curLoc+i < seqRange.min {
|
||||
seqRange.min = c.curLoc + i
|
||||
}
|
||||
if seqRange.min < c.curCellRange.min {
|
||||
c.curCellRange.min = seqRange.min
|
||||
}
|
||||
c.cellRanges[seq] = seqRange
|
||||
}
|
||||
|
||||
c.curMask, err = c.buildMask(ctx, positions, seqs)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func newRange() cellRange {
|
||||
return cellRange{
|
||||
min: math.MaxInt,
|
||||
max: 0,
|
||||
}
|
||||
}
|
||||
|
||||
// Find the first contiguous block of at least curBatchSize
|
||||
func (c *Causal) findStartLoc() (int, error) {
|
||||
var start, count int
|
||||
for i := range c.cells {
|
||||
if len(c.cells[i].sequences) == 0 {
|
||||
count++
|
||||
if count >= c.curBatchSize {
|
||||
return start, nil
|
||||
}
|
||||
} else {
|
||||
start = i + 1
|
||||
count = 0
|
||||
}
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity)
|
||||
}
|
||||
|
||||
// Builds a mask of history x batch indicating whether for each token in the batch the
|
||||
// token in the history should apply. This is based on both the sequence and causality (the
|
||||
// position of the history is not ahead of the token in the batch).
|
||||
func (c *Causal) buildMask(ctx ml.Context, positions []int32, seqs []int) (ml.Tensor, error) {
|
||||
// TODO(jessegross): This does not do padding, which is required for flash attention
|
||||
len := c.curCellRange.max - c.curCellRange.min + 1
|
||||
mask := make([]float32, c.curBatchSize*len)
|
||||
|
||||
for i := range c.curBatchSize {
|
||||
for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
|
||||
if !slices.Contains(c.cells[j].sequences, seqs[i]) || c.cells[j].pos > positions[i] ||
|
||||
c.cells[j].pos < positions[i]-c.windowSize {
|
||||
mask[i*len+(j-c.curCellRange.min)] = float32(math.Inf(-1))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ctx.FromFloatSlice(mask, len, c.curBatchSize)
|
||||
}
|
||||
|
||||
func moveCell(ctx ml.Context, objs []ml.Tensor, src, dst, len int) {
|
||||
for _, obj := range objs {
|
||||
if obj == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
srcView := obj.View(ctx, obj.Stride(2)*src, obj.Dim(0)*obj.Dim(1)*len)
|
||||
dstView := obj.View(ctx, obj.Stride(2)*dst, obj.Dim(0)*obj.Dim(1)*len)
|
||||
|
||||
ctx.Forward(srcView.Copy(ctx, dstView))
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Causal) defrag() {
|
||||
slog.Debug("defragmenting kv cache")
|
||||
|
||||
// Defrag strategy:
|
||||
// - Search for empty holes at the beginning of the cache,
|
||||
// filling them with active data starting at the end
|
||||
// - If there are contiguous elements that need to be moved,
|
||||
// combine them into a single operation by holding new moves
|
||||
// until we see that the next one is non-contiguous
|
||||
// - Fill up the context with the maximum number of operations it
|
||||
// can hold then compute that and continue with a new context
|
||||
//
|
||||
// We could try to optimize placement by grouping blocks from
|
||||
// the same sequences together but most likely the next forward
|
||||
// pass will disrupt this anyways, so the real world benefit
|
||||
// seems limited as this time.
|
||||
|
||||
ctx := c.backend.NewContext()
|
||||
|
||||
// For every move, 6 tensors are required per layer (2 views and a
|
||||
// copy for each of k and v).
|
||||
layers := 0
|
||||
for _, key := range c.keys {
|
||||
if key == nil {
|
||||
continue
|
||||
}
|
||||
layers++
|
||||
}
|
||||
|
||||
maxMoves := ctx.MaxTensors() / (6 * layers)
|
||||
moves := 0
|
||||
|
||||
var pendingSrc, pendingDst, pendingLen int
|
||||
src := len(c.cells) - 1
|
||||
|
||||
for dst := 0; dst < src; dst++ {
|
||||
if len(c.cells[dst].sequences) == 0 {
|
||||
for ; src > dst; src-- {
|
||||
if len(c.cells[src].sequences) != 0 {
|
||||
c.cells[dst] = c.cells[src]
|
||||
c.cells[src] = cacheCell{}
|
||||
|
||||
if pendingLen > 0 {
|
||||
if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
|
||||
pendingSrc = src
|
||||
pendingLen++
|
||||
break
|
||||
} else {
|
||||
moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
|
||||
moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
|
||||
moves++
|
||||
}
|
||||
}
|
||||
|
||||
pendingSrc = src
|
||||
pendingDst = dst
|
||||
pendingLen = 1
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if moves >= maxMoves {
|
||||
ctx.Compute()
|
||||
ctx.Close()
|
||||
ctx = c.backend.NewContext()
|
||||
|
||||
moves = 0
|
||||
}
|
||||
}
|
||||
|
||||
if pendingLen > 0 {
|
||||
moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
|
||||
moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
|
||||
moves++
|
||||
}
|
||||
|
||||
if moves > 0 {
|
||||
ctx.Compute()
|
||||
}
|
||||
ctx.Close()
|
||||
|
||||
// Reset range metadata
|
||||
for seq := range c.cellRanges {
|
||||
seqRange := newRange()
|
||||
|
||||
for i, cell := range c.cells {
|
||||
if slices.Contains(cell.sequences, seq) {
|
||||
if i < seqRange.min {
|
||||
seqRange.min = i
|
||||
}
|
||||
if i > seqRange.max {
|
||||
seqRange.max = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.cellRanges[seq] = seqRange
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Causal) SetLayer(layer int) {
|
||||
if layer >= len(c.keys) {
|
||||
c.keys = append(c.keys, make([]ml.Tensor, layer-len(c.keys)+1)...)
|
||||
c.values = append(c.values, make([]ml.Tensor, layer-len(c.values)+1)...)
|
||||
}
|
||||
|
||||
c.curLayer = layer
|
||||
}
|
||||
|
||||
func (c *Causal) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
|
||||
key := c.keys[c.curLayer]
|
||||
value := c.values[c.curLayer]
|
||||
|
||||
key = key.View(ctx, key.Stride(2)*c.curCellRange.min,
|
||||
key.Dim(0), key.Stride(1),
|
||||
key.Dim(1), key.Stride(2),
|
||||
c.curMask.Dim(0),
|
||||
)
|
||||
|
||||
value = value.View(ctx, key.Stride(2)*c.curCellRange.min,
|
||||
value.Dim(0), value.Stride(1),
|
||||
value.Dim(1), value.Stride(2),
|
||||
c.curMask.Dim(0),
|
||||
)
|
||||
|
||||
return key, value, c.curMask
|
||||
}
|
||||
|
||||
func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
if c.curBatchSize != key.Dim(2) {
|
||||
panic(fmt.Errorf("inconsistent batch sizes (layer: %v, batch size: %v layer batch size: %v)", c.curLayer, c.curBatchSize, key.Dim(2)))
|
||||
}
|
||||
|
||||
if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
|
||||
c.keys[c.curLayer] = c.cacheCtx.Zeros(c.DType, key.Dim(0), key.Dim(1), int(c.Capacity))
|
||||
c.values[c.curLayer] = c.cacheCtx.Zeros(c.DType, value.Dim(0), value.Dim(1), int(c.Capacity))
|
||||
}
|
||||
|
||||
ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, c.keys[c.curLayer].Stride(2)*c.curLoc, key.Dim(0)*key.Dim(1)*key.Dim(2))))
|
||||
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, c.values[c.curLayer].Stride(2)*c.curLoc, value.Dim(0)*value.Dim(1)*value.Dim(2))))
|
||||
}
|
||||
|
||||
func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
|
||||
seqRange := newRange()
|
||||
|
||||
for i := range c.cells {
|
||||
// Remove the contents of dstSeq so that we only have the copied prefix, metadata will be reset at the end
|
||||
if slices.Contains(c.cells[i].sequences, dstSeq) {
|
||||
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == dstSeq })
|
||||
}
|
||||
|
||||
if slices.Contains(c.cells[i].sequences, srcSeq) && c.cells[i].pos < len {
|
||||
c.cells[i].sequences = append(c.cells[i].sequences, dstSeq)
|
||||
if i < seqRange.min {
|
||||
seqRange.min = i
|
||||
}
|
||||
if i > seqRange.max {
|
||||
seqRange.max = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.cellRanges[dstSeq] = seqRange
|
||||
}
|
||||
|
||||
func (c *Causal) shift(seq int, beginIndex, offset int32) error {
|
||||
if c.shiftFn == nil {
|
||||
return ErrNotSupported
|
||||
}
|
||||
|
||||
ctx := c.backend.NewContext()
|
||||
defer ctx.Close()
|
||||
|
||||
seqRange := c.cellRanges[seq]
|
||||
size := seqRange.max - seqRange.min + 1
|
||||
|
||||
offsets := make([]int32, size)
|
||||
for i := range offsets {
|
||||
cell := c.cells[seqRange.min+i]
|
||||
|
||||
if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
|
||||
offsets[i] = offset
|
||||
}
|
||||
}
|
||||
|
||||
kShift, err := ctx.FromIntSlice(offsets, len(offsets))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for i, key := range c.keys {
|
||||
if key == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
key = key.View(ctx, key.Stride(2)*seqRange.min,
|
||||
key.Dim(0), key.Stride(1),
|
||||
key.Dim(1), key.Stride(2),
|
||||
size,
|
||||
)
|
||||
|
||||
roped, err := c.shiftFn(ctx, i, key, kShift)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ctx.Forward(roped.Copy(ctx, key))
|
||||
}
|
||||
|
||||
ctx.Compute()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Causal) Remove(seq int, beginIndex, endIndex int32) error {
|
||||
var offset int32
|
||||
if endIndex != math.MaxInt32 {
|
||||
offset = beginIndex - endIndex
|
||||
}
|
||||
|
||||
seqRange := newRange()
|
||||
|
||||
for i := range c.cells {
|
||||
if slices.Contains(c.cells[i].sequences, seq) {
|
||||
if c.cells[i].pos >= beginIndex && c.cells[i].pos < endIndex {
|
||||
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
|
||||
} else {
|
||||
if c.cells[i].pos >= endIndex {
|
||||
if slices.ContainsFunc(c.cells[i].sequences, func(s int) bool { return s != seq }) {
|
||||
// TODO(jessegross): Need to be careful about data shared between sequences
|
||||
return errors.New("shifting on cells shared by multiple sequences not yet implemented")
|
||||
}
|
||||
|
||||
c.cells[i].pos += offset
|
||||
}
|
||||
if i < seqRange.min {
|
||||
seqRange.min = i
|
||||
}
|
||||
if i > seqRange.max {
|
||||
seqRange.max = i
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if seqRange == newRange() {
|
||||
delete(c.cellRanges, seq)
|
||||
return nil
|
||||
}
|
||||
|
||||
c.cellRanges[seq] = seqRange
|
||||
|
||||
if endIndex != math.MaxInt32 {
|
||||
err := c.shift(seq, endIndex+offset, offset)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
510
kvcache/causal_test.go
Normal file
510
kvcache/causal_test.go
Normal file
@@ -0,0 +1,510 @@
|
||||
package kvcache
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
type testCase struct {
|
||||
name string
|
||||
in []float32
|
||||
inShape []int
|
||||
seqs []int
|
||||
pos []int32
|
||||
expected []float32
|
||||
expectedShape []int
|
||||
expectedMask []float32
|
||||
}
|
||||
|
||||
func TestStore(t *testing.T) {
|
||||
backend := &testBackend{}
|
||||
cache := NewCausalCache(nil)
|
||||
defer cache.Close()
|
||||
|
||||
cache.Init(backend, ml.DTypeF16, 16)
|
||||
|
||||
tests := []testCase{
|
||||
{
|
||||
name: "FirstBatch",
|
||||
in: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234},
|
||||
inShape: []int{2, 3, 4},
|
||||
seqs: []int{0, 0, 0, 0},
|
||||
pos: []int32{0, 1, 2, 3},
|
||||
expected: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234},
|
||||
expectedShape: []int{2, 3, 4},
|
||||
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
name: "SecondBatch",
|
||||
in: []float32{115, 215, 125, 225, 135, 235},
|
||||
inShape: []int{2, 3, 1},
|
||||
seqs: []int{0},
|
||||
pos: []int32{4},
|
||||
expected: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234, 115, 215, 125, 225, 135, 235},
|
||||
expectedShape: []int{2, 3, 5},
|
||||
expectedMask: []float32{0, 0, 0, 0, 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
}
|
||||
|
||||
func TestSWA(t *testing.T) {
|
||||
backend := &testBackend{}
|
||||
cache := NewSWACache(1, nil)
|
||||
defer cache.Close()
|
||||
|
||||
cache.Init(backend, ml.DTypeF32, 16)
|
||||
|
||||
tests := []testCase{
|
||||
{
|
||||
name: "SlidingWindow",
|
||||
in: []float32{1, 2, 3, 4},
|
||||
inShape: []int{1, 1, 4},
|
||||
seqs: []int{0, 0, 0, 0},
|
||||
pos: []int32{0, 1, 2, 3},
|
||||
expected: []float32{1, 2, 3, 4},
|
||||
expectedShape: []int{1, 1, 4},
|
||||
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
}
|
||||
|
||||
func TestSequences(t *testing.T) {
|
||||
backend := &testBackend{}
|
||||
cache := NewCausalCache(nil)
|
||||
defer cache.Close()
|
||||
|
||||
cache.Init(backend, ml.DTypeF16, 16)
|
||||
|
||||
tests := []testCase{
|
||||
{
|
||||
name: "FirstBatch",
|
||||
in: []float32{1, 2, 3, 4},
|
||||
inShape: []int{1, 1, 4},
|
||||
seqs: []int{0, 0, 1, 1},
|
||||
pos: []int32{0, 1, 0, 1},
|
||||
expected: []float32{1, 2, 3, 4},
|
||||
expectedShape: []int{1, 1, 4},
|
||||
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
|
||||
},
|
||||
{
|
||||
name: "SecondBatch",
|
||||
in: []float32{5, 6},
|
||||
inShape: []int{1, 1, 2},
|
||||
seqs: []int{0, 1},
|
||||
pos: []int32{2, 2},
|
||||
expected: []float32{1, 2, 3, 4, 5, 6},
|
||||
expectedShape: []int{1, 1, 6},
|
||||
expectedMask: []float32{0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
}
|
||||
|
||||
func TestRemove(t *testing.T) {
|
||||
backend := &testBackend{}
|
||||
cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return key.Add(ctx, shift), nil
|
||||
})
|
||||
defer cache.Close()
|
||||
|
||||
cache.Init(backend, ml.DTypeF16, 16)
|
||||
|
||||
tests := []testCase{
|
||||
{
|
||||
name: "FirstBatch",
|
||||
in: []float32{1, 2, 3, 4},
|
||||
inShape: []int{1, 1, 4},
|
||||
seqs: []int{0, 0, 1, 1},
|
||||
pos: []int32{0, 1, 0, 1},
|
||||
expected: []float32{1, 2, 3, 4},
|
||||
expectedShape: []int{1, 1, 4},
|
||||
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
|
||||
err := cache.Remove(0, 1, math.MaxInt32)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
tests = []testCase{
|
||||
{
|
||||
name: "RemoveEnd",
|
||||
in: []float32{5, 6},
|
||||
inShape: []int{1, 1, 2},
|
||||
seqs: []int{0, 1},
|
||||
pos: []int32{1, 2},
|
||||
expected: []float32{1, 2, 3, 4, 5, 6},
|
||||
expectedShape: []int{1, 1, 6},
|
||||
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
|
||||
err = cache.Remove(0, 0, 1)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
tests = []testCase{
|
||||
{
|
||||
name: "RemoveMiddle",
|
||||
in: []float32{7, 8},
|
||||
inShape: []int{1, 1, 2},
|
||||
seqs: []int{0, 0},
|
||||
pos: []int32{1, 2},
|
||||
expected: []float32{7, 8, 3, 4, 4},
|
||||
expectedShape: []int{1, 1, 5},
|
||||
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
}
|
||||
|
||||
func TestDefrag(t *testing.T) {
|
||||
backend := &testBackend{}
|
||||
cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return key.Add(ctx, shift), nil
|
||||
})
|
||||
defer cache.Close()
|
||||
|
||||
cache.Init(backend, ml.DTypeF16, 16)
|
||||
|
||||
tests := []testCase{
|
||||
{
|
||||
name: "FirstBatch",
|
||||
in: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
|
||||
inShape: []int{1, 1, 16},
|
||||
seqs: []int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
pos: []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
expected: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
|
||||
expectedShape: []int{1, 1, 16},
|
||||
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
|
||||
err := cache.Remove(0, 2, 4)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = cache.Remove(0, 13, math.MaxInt32)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
tests = []testCase{
|
||||
{
|
||||
name: "Defrag",
|
||||
in: []float32{17, 18, 19},
|
||||
inShape: []int{1, 1, 3},
|
||||
seqs: []int{0, 0, 0},
|
||||
pos: []int32{16, 17, 18},
|
||||
expected: []float32{1, 2, 12, 13, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 19},
|
||||
expectedShape: []int{1, 1, 16},
|
||||
expectedMask: []float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
}
|
||||
|
||||
func TestCopy(t *testing.T) {
|
||||
backend := &testBackend{}
|
||||
cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { return key, nil })
|
||||
defer cache.Close()
|
||||
|
||||
cache.Init(backend, ml.DTypeF16, 16)
|
||||
|
||||
tests := []testCase{
|
||||
{
|
||||
name: "FirstBatch",
|
||||
in: []float32{1, 2, 3, 4},
|
||||
inShape: []int{1, 1, 4},
|
||||
seqs: []int{0, 0, 0, 0},
|
||||
pos: []int32{0, 1, 2, 3},
|
||||
expected: []float32{1, 2, 3, 4},
|
||||
expectedShape: []int{1, 1, 4},
|
||||
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
|
||||
cache.CopyPrefix(0, 1, 2)
|
||||
|
||||
tests = []testCase{
|
||||
{
|
||||
name: "Copy",
|
||||
in: []float32{5, 6},
|
||||
inShape: []int{1, 1, 2},
|
||||
seqs: []int{1, 1},
|
||||
pos: []int32{3, 4},
|
||||
expected: []float32{1, 2, 3, 4, 5, 6},
|
||||
expectedShape: []int{1, 1, 6},
|
||||
expectedMask: []float32{0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
|
||||
},
|
||||
}
|
||||
|
||||
testCache(t, backend, cache, tests)
|
||||
}
|
||||
|
||||
func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase) {
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
context := backend.NewContext()
|
||||
defer context.Close()
|
||||
|
||||
err := cache.StartForward(context, test.pos, test.seqs)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
cache.SetLayer(0)
|
||||
tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
|
||||
cache.Put(context, tensor, tensor)
|
||||
|
||||
out, _, mask := cache.Get(context)
|
||||
|
||||
context.Forward(out)
|
||||
context.Forward(mask)
|
||||
context.Compute(out, mask)
|
||||
|
||||
if !slices.Equal(out.Floats(), test.expected) || !slices.Equal(out.Shape(), test.expectedShape) || !slices.Equal(mask.Floats(), test.expectedMask) {
|
||||
t.Errorf("TestCache: have %v (shape %v); want %v (shape %v); mask: have %v (shape %v) want %v", out.Floats(), out.Shape(), test.expected, test.expectedShape, mask.Floats(), mask.Shape(), test.expectedMask)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
type testBackend struct{}
|
||||
|
||||
func (b *testBackend) Config() ml.Config {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (b *testBackend) Get(name string) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (b *testBackend) NewContext() ml.Context {
|
||||
return &testContext{}
|
||||
}
|
||||
|
||||
func (b *testBackend) SystemInfo() string {
|
||||
return "not implemented"
|
||||
}
|
||||
|
||||
type testContext struct{}
|
||||
|
||||
func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
||||
total := 0
|
||||
|
||||
if len(shape) > 0 {
|
||||
total = 1
|
||||
for _, s := range shape {
|
||||
total *= s
|
||||
}
|
||||
}
|
||||
|
||||
return &testTensor{dtype: dtype, elementSize: 4, data: make([]float32, total), shape: shape}
|
||||
}
|
||||
|
||||
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||
t := c.Zeros(ml.DTypeF32, shape...).(*testTensor)
|
||||
|
||||
copy(t.data, s)
|
||||
|
||||
return t, nil
|
||||
}
|
||||
|
||||
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||
f := make([]float32, len(s))
|
||||
for i := range f {
|
||||
f[i] = float32(s[i])
|
||||
}
|
||||
|
||||
out, _ := c.FromFloatSlice(f, shape...)
|
||||
out.(*testTensor).dtype = ml.DTypeI32
|
||||
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (c *testContext) Forward(ml.Tensor) {}
|
||||
|
||||
func (c *testContext) Compute(...ml.Tensor) {}
|
||||
|
||||
func (c *testContext) MaxTensors() int {
|
||||
return 10
|
||||
}
|
||||
|
||||
func (c *testContext) Close() {}
|
||||
|
||||
type testTensor struct {
|
||||
dtype ml.DType
|
||||
elementSize int
|
||||
data []float32
|
||||
shape []int
|
||||
}
|
||||
|
||||
func (t *testTensor) Dim(n int) int {
|
||||
return t.shape[n]
|
||||
}
|
||||
|
||||
func (t *testTensor) Stride(n int) int {
|
||||
stride := t.elementSize
|
||||
for i := range n {
|
||||
stride *= t.shape[i]
|
||||
}
|
||||
|
||||
return stride
|
||||
}
|
||||
|
||||
func (t *testTensor) Shape() []int {
|
||||
return t.shape
|
||||
}
|
||||
|
||||
func (t *testTensor) DType() ml.DType {
|
||||
return t.dtype
|
||||
}
|
||||
|
||||
func (t *testTensor) Bytes() []byte {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Floats() []float32 {
|
||||
out := make([]float32, len(t.data))
|
||||
copy(out, t.data)
|
||||
return out
|
||||
}
|
||||
|
||||
func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
out := ctx.Zeros(t.DType(), t.Shape()...).(*testTensor)
|
||||
|
||||
for i := range out.data {
|
||||
out.data[i] = t.data[i] + t2.(*testTensor).data[i]
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (t *testTensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Softmax(ctx ml.Context) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) LayerNorm(ctx ml.Context, weight, bias ml.Tensor, eps float32) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) RMSNorm(ctx ml.Context, weight ml.Tensor, eps float32) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Scale(ctx ml.Context, s float64) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim uint32, base, scale float32) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Tanh(ctx ml.Context) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) GELU(ctx ml.Context) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) SILU(ctx ml.Context) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
|
||||
offset /= t.elementSize
|
||||
|
||||
var s []int
|
||||
|
||||
switch len(shape) {
|
||||
case 1:
|
||||
s = []int{shape[0]}
|
||||
case 5:
|
||||
s = []int{shape[0], shape[2], shape[4]}
|
||||
default:
|
||||
panic("unsupported number of dimensions")
|
||||
}
|
||||
|
||||
context := &testContext{}
|
||||
|
||||
view := context.Zeros(t.dtype, s...).(*testTensor)
|
||||
view.data = t.data[offset : offset+len(view.data)]
|
||||
|
||||
return view
|
||||
}
|
||||
|
||||
func (t *testTensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Contiguous(ctx ml.Context) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (t *testTensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
copy(t2.(*testTensor).data, t.data)
|
||||
return nil
|
||||
}
|
||||
97
kvcache/encoder.go
Normal file
97
kvcache/encoder.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package kvcache
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
// Encoder cache stores K and V tensors that are position independent
|
||||
//
|
||||
// The tensors can be of any shape and will be returned as they were stored
|
||||
// The mask is currently always nil
|
||||
//
|
||||
// Not currently safe for multiple sequences
|
||||
type EncoderCache struct {
|
||||
// ** current forward pass **
|
||||
|
||||
// the active layer for Get and Put
|
||||
curLayer int
|
||||
|
||||
// if something is stored during this pass, this
|
||||
// will be the position (but there is no guarantee
|
||||
// anything will be stored)
|
||||
curPos int32
|
||||
|
||||
// ** cache metadata **
|
||||
|
||||
// was something stored in the cache?
|
||||
encoderCached bool
|
||||
|
||||
// position of the cached data
|
||||
encoderPos int32
|
||||
|
||||
// ** cache data storage **
|
||||
|
||||
cacheCtx ml.Context
|
||||
keys, values []ml.Tensor
|
||||
}
|
||||
|
||||
func NewEncoderCache() *EncoderCache {
|
||||
return &EncoderCache{}
|
||||
}
|
||||
|
||||
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
|
||||
c.cacheCtx = backend.NewContext()
|
||||
}
|
||||
|
||||
func (c *EncoderCache) Close() {
|
||||
c.cacheCtx.Close()
|
||||
}
|
||||
|
||||
func (c *EncoderCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
|
||||
// The image is always in the first position
|
||||
c.curPos = positions[0]
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *EncoderCache) SetLayer(layer int) {
|
||||
if layer >= len(c.keys) {
|
||||
c.keys = append(c.keys, make([]ml.Tensor, layer-len(c.keys)+1)...)
|
||||
c.values = append(c.values, make([]ml.Tensor, layer-len(c.values)+1)...)
|
||||
}
|
||||
|
||||
c.curLayer = layer
|
||||
}
|
||||
|
||||
func (c *EncoderCache) EncoderCached() bool {
|
||||
return c.encoderCached
|
||||
}
|
||||
|
||||
func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
|
||||
return c.keys[c.curLayer], c.values[c.curLayer], nil
|
||||
}
|
||||
|
||||
func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
c.encoderPos = c.curPos
|
||||
c.encoderCached = true
|
||||
|
||||
if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
|
||||
c.keys[c.curLayer] = c.cacheCtx.Zeros(key.DType(), key.Shape()...)
|
||||
c.values[c.curLayer] = c.cacheCtx.Zeros(value.DType(), value.Shape()...)
|
||||
}
|
||||
|
||||
ctx.Forward(key.Copy(ctx, c.keys[c.curLayer]))
|
||||
ctx.Forward(value.Copy(ctx, c.values[c.curLayer]))
|
||||
}
|
||||
|
||||
func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
|
||||
panic("encoder cache does not support multiple sequences")
|
||||
}
|
||||
|
||||
func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
|
||||
if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
|
||||
c.encoderCached = false
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
93
kvcache/wrapper.go
Normal file
93
kvcache/wrapper.go
Normal file
@@ -0,0 +1,93 @@
|
||||
package kvcache
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
// Wrapper cache is a container for multiple types of caches,
|
||||
// such as for the encoding and decoding portions of a model.
|
||||
type WrapperCache struct {
|
||||
// caches we are wrapping
|
||||
caches []Cache
|
||||
|
||||
// cache to be used for this layer
|
||||
curType int
|
||||
}
|
||||
|
||||
func NewWrapperCache(caches ...Cache) *WrapperCache {
|
||||
return &WrapperCache{
|
||||
caches: caches,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
|
||||
for _, cache := range c.caches {
|
||||
cache.Init(backend, dtype, capacity)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *WrapperCache) Close() {
|
||||
for _, cache := range c.caches {
|
||||
cache.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func (c *WrapperCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
|
||||
for i, cache := range c.caches {
|
||||
err := cache.StartForward(ctx, positions, seqs)
|
||||
if err != nil {
|
||||
// unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
|
||||
for j := i - 1; j >= 0; j-- {
|
||||
for k := range positions {
|
||||
_ = c.caches[j].Remove(seqs[k], positions[k], math.MaxInt32)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
c.curType = 0
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *WrapperCache) SetLayer(layer int) {
|
||||
for _, cache := range c.caches {
|
||||
cache.SetLayer(layer)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *WrapperCache) SetLayerType(layerType int) {
|
||||
c.curType = layerType
|
||||
}
|
||||
|
||||
func (c *WrapperCache) UnderlyingCache() Cache {
|
||||
return c.caches[c.curType]
|
||||
}
|
||||
|
||||
func (c *WrapperCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
|
||||
return c.caches[c.curType].Get(ctx)
|
||||
}
|
||||
|
||||
func (c *WrapperCache) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
c.caches[c.curType].Put(ctx, key, value)
|
||||
}
|
||||
|
||||
func (c *WrapperCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
|
||||
for _, cache := range c.caches {
|
||||
cache.CopyPrefix(srcSeq, dstSeq, len)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *WrapperCache) Remove(seq int, beginIndex, endIndex int32) error {
|
||||
// If the one of these fails, the caller is supposed to retry with endIndex set to math.MaxInt32, which should not fail
|
||||
for _, cache := range c.caches {
|
||||
err := cache.Remove(seq, beginIndex, endIndex)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -8,7 +8,7 @@ Subject: [PATCH] sort devices by score
|
||||
1 file changed, 13 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 899d16f2..ac5cda07 100644
|
||||
index 899d16f2..135f7df0 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
|
||||
@@ -29,7 +29,7 @@ index 899d16f2..ac5cda07 100644
|
||||
if (!reg) {
|
||||
return;
|
||||
}
|
||||
@@ -206,15 +206,15 @@ struct ggml_backend_registry {
|
||||
@@ -206,15 +206,20 @@ struct ggml_backend_registry {
|
||||
#endif
|
||||
backends.push_back({ reg, std::move(handle) });
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||
@@ -45,10 +45,15 @@ index 899d16f2..ac5cda07 100644
|
||||
#endif
|
||||
- devices.push_back(device);
|
||||
+ devices.push_back({device, score});
|
||||
+ std::stable_sort(devices.begin(), devices.end(),
|
||||
+ [](const auto & a, const auto & b) {
|
||||
+ return a.second > b.second;
|
||||
+ }
|
||||
+ );
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
||||
@@ -257,7 +257,7 @@ struct ggml_backend_registry {
|
||||
@@ -257,7 +262,7 @@ struct ggml_backend_registry {
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
||||
|
||||
@@ -57,7 +62,7 @@ index 899d16f2..ac5cda07 100644
|
||||
|
||||
return reg;
|
||||
}
|
||||
@@ -280,7 +280,7 @@ struct ggml_backend_registry {
|
||||
@@ -280,7 +285,7 @@ struct ggml_backend_registry {
|
||||
// remove devices
|
||||
devices.erase(
|
||||
std::remove_if(devices.begin(), devices.end(),
|
||||
@@ -66,17 +71,12 @@ index 899d16f2..ac5cda07 100644
|
||||
devices.end());
|
||||
|
||||
// remove backend
|
||||
@@ -338,7 +338,12 @@ size_t ggml_backend_dev_count() {
|
||||
@@ -338,7 +343,7 @@ size_t ggml_backend_dev_count() {
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||
- return get_reg().devices[index];
|
||||
+ auto devices = get_reg().devices;
|
||||
+ if (!std::is_heap(devices.begin(), devices.end())) {
|
||||
+ std::make_heap(devices.begin(), devices.end(), [](const auto & a, const auto & b) { return a.second < b.second; });
|
||||
+ }
|
||||
+
|
||||
+ return devices[index].first;
|
||||
+ return get_reg().devices[index].first;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
||||
|
||||
@@ -8,7 +8,7 @@ Subject: [PATCH] try/catch backend load
|
||||
1 file changed, 23 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index ac5cda07..374c3b21 100644
|
||||
index 135f7df0..84b21dd8 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -512,32 +512,33 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
|
||||
@@ -0,0 +1,285 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: jmorganca <jmorganca@gmail.com>
|
||||
Date: Sun, 16 Feb 2025 20:00:22 -0500
|
||||
Subject: [PATCH] use std::filesystem::path instead of wstring
|
||||
|
||||
---
|
||||
ggml/src/ggml-backend-reg.cpp | 116 ++++++++++++----------------------
|
||||
1 file changed, 40 insertions(+), 76 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 84b21dd8..de78feae 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -72,16 +72,6 @@
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
-static std::wstring utf8_to_utf16(const std::string & str) {
|
||||
- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
- return converter.from_bytes(str);
|
||||
-}
|
||||
-
|
||||
-static std::string utf16_to_utf8(const std::wstring & str) {
|
||||
- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
- return converter.to_bytes(str);
|
||||
-}
|
||||
-
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic pop
|
||||
#endif
|
||||
@@ -96,12 +86,12 @@ struct dl_handle_deleter {
|
||||
}
|
||||
};
|
||||
|
||||
-static dl_handle * dl_load_library(const std::wstring & path) {
|
||||
+static dl_handle * dl_load_library(const std::filesystem::path & path) {
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
- HMODULE handle = LoadLibraryW(path.c_str());
|
||||
+ HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
@@ -129,8 +119,8 @@ struct dl_handle_deleter {
|
||||
}
|
||||
};
|
||||
|
||||
-static void * dl_load_library(const std::wstring & path) {
|
||||
- dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
+static void * dl_load_library(const std::filesystem::path & path) {
|
||||
+ dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
|
||||
return handle;
|
||||
}
|
||||
@@ -222,11 +212,11 @@ struct ggml_backend_registry {
|
||||
);
|
||||
}
|
||||
|
||||
- ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
||||
+ ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path.string().c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -234,7 +224,7 @@ struct ggml_backend_registry {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn && score_fn() == 0) {
|
||||
if (!silent) {
|
||||
- GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
|
||||
+ GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path.string().c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -242,7 +232,7 @@ struct ggml_backend_registry {
|
||||
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||
if (!backend_init_fn) {
|
||||
if (!silent) {
|
||||
- GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
+ GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path.string().c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -251,16 +241,16 @@ struct ggml_backend_registry {
|
||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||
if (!silent) {
|
||||
if (!reg) {
|
||||
- GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
|
||||
+ GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path.string().c_str());
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||
- __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
+ __func__, path.string().c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
- GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
||||
+ GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path.string().c_str());
|
||||
|
||||
register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
|
||||
|
||||
@@ -396,14 +386,14 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
|
||||
// Dynamic loading
|
||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||
- return get_reg().load_backend(utf8_to_utf16(path), false);
|
||||
+ return get_reg().load_backend(path, false);
|
||||
}
|
||||
|
||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||
get_reg().unload_backend(reg, true);
|
||||
}
|
||||
|
||||
-static std::wstring get_executable_path() {
|
||||
+static std::filesystem::path get_executable_path() {
|
||||
#if defined(__APPLE__)
|
||||
// get executable path
|
||||
std::vector<char> path;
|
||||
@@ -415,15 +405,9 @@ static std::wstring get_executable_path() {
|
||||
}
|
||||
path.resize(size);
|
||||
}
|
||||
- std::string base_path(path.data(), size);
|
||||
- // remove executable name
|
||||
- auto last_slash = base_path.find_last_of('/');
|
||||
- if (last_slash != std::string::npos) {
|
||||
- base_path = base_path.substr(0, last_slash);
|
||||
- }
|
||||
- return utf8_to_utf16(base_path + "/");
|
||||
+
|
||||
+ return std::filesystem::path(path.data()).parent_path();
|
||||
#elif defined(__linux__) || defined(__FreeBSD__)
|
||||
- std::string base_path = ".";
|
||||
std::vector<char> path(1024);
|
||||
while (true) {
|
||||
// get executable path
|
||||
@@ -436,76 +420,56 @@ static std::wstring get_executable_path() {
|
||||
break;
|
||||
}
|
||||
if (len < (ssize_t) path.size()) {
|
||||
- base_path = std::string(path.data(), len);
|
||||
- // remove executable name
|
||||
- auto last_slash = base_path.find_last_of('/');
|
||||
- if (last_slash != std::string::npos) {
|
||||
- base_path = base_path.substr(0, last_slash);
|
||||
- }
|
||||
- break;
|
||||
+ return std::filesystem::path(path.data()).parent_path();
|
||||
}
|
||||
path.resize(path.size() * 2);
|
||||
}
|
||||
-
|
||||
- return utf8_to_utf16(base_path + "/");
|
||||
#elif defined(_WIN32)
|
||||
std::vector<wchar_t> path(MAX_PATH);
|
||||
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
||||
if (len == 0) {
|
||||
return {};
|
||||
}
|
||||
- std::wstring base_path(path.data(), len);
|
||||
- // remove executable name
|
||||
- auto last_slash = base_path.find_last_of('\\');
|
||||
- if (last_slash != std::string::npos) {
|
||||
- base_path = base_path.substr(0, last_slash);
|
||||
- }
|
||||
- return base_path + L"\\";
|
||||
-#else
|
||||
- return {};
|
||||
-#endif
|
||||
-}
|
||||
|
||||
-static std::wstring backend_filename_prefix() {
|
||||
-#ifdef _WIN32
|
||||
- return L"ggml-";
|
||||
+ return std::filesystem::path(path.data()).parent_path();
|
||||
#else
|
||||
- return L"libggml-";
|
||||
+ return {};
|
||||
#endif
|
||||
}
|
||||
|
||||
-static std::wstring backend_filename_suffix() {
|
||||
+static std::string backend_filename_prefix() {
|
||||
#ifdef _WIN32
|
||||
- return L".dll";
|
||||
+ return "ggml-";
|
||||
#else
|
||||
- return L".so";
|
||||
+ return "libggml-";
|
||||
#endif
|
||||
}
|
||||
|
||||
-static std::wstring path_separator() {
|
||||
+static std::string backend_filename_suffix() {
|
||||
#ifdef _WIN32
|
||||
- return L"\\";
|
||||
+ return ".dll";
|
||||
#else
|
||||
- return L"/";
|
||||
+ return ".so";
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
||||
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||
// TODO: search system paths
|
||||
- std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
|
||||
- std::vector<std::wstring> search_paths;
|
||||
+ namespace fs = std::filesystem;
|
||||
+ std::string file_prefix = backend_filename_prefix() + name + "-";
|
||||
+ std::vector<fs::path> search_paths;
|
||||
+
|
||||
if (user_search_path == nullptr) {
|
||||
- search_paths.push_back(L"." + path_separator());
|
||||
+ search_paths.push_back(fs::current_path());
|
||||
search_paths.push_back(get_executable_path());
|
||||
} else {
|
||||
- search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
|
||||
+ search_paths.push_back(fs::u8path(user_search_path));
|
||||
}
|
||||
|
||||
int best_score = 0;
|
||||
- std::wstring best_path;
|
||||
+ fs::path best_path;
|
||||
|
||||
- namespace fs = std::filesystem;
|
||||
for (const auto & search_path : search_paths) {
|
||||
if (!fs::exists(search_path)) {
|
||||
continue;
|
||||
@@ -514,31 +478,31 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
for (const auto & entry : dir_it) {
|
||||
try {
|
||||
if (entry.is_regular_file()) {
|
||||
- std::wstring filename = entry.path().filename().wstring();
|
||||
- std::wstring ext = entry.path().extension().wstring();
|
||||
+ std::string filename = entry.path().filename().string();
|
||||
+ std::string ext = entry.path().extension().string();
|
||||
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
||||
- dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
|
||||
+ dl_handle_ptr handle { dl_load_library(entry.path()) };
|
||||
if (!handle) {
|
||||
- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (!score_fn) {
|
||||
- GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
+ GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
int s = score_fn();
|
||||
- GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
|
||||
+ GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
|
||||
if (s > best_score) {
|
||||
best_score = s;
|
||||
- best_path = entry.path().wstring();
|
||||
+ best_path = entry.path();
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
- GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), e.what());
|
||||
+ GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, entry.path().string().c_str(), e.what());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -546,7 +510,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
if (best_score == 0) {
|
||||
// try to load the base backend
|
||||
for (const auto & search_path : search_paths) {
|
||||
- std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
|
||||
+ fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
|
||||
if (fs::exists(path)) {
|
||||
return get_reg().load_backend(path, silent);
|
||||
}
|
||||
149
llm/ggla.go
149
llm/ggla.go
@@ -1,149 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"io"
|
||||
"slices"
|
||||
)
|
||||
|
||||
type containerGGLA struct {
|
||||
version uint32
|
||||
}
|
||||
|
||||
func (c *containerGGLA) Name() string {
|
||||
return "ggla"
|
||||
}
|
||||
|
||||
func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
|
||||
if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch c.version {
|
||||
case 1:
|
||||
default:
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
model := newGGLA(c)
|
||||
err := model.decode(rs)
|
||||
return model, err
|
||||
}
|
||||
|
||||
type ggla struct {
|
||||
*containerGGLA
|
||||
|
||||
kv KV
|
||||
tensors []*Tensor
|
||||
|
||||
tensorOffset uint64
|
||||
}
|
||||
|
||||
func newGGLA(container *containerGGLA) *ggla {
|
||||
return &ggla{
|
||||
containerGGLA: container,
|
||||
kv: make(KV),
|
||||
}
|
||||
}
|
||||
|
||||
func (llm *ggla) KV() KV {
|
||||
return llm.kv
|
||||
}
|
||||
|
||||
func (llm *ggla) Tensors() *Tensors {
|
||||
return &Tensors{
|
||||
Items: llm.tensors,
|
||||
Offset: llm.tensorOffset,
|
||||
}
|
||||
}
|
||||
|
||||
func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
|
||||
var r uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
|
||||
return err
|
||||
}
|
||||
llm.kv["r"] = r
|
||||
|
||||
var alpha uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
|
||||
return err
|
||||
}
|
||||
llm.kv["alpha"] = alpha
|
||||
|
||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
llm.tensorOffset = uint64(offset)
|
||||
|
||||
for {
|
||||
var dims uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if errors.Is(retErr, io.EOF) {
|
||||
retErr = io.ErrUnexpectedEOF
|
||||
}
|
||||
}()
|
||||
|
||||
var namesize uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var t Tensor
|
||||
if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
t.Shape = make([]uint64, dims)
|
||||
for i := 0; uint32(i) < dims; i++ {
|
||||
var shape32 uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
t.Shape[i] = uint64(shape32)
|
||||
}
|
||||
|
||||
// ggla tensor shape is reversed
|
||||
// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
|
||||
slices.Reverse(t.Shape)
|
||||
|
||||
name := make([]byte, namesize)
|
||||
if err := binary.Read(rs, binary.LittleEndian, &name); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
t.Name = string(name)
|
||||
|
||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
offset, err = rs.Seek(0, io.SeekCurrent)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
t.Offset = uint64(offset)
|
||||
|
||||
if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
llm.tensors = append(llm.tensors, &t)
|
||||
}
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
package llm
|
||||
@@ -11,18 +11,19 @@ import (
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
// This algorithm looks for a complete fit to determine if we need to unload other models
|
||||
func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
|
||||
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
|
||||
// Split up the GPUs by type and try them
|
||||
var estimatedVRAM uint64
|
||||
for _, gpus := range allGpus.ByLibrary() {
|
||||
var layerCount int
|
||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
estimate := EstimateGPULayers(gpus, f, projectors, opts)
|
||||
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
|
||||
if opts.NumGPU < 0 {
|
||||
if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
|
||||
if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
|
||||
return true, estimatedVRAM
|
||||
}
|
||||
} else {
|
||||
@@ -70,7 +71,7 @@ type MemoryEstimate struct {
|
||||
|
||||
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
|
||||
// The GPUs provided must all be the same Library
|
||||
func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
|
||||
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate {
|
||||
// Graph size for a partial offload, applies to all GPUs
|
||||
var graphPartialOffload uint64
|
||||
|
||||
@@ -115,33 +116,31 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
|
||||
layers := ggml.Tensors().Layers()
|
||||
layers := f.Tensors().GroupLayers()
|
||||
// add one layer worth of memory as a buffer
|
||||
if blk0, ok := layers["blk.0"]; ok {
|
||||
layerSize = blk0.size()
|
||||
layerSize = blk0.Size()
|
||||
} else {
|
||||
slog.Warn("model missing blk.0 layer size")
|
||||
}
|
||||
|
||||
fa := envconfig.FlashAttention() &&
|
||||
discover.GetGPUInfo().FlashAttentionSupported() &&
|
||||
ggml.SupportsFlashAttention()
|
||||
|
||||
var kvct string
|
||||
if fa {
|
||||
if envconfig.FlashAttention() &&
|
||||
discover.GetGPUInfo().FlashAttentionSupported() &&
|
||||
f.SupportsFlashAttention() {
|
||||
requested := strings.ToLower(envconfig.KvCacheType())
|
||||
if requested != "" && ggml.SupportsKVCacheType(requested) {
|
||||
if requested != "" && f.SupportsKVCacheType(requested) {
|
||||
kvct = requested
|
||||
}
|
||||
}
|
||||
|
||||
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
|
||||
kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
|
||||
|
||||
// KV is proportional to the number of layers
|
||||
layerSize += kv / ggml.KV().BlockCount()
|
||||
layerSize += kv / f.KV().BlockCount()
|
||||
|
||||
if graphPartialOffload == 0 {
|
||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||
graphPartialOffload = f.KV().GQA() * kv / 6
|
||||
}
|
||||
if graphFullOffload == 0 {
|
||||
graphFullOffload = graphPartialOffload
|
||||
@@ -156,12 +155,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
}
|
||||
|
||||
if layer, ok := layers["output_norm"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
memoryLayerOutput += layer.Size()
|
||||
}
|
||||
if layer, ok := layers["output"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
memoryLayerOutput += layer.Size()
|
||||
} else if layer, ok := layers["token_embd"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
memoryLayerOutput += layer.Size()
|
||||
}
|
||||
|
||||
// Output layer handled at the end if we have space
|
||||
@@ -211,11 +210,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
}
|
||||
|
||||
// For all the layers, find where they can fit on the GPU(s)
|
||||
for i := range int(ggml.KV().BlockCount()) {
|
||||
for i := range int(f.KV().BlockCount()) {
|
||||
// Some models have inconsistent layer sizes
|
||||
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
|
||||
layerSize = blk.size()
|
||||
layerSize += kv / ggml.KV().BlockCount()
|
||||
layerSize = blk.Size()
|
||||
layerSize += kv / f.KV().BlockCount()
|
||||
}
|
||||
memoryWeights += layerSize
|
||||
|
||||
@@ -238,10 +237,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
}
|
||||
}
|
||||
}
|
||||
if layerCount >= int(ggml.KV().BlockCount()) {
|
||||
if layerCount >= int(f.KV().BlockCount()) {
|
||||
fullyLoaded = true
|
||||
} else {
|
||||
for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
|
||||
for i := layerCount; i < int(f.KV().BlockCount()); i++ {
|
||||
overflow += layerSize
|
||||
}
|
||||
}
|
||||
@@ -259,7 +258,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
}
|
||||
}
|
||||
|
||||
if layerCount < int(ggml.KV().BlockCount())+1 {
|
||||
if layerCount < int(f.KV().BlockCount())+1 {
|
||||
fullyLoaded = false
|
||||
overflow += memoryLayerOutput
|
||||
}
|
||||
@@ -311,7 +310,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
|
||||
inferenceLibrary: gpus[0].Library,
|
||||
layersRequested: opts.NumGPU,
|
||||
layersModel: int(ggml.KV().BlockCount()) + 1,
|
||||
layersModel: int(f.KV().BlockCount()) + 1,
|
||||
availableList: availableList,
|
||||
kv: kv,
|
||||
allocationsList: allocationsList,
|
||||
@@ -339,22 +338,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
|
||||
return estimate
|
||||
}
|
||||
|
||||
func (m MemoryEstimate) log() {
|
||||
overhead := envconfig.GpuOverhead()
|
||||
|
||||
log := slog.With()
|
||||
if m.projectorWeights > 0 {
|
||||
log = log.With(
|
||||
slog.Group(
|
||||
"projector",
|
||||
"weights", format.HumanBytes2(m.projectorWeights),
|
||||
"graph", format.HumanBytes2(m.projectorGraph),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
log.Info(
|
||||
"offload to "+m.inferenceLibrary,
|
||||
func (m MemoryEstimate) LogValue() slog.Value {
|
||||
attrs := []slog.Attr{
|
||||
slog.String("library", m.inferenceLibrary),
|
||||
slog.Group(
|
||||
"layers",
|
||||
// requested number of layers to offload
|
||||
@@ -370,7 +356,7 @@ func (m MemoryEstimate) log() {
|
||||
"memory",
|
||||
// memory available by GPU for offloading
|
||||
"available", m.availableList,
|
||||
"gpu_overhead", format.HumanBytes2(overhead),
|
||||
"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
|
||||
slog.Group(
|
||||
"required",
|
||||
// memory required for full offloading
|
||||
@@ -399,7 +385,17 @@ func (m MemoryEstimate) log() {
|
||||
"partial", format.HumanBytes2(m.graphPartialOffload),
|
||||
),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
if m.projectorWeights > 0 {
|
||||
attrs = append(attrs, slog.Group(
|
||||
"projector",
|
||||
"weights", format.HumanBytes2(m.projectorWeights),
|
||||
"graph", format.HumanBytes2(m.projectorGraph),
|
||||
))
|
||||
}
|
||||
|
||||
return slog.GroupValue(attrs...)
|
||||
}
|
||||
|
||||
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
|
||||
@@ -409,13 +405,13 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ggml, _, err := DecodeGGML(file, 0)
|
||||
ggml, _, err := ggml.Decode(file, 0)
|
||||
if err != nil {
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
for _, layer := range ggml.Tensors().Layers() {
|
||||
weights += layer.size()
|
||||
for _, layer := range ggml.Tensors().GroupLayers() {
|
||||
weights += layer.Size()
|
||||
}
|
||||
|
||||
switch arch := ggml.KV().Architecture(); arch {
|
||||
@@ -435,7 +431,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
|
||||
headCount := kv("attention.head_count")
|
||||
|
||||
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
|
||||
if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
|
||||
if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
|
||||
numPatches++
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
func TestEstimateGPULayers(t *testing.T) {
|
||||
@@ -23,7 +24,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
defer f.Close()
|
||||
inputLayerCount := 5
|
||||
|
||||
tensors := []Tensor{
|
||||
tensors := []ggml.Tensor{
|
||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
@@ -32,7 +33,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
}
|
||||
assert.Len(t, tensors, inputLayerCount+1)
|
||||
err = WriteGGUF(f, KV{
|
||||
err = ggml.WriteGGUF(f, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.context_length": uint32(32),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
|
||||
@@ -28,6 +28,7 @@ import (
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llama"
|
||||
)
|
||||
|
||||
@@ -71,7 +72,7 @@ type llmServer struct {
|
||||
// It collects array values for arrays with a size less than or equal to
|
||||
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
||||
// the maxArraySize is negative, all arrays are collected.
|
||||
func LoadModel(model string, maxArraySize int) (*GGML, error) {
|
||||
func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -82,21 +83,17 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
ggml, _, err := DecodeGGML(f, maxArraySize)
|
||||
ggml, _, err := ggml.Decode(f, maxArraySize)
|
||||
return ggml, err
|
||||
}
|
||||
|
||||
// NewLlamaServer will run a server for the given GPUs
|
||||
// The gpu list must be a single family.
|
||||
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
||||
var systemTotalMemory uint64
|
||||
var systemFreeMemory uint64
|
||||
var systemSwapFreeMemory uint64
|
||||
|
||||
func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
||||
systemInfo := discover.GetSystemInfo()
|
||||
systemTotalMemory = systemInfo.System.TotalMemory
|
||||
systemFreeMemory = systemInfo.System.FreeMemory
|
||||
systemSwapFreeMemory = systemInfo.System.FreeSwap
|
||||
systemTotalMemory := systemInfo.System.TotalMemory
|
||||
systemFreeMemory := systemInfo.System.FreeMemory
|
||||
systemSwapFreeMemory := systemInfo.System.FreeSwap
|
||||
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
|
||||
|
||||
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
||||
@@ -104,7 +101,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
gpus = discover.GetCPUInfo()
|
||||
}
|
||||
|
||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
estimate := EstimateGPULayers(gpus, f, projectors, opts)
|
||||
if len(gpus) > 1 || gpus[0].Library != "cpu" {
|
||||
switch {
|
||||
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
|
||||
@@ -130,7 +127,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
}
|
||||
}
|
||||
|
||||
estimate.log()
|
||||
slog.Info("offload", "", estimate)
|
||||
|
||||
params := []string{
|
||||
"--model", model,
|
||||
@@ -174,7 +171,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
fa = false
|
||||
}
|
||||
|
||||
if fa && !ggml.SupportsFlashAttention() {
|
||||
if fa && !f.SupportsFlashAttention() {
|
||||
slog.Warn("flash attention enabled but not supported by model")
|
||||
fa = false
|
||||
}
|
||||
@@ -187,7 +184,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
|
||||
// Flash Attention also supports kv cache quantization
|
||||
// Enable if the requested and kv cache type is supported by the model
|
||||
if kvct != "" && ggml.SupportsKVCacheType(kvct) {
|
||||
if kvct != "" && f.SupportsKVCacheType(kvct) {
|
||||
params = append(params, "--kv-cache-type", kvct)
|
||||
} else {
|
||||
slog.Warn("kv cache type not supported by model", "type", kvct)
|
||||
@@ -200,7 +197,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
for _, g := range gpus {
|
||||
if g.Library == "metal" &&
|
||||
uint64(opts.NumGPU) > 0 &&
|
||||
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
|
||||
uint64(opts.NumGPU) < f.KV().BlockCount()+1 {
|
||||
opts.UseMMap = new(bool)
|
||||
*opts.UseMMap = false
|
||||
}
|
||||
@@ -278,6 +275,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
|
||||
}
|
||||
finalParams := []string{"runner"}
|
||||
if envconfig.NewEngine() {
|
||||
finalParams = append(finalParams, "--ollama-engine")
|
||||
}
|
||||
finalParams = append(finalParams, params...)
|
||||
finalParams = append(finalParams, "--port", strconv.Itoa(port))
|
||||
|
||||
@@ -320,9 +320,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
return nil, fmt.Errorf("unable to lookup executable path: %w", err)
|
||||
}
|
||||
|
||||
exe, err = filepath.EvalSymlinks(exe)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err)
|
||||
if eval, err := filepath.EvalSymlinks(exe); err == nil {
|
||||
exe = eval
|
||||
}
|
||||
|
||||
// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
|
||||
@@ -335,7 +334,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
||||
estimate: estimate,
|
||||
numParallel: numParallel,
|
||||
sem: semaphore.NewWeighted(int64(numParallel)),
|
||||
totalLayers: ggml.KV().BlockCount() + 1,
|
||||
totalLayers: f.KV().BlockCount() + 1,
|
||||
gpus: gpus,
|
||||
done: make(chan error, 1),
|
||||
}
|
||||
|
||||
206
ml/backend.go
Normal file
206
ml/backend.go
Normal file
@@ -0,0 +1,206 @@
|
||||
package ml
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type Config interface {
|
||||
Architecture() string
|
||||
String(string, ...string) string
|
||||
Uint(string, ...uint32) uint32
|
||||
Float(string, ...float32) float32
|
||||
|
||||
Strings(string, ...[]string) []string
|
||||
Uints(string, ...[]uint32) []uint32
|
||||
}
|
||||
|
||||
type Backend interface {
|
||||
Config() Config
|
||||
Get(name string) Tensor
|
||||
NewContext() Context
|
||||
SystemInfo() string
|
||||
}
|
||||
|
||||
var backends = make(map[string]func(*os.File) (Backend, error))
|
||||
|
||||
func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
|
||||
if _, ok := backends[name]; ok {
|
||||
panic("backend: backend already registered")
|
||||
}
|
||||
|
||||
backends[name] = f
|
||||
}
|
||||
|
||||
func NewBackend(f *os.File) (Backend, error) {
|
||||
if backend, ok := backends["ggml"]; ok {
|
||||
return backend(f)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported backend")
|
||||
}
|
||||
|
||||
type Context interface {
|
||||
Zeros(dtype DType, shape ...int) Tensor
|
||||
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
|
||||
FromIntSlice(s []int32, shape ...int) (Tensor, error)
|
||||
|
||||
Forward(Tensor)
|
||||
Compute(...Tensor)
|
||||
MaxTensors() int
|
||||
Close()
|
||||
}
|
||||
|
||||
type Tensor interface {
|
||||
Dim(n int) int
|
||||
Stride(n int) int
|
||||
|
||||
Shape() []int
|
||||
DType() DType
|
||||
|
||||
Bytes() []byte
|
||||
Floats() []float32
|
||||
|
||||
Add(ctx Context, t2 Tensor) Tensor
|
||||
Mul(ctx Context, t2 Tensor) Tensor
|
||||
Mulmat(ctx Context, t2 Tensor) Tensor
|
||||
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
|
||||
|
||||
Softmax(ctx Context) Tensor
|
||||
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
|
||||
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
|
||||
Scale(ctx Context, s float64) Tensor
|
||||
|
||||
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
|
||||
|
||||
Tanh(ctx Context) Tensor
|
||||
GELU(ctx Context) Tensor
|
||||
SILU(ctx Context) Tensor
|
||||
|
||||
Reshape(ctx Context, shape ...int) Tensor
|
||||
View(ctx Context, offset int, shape ...int) Tensor
|
||||
Permute(ctx Context, shape ...int) Tensor
|
||||
Contiguous(ctx Context) Tensor
|
||||
|
||||
Pad(ctx Context, shape ...int) Tensor
|
||||
Unpad(ctx Context, shape ...int) Tensor
|
||||
|
||||
Stack(ctx Context, dim int, s ...Tensor) Tensor
|
||||
Concat(ctx Context, t2 Tensor, dim int) Tensor
|
||||
Rows(ctx Context, t2 Tensor) Tensor
|
||||
Copy(ctx Context, t2 Tensor) Tensor
|
||||
}
|
||||
|
||||
type number interface {
|
||||
~int | ~int8 | ~int16 | ~int32 | ~int64 |
|
||||
~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
|
||||
~float32 | ~float64 |
|
||||
~complex64 | ~complex128
|
||||
}
|
||||
|
||||
func mul[T number](s ...T) T {
|
||||
p := T(1)
|
||||
for _, v := range s {
|
||||
p *= v
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
type DumpOptions struct {
|
||||
// Items is the number of elements to print at the beginning and end of each dimension.
|
||||
Items int
|
||||
|
||||
// Precision is the number of decimal places to print. Applies to float32 and float64.
|
||||
Precision int
|
||||
}
|
||||
|
||||
func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
|
||||
if len(opts) < 1 {
|
||||
opts = append(opts, DumpOptions{
|
||||
Items: 3,
|
||||
Precision: 4,
|
||||
})
|
||||
}
|
||||
|
||||
switch t.DType() {
|
||||
case DTypeF32:
|
||||
return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
|
||||
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
|
||||
})
|
||||
case DTypeF16:
|
||||
f32 := ctx.Zeros(DTypeF32, t.Shape()...)
|
||||
f32 = t.Copy(ctx, f32)
|
||||
return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
|
||||
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
|
||||
})
|
||||
case DTypeI32:
|
||||
return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string {
|
||||
return strconv.FormatInt(int64(i), 10)
|
||||
})
|
||||
default:
|
||||
return "<unsupported>"
|
||||
}
|
||||
}
|
||||
|
||||
func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string) string {
|
||||
if t.Bytes() == nil {
|
||||
ctx.Forward(t)
|
||||
ctx.Compute(t)
|
||||
}
|
||||
|
||||
s := make(S, mul(t.Shape()...))
|
||||
if err := binary.Read(bytes.NewBuffer(t.Bytes()), binary.LittleEndian, &s); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
shape := t.Shape()
|
||||
|
||||
var sb strings.Builder
|
||||
var f func([]int, int)
|
||||
f = func(dims []int, stride int) {
|
||||
prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
|
||||
fmt.Fprint(&sb, "[")
|
||||
defer func() { fmt.Fprint(&sb, "]") }()
|
||||
for i := 0; i < dims[0]; i++ {
|
||||
if i >= items && i < dims[0]-items {
|
||||
fmt.Fprint(&sb, "..., ")
|
||||
// skip to next printable element
|
||||
skip := dims[0] - 2*items
|
||||
if len(dims) > 1 {
|
||||
stride += mul(append(dims[1:], skip)...)
|
||||
fmt.Fprint(&sb, strings.Repeat("\n", len(dims)-1), prefix)
|
||||
}
|
||||
i += skip - 1
|
||||
} else if len(dims) > 1 {
|
||||
f(dims[1:], stride)
|
||||
stride += mul(dims[1:]...)
|
||||
if i < dims[0]-1 {
|
||||
fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
|
||||
}
|
||||
} else {
|
||||
fmt.Fprint(&sb, fn(s[stride+i]))
|
||||
if i < dims[0]-1 {
|
||||
fmt.Fprint(&sb, ", ")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
f(shape, 0)
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
type DType int
|
||||
|
||||
const (
|
||||
DTypeOther DType = iota
|
||||
DTypeF32
|
||||
DTypeF16
|
||||
DTypeI32
|
||||
)
|
||||
5
ml/backend/backend.go
Normal file
5
ml/backend/backend.go
Normal file
@@ -0,0 +1,5 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
_ "github.com/ollama/ollama/ml/backend/ggml"
|
||||
)
|
||||
675
ml/backend/ggml/ggml.go
Normal file
675
ml/backend/ggml/ggml.go
Normal file
@@ -0,0 +1,675 @@
|
||||
package ggml
|
||||
|
||||
/*
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);}
|
||||
static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];}
|
||||
|
||||
typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
|
||||
COMPILER inline get_compiler() {
|
||||
#if defined(__clang__)
|
||||
return COMP_CLANG;
|
||||
#elif defined(__GNUC__)
|
||||
return COMP_GCC;
|
||||
#else
|
||||
return UNKNOWN_COMPILER;
|
||||
#endif
|
||||
}
|
||||
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
fs "github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"golang.org/x/sync/errgroup"
|
||||
|
||||
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
|
||||
)
|
||||
|
||||
type device struct {
|
||||
d *C.struct_ggml_backend_device
|
||||
}
|
||||
|
||||
func (d device) LogValue() slog.Value {
|
||||
var free, total uint64
|
||||
C.ggml_backend_dev_memory(d.d, (*C.size_t)(&free), (*C.size_t)(&total))
|
||||
|
||||
kind := "unknown"
|
||||
switch C.ggml_backend_dev_type(d.d) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
kind = "cpu"
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
kind = "gpu"
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
kind = "accel"
|
||||
}
|
||||
|
||||
return slog.GroupValue(
|
||||
slog.String("name", C.GoString(C.ggml_backend_dev_name(d.d))),
|
||||
slog.String("description", C.GoString(C.ggml_backend_dev_description(d.d))),
|
||||
slog.String("kind", kind),
|
||||
slog.String("free", format.HumanBytes2(free)),
|
||||
slog.String("total", format.HumanBytes2(total)),
|
||||
)
|
||||
}
|
||||
|
||||
var devices = sync.OnceValue(func() []device {
|
||||
ggml.OnceLoad()
|
||||
|
||||
s := make([]device, C.ggml_backend_dev_count())
|
||||
for i := range s {
|
||||
s[i] = device{C.ggml_backend_dev_get(C.size_t(i))}
|
||||
}
|
||||
|
||||
return s
|
||||
})
|
||||
|
||||
type Backend struct {
|
||||
meta *fs.GGML
|
||||
cpus, gpus []Context
|
||||
tensors map[string]*Context
|
||||
}
|
||||
|
||||
func New(r *os.File) (ml.Backend, error) {
|
||||
meta, n, err := fs.Decode(r, -1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Info(
|
||||
"",
|
||||
"architecture", meta.KV().Architecture(),
|
||||
"file_type", meta.KV().FileType(),
|
||||
"name", meta.KV().String("general.name"),
|
||||
"description", meta.KV().String("general.description"),
|
||||
"num_tensors", len(meta.Tensors().Items()),
|
||||
"num_key_values", len(meta.KV()),
|
||||
)
|
||||
|
||||
var cpus, gpus []Context
|
||||
for _, d := range devices() {
|
||||
switch C.ggml_backend_dev_type(d.d) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
slog.Info("cpu", "device", d)
|
||||
cpus = append(cpus, Context{
|
||||
ctx: C.ggml_init(C.struct_ggml_init_params{
|
||||
mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
|
||||
no_alloc: true,
|
||||
}),
|
||||
backend: C.ggml_backend_dev_init(d.d, nil),
|
||||
})
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
slog.Info("gpu", "device", d)
|
||||
gpus = append(gpus, Context{
|
||||
ctx: C.ggml_init(C.struct_ggml_init_params{
|
||||
mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
|
||||
no_alloc: true,
|
||||
}),
|
||||
backend: C.ggml_backend_dev_init(d.d, nil),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
ctxFunc := func(s []Context) (*Context, error) {
|
||||
for _, e := range s {
|
||||
return &e, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("no devices available")
|
||||
}
|
||||
|
||||
tensors := make(map[*fs.Tensor]*Context, len(meta.Tensors().Items()))
|
||||
for _, t := range meta.Tensors().Items() {
|
||||
c, err := ctxFunc(append(gpus, cpus...))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
func() {
|
||||
tt := C.ggml_new_tensor(c.ctx, t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0])))
|
||||
|
||||
cname := C.CString(t.Name)
|
||||
defer C.free(unsafe.Pointer(cname))
|
||||
C.ggml_set_name(tt, cname)
|
||||
|
||||
tensors[t] = c
|
||||
}()
|
||||
}
|
||||
|
||||
for _, b := range append(gpus, cpus...) {
|
||||
C.ggml_backend_alloc_ctx_tensors(b.ctx, b.backend)
|
||||
}
|
||||
|
||||
sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
|
||||
|
||||
var g errgroup.Group
|
||||
for t, c := range tensors {
|
||||
g.Go(func() error {
|
||||
bts := make([]byte, t.Size())
|
||||
n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if n != int(t.Size()) {
|
||||
return fmt.Errorf("expected %d bytes, got %d", t.Size(), n)
|
||||
}
|
||||
|
||||
cname := C.CString(t.Name)
|
||||
defer C.free(unsafe.Pointer(cname))
|
||||
|
||||
C.ggml_backend_tensor_set(C.ggml_get_tensor(c.ctx, cname), unsafe.Pointer(&bts[0]), 0, C.size_t(n))
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
if err := g.Wait(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &Backend{
|
||||
meta: meta,
|
||||
cpus: cpus,
|
||||
gpus: gpus,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
ml.RegisterBackend("ggml", New)
|
||||
}
|
||||
|
||||
func (b *Backend) Config() ml.Config {
|
||||
return b.meta.KV()
|
||||
}
|
||||
|
||||
func (b *Backend) Get(name string) ml.Tensor {
|
||||
cname := C.CString(name)
|
||||
defer C.free(unsafe.Pointer(cname))
|
||||
|
||||
for _, c := range append(b.gpus, b.cpus...) {
|
||||
if t := C.ggml_get_tensor(c.ctx, cname); t != nil {
|
||||
return &Tensor{t: t}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *Backend) NewContext() ml.Context {
|
||||
nodes := max(8192, len(b.meta.Tensors().Items())*5)
|
||||
c := C.ggml_init(C.struct_ggml_init_params{
|
||||
mem_buffer: nil,
|
||||
mem_size: C.size_t(nodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(nodes), false),
|
||||
no_alloc: true,
|
||||
})
|
||||
|
||||
backends := make([]*C.struct_ggml_backend, len(b.gpus)+len(b.cpus))
|
||||
bufts := make([]*C.struct_ggml_backend_buffer_type, len(b.gpus)+len(b.cpus))
|
||||
for i, c := range append(b.gpus, b.cpus...) {
|
||||
backends[i] = c.backend
|
||||
bufts[i] = C.ggml_backend_get_default_buffer_type(c.backend)
|
||||
}
|
||||
|
||||
return &Context{
|
||||
ctx: c,
|
||||
backend: backends[0],
|
||||
nodes: nodes,
|
||||
sched: C.ggml_backend_sched_new(
|
||||
(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
|
||||
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
|
||||
C.int(len(backends)),
|
||||
C.size_t(nodes),
|
||||
true,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
type Context struct {
|
||||
ctx *C.struct_ggml_context
|
||||
backend *C.struct_ggml_backend
|
||||
|
||||
sched *C.struct_ggml_backend_sched
|
||||
graph *C.struct_ggml_cgraph
|
||||
nodes int
|
||||
}
|
||||
|
||||
func (c *Context) Forward(t ml.Tensor) {
|
||||
if c.graph == nil {
|
||||
c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.nodes), false)
|
||||
}
|
||||
|
||||
C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
|
||||
}
|
||||
|
||||
func (c *Context) Compute(tensors ...ml.Tensor) {
|
||||
C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
|
||||
|
||||
needSync := true
|
||||
sync := func() {
|
||||
if needSync {
|
||||
C.ggml_backend_sched_synchronize(c.sched)
|
||||
needSync = false
|
||||
}
|
||||
}
|
||||
|
||||
for _, t := range tensors {
|
||||
if C.ggml_nbytes(t.(*Tensor).t) > 0 {
|
||||
t.(*Tensor).sync = sync
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Context) MaxTensors() int {
|
||||
return c.nodes
|
||||
}
|
||||
|
||||
func shapeToGGML(shape []int) *C.int64_t {
|
||||
sh := make([]C.int64_t, len(shape))
|
||||
for i, s := range shape {
|
||||
sh[i] = (C.int64_t)(s)
|
||||
}
|
||||
|
||||
return &sh[0]
|
||||
}
|
||||
|
||||
func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
||||
if len(shape) < 1 || len(shape) > 4 {
|
||||
panic("unsupported number of dimensions")
|
||||
}
|
||||
|
||||
for _, dim := range shape {
|
||||
if dim < 1 {
|
||||
panic("invalid shape")
|
||||
}
|
||||
}
|
||||
|
||||
var t *C.struct_ggml_tensor
|
||||
switch dtype {
|
||||
case ml.DTypeF32:
|
||||
t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F32, C.int(len(shape)), shapeToGGML(shape))
|
||||
case ml.DTypeF16:
|
||||
t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F16, C.int(len(shape)), shapeToGGML(shape))
|
||||
case ml.DTypeI32:
|
||||
t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_I32, C.int(len(shape)), shapeToGGML(shape))
|
||||
default:
|
||||
panic("unsupported dtype")
|
||||
}
|
||||
|
||||
b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t))
|
||||
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
|
||||
C.ggml_set_zero(t)
|
||||
return &Tensor{t: t}
|
||||
}
|
||||
|
||||
func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
|
||||
n := len(s)
|
||||
|
||||
if n == 0 {
|
||||
var shape C.int64_t = 0
|
||||
t := C.ggml_new_tensor(ctx.ctx, dtype, 1, &shape)
|
||||
return &Tensor{t: t}, nil
|
||||
}
|
||||
|
||||
for _, v := range shape {
|
||||
n /= v
|
||||
}
|
||||
|
||||
if n != 1 {
|
||||
return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s))
|
||||
}
|
||||
|
||||
t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), shapeToGGML(shape))
|
||||
b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
|
||||
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
|
||||
C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
|
||||
return &Tensor{t: t}, nil
|
||||
}
|
||||
|
||||
func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||
return fromSlice(c, s, shape, C.GGML_TYPE_F32)
|
||||
}
|
||||
|
||||
func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||
return fromSlice(c, s, shape, C.GGML_TYPE_I32)
|
||||
}
|
||||
|
||||
func (c *Context) Close() {
|
||||
if c != nil {
|
||||
C.ggml_backend_sched_free(c.sched)
|
||||
C.ggml_free(c.ctx)
|
||||
}
|
||||
}
|
||||
|
||||
type Tensor struct {
|
||||
t *C.struct_ggml_tensor
|
||||
sync func()
|
||||
}
|
||||
|
||||
func (t *Tensor) LogValue() slog.Value {
|
||||
return slog.GroupValue(
|
||||
slog.String("name", C.GoString(C.ggml_get_name(t.t))),
|
||||
slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
|
||||
slog.Any("shape", t.Shape()),
|
||||
)
|
||||
}
|
||||
|
||||
func (t *Tensor) Dim(n int) int {
|
||||
return int(t.t.ne[n])
|
||||
}
|
||||
|
||||
func (t *Tensor) Stride(n int) int {
|
||||
return int(t.t.nb[n])
|
||||
}
|
||||
|
||||
func (t *Tensor) Shape() []int {
|
||||
shape := make([]int, C.ggml_n_dims(t.t))
|
||||
for i := range shape {
|
||||
shape[i] = t.Dim(i)
|
||||
}
|
||||
|
||||
return shape
|
||||
}
|
||||
|
||||
func (t *Tensor) Bytes() (data []byte) {
|
||||
if t.sync != nil {
|
||||
data = make([]byte, C.ggml_nbytes(t.t))
|
||||
|
||||
t.sync()
|
||||
C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (t *Tensor) Floats() (data []float32) {
|
||||
if t.sync != nil {
|
||||
data = make([]float32, C.ggml_nelements(t.t))
|
||||
|
||||
t.sync()
|
||||
C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (t *Tensor) DType() ml.DType {
|
||||
switch t.t._type {
|
||||
case C.GGML_TYPE_F32:
|
||||
return ml.DTypeF32
|
||||
case C.GGML_TYPE_F16:
|
||||
return ml.DTypeF16
|
||||
case C.GGML_TYPE_I32:
|
||||
return ml.DTypeI32
|
||||
default:
|
||||
return ml.DTypeOther
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
|
||||
if len(s) > 0 {
|
||||
return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
|
||||
}
|
||||
|
||||
return t
|
||||
}
|
||||
|
||||
func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_cont(ctx.(*Context).ctx, t.t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
|
||||
C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)
|
||||
|
||||
return &Tensor{
|
||||
t: mul,
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
|
||||
tt := (&Tensor{t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
|
||||
if b != nil {
|
||||
tt = tt.Add(ctx, b)
|
||||
}
|
||||
|
||||
return tt
|
||||
}
|
||||
|
||||
func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
|
||||
return (&Tensor{t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
|
||||
}
|
||||
|
||||
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
if len(shape) != 4 {
|
||||
panic("expected 4 dimensions")
|
||||
}
|
||||
|
||||
return &Tensor{
|
||||
t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
if len(shape) != 4 {
|
||||
panic("expected 4 dimensions")
|
||||
}
|
||||
|
||||
return &Tensor{
|
||||
t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
switch len(shape) {
|
||||
case 1:
|
||||
return &Tensor{
|
||||
t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
|
||||
}
|
||||
case 2:
|
||||
return &Tensor{
|
||||
t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
|
||||
}
|
||||
case 3:
|
||||
return &Tensor{
|
||||
t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
|
||||
}
|
||||
case 4:
|
||||
return &Tensor{
|
||||
t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
|
||||
}
|
||||
default:
|
||||
panic("unsupported number of dimensions")
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
if len(shape) != 4 {
|
||||
panic("expected 4 dimensions")
|
||||
}
|
||||
|
||||
return &Tensor{
|
||||
t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
|
||||
switch len(shape) {
|
||||
case 1:
|
||||
return &Tensor{
|
||||
t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
|
||||
}
|
||||
case 3:
|
||||
return &Tensor{
|
||||
t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
|
||||
C.int64_t(shape[0]), C.int64_t(shape[2]),
|
||||
C.size_t(shape[1]),
|
||||
C.size_t(offset)),
|
||||
}
|
||||
case 5:
|
||||
return &Tensor{
|
||||
t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
|
||||
C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
|
||||
C.size_t(shape[1]), C.size_t(shape[3]),
|
||||
C.size_t(offset)),
|
||||
}
|
||||
case 7:
|
||||
return &Tensor{
|
||||
t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
|
||||
C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
|
||||
C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
|
||||
C.size_t(offset)),
|
||||
}
|
||||
default:
|
||||
panic("unsupported number of dimensions")
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
ropeTypeNorm C.int = iota
|
||||
)
|
||||
|
||||
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
|
||||
if ropeFactors == nil {
|
||||
ropeFactors = &Tensor{}
|
||||
}
|
||||
|
||||
dequant := t.t
|
||||
if C.ggml_is_quantized(t.t._type) {
|
||||
dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
|
||||
}
|
||||
|
||||
return &Tensor{
|
||||
t: C.ggml_rope_ext(
|
||||
ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
|
||||
C.int(ropeDim),
|
||||
131072, // YaRN n_ctx_train
|
||||
ropeTypeNorm, // ROPE_TYPE_NORM
|
||||
C.float(ropeBase),
|
||||
C.float(ropeScale),
|
||||
0., // YaRN ext_factor
|
||||
1., // YaRN attn_factor
|
||||
32., // YaRN beta_fast
|
||||
1., // YaRN beta_slow
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
|
||||
return &Tensor{
|
||||
t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Backend) SystemInfo() string {
|
||||
var compiler string
|
||||
switch C.get_compiler() {
|
||||
case C.COMP_UNKNOWN:
|
||||
compiler = "cgo(unknown_compiler)"
|
||||
case C.COMP_GCC:
|
||||
compiler = "cgo(gcc)"
|
||||
case C.COMP_CLANG:
|
||||
compiler = "cgo(clang)"
|
||||
}
|
||||
|
||||
var s string
|
||||
for i := range C.ggml_backend_reg_count() {
|
||||
reg := C.ggml_backend_reg_get(i)
|
||||
fName := C.CString("ggml_backend_get_features")
|
||||
defer C.free(unsafe.Pointer(fName))
|
||||
get_features_fn := C.ggml_backend_reg_get_proc_address(reg, fName)
|
||||
if get_features_fn != nil {
|
||||
s += C.GoString(C.ggml_backend_reg_name(reg))
|
||||
s += " : "
|
||||
for features := C.getBackendFeatures(get_features_fn, reg); features.name != nil; features = C.getNextBackendFeatures(features) {
|
||||
s += C.GoString(features.name)
|
||||
s += " = "
|
||||
s += C.GoString(features.value)
|
||||
s += " | "
|
||||
}
|
||||
}
|
||||
}
|
||||
return s + compiler
|
||||
}
|
||||
128
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
vendored
128
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
vendored
@@ -72,16 +72,6 @@
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
static std::wstring utf8_to_utf16(const std::string & str) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return converter.from_bytes(str);
|
||||
}
|
||||
|
||||
static std::string utf16_to_utf8(const std::wstring & str) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return converter.to_bytes(str);
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic pop
|
||||
#endif
|
||||
@@ -96,12 +86,12 @@ struct dl_handle_deleter {
|
||||
}
|
||||
};
|
||||
|
||||
static dl_handle * dl_load_library(const std::wstring & path) {
|
||||
static dl_handle * dl_load_library(const std::filesystem::path & path) {
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryW(path.c_str());
|
||||
HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
@@ -129,8 +119,8 @@ struct dl_handle_deleter {
|
||||
}
|
||||
};
|
||||
|
||||
static void * dl_load_library(const std::wstring & path) {
|
||||
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
static void * dl_load_library(const std::filesystem::path & path) {
|
||||
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
|
||||
return handle;
|
||||
}
|
||||
@@ -215,13 +205,18 @@ struct ggml_backend_registry {
|
||||
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
||||
#endif
|
||||
devices.push_back({device, score});
|
||||
std::stable_sort(devices.begin(), devices.end(),
|
||||
[](const auto & a, const auto & b) {
|
||||
return a.second > b.second;
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
||||
ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path.string().c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -229,7 +224,7 @@ struct ggml_backend_registry {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn && score_fn() == 0) {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path.string().c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -237,7 +232,7 @@ struct ggml_backend_registry {
|
||||
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||
if (!backend_init_fn) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path.string().c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -246,16 +241,16 @@ struct ggml_backend_registry {
|
||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||
if (!silent) {
|
||||
if (!reg) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path.string().c_str());
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||
__func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
__func__, path.string().c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path.string().c_str());
|
||||
|
||||
register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
|
||||
|
||||
@@ -338,12 +333,7 @@ size_t ggml_backend_dev_count() {
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||
auto devices = get_reg().devices;
|
||||
if (!std::is_heap(devices.begin(), devices.end())) {
|
||||
std::make_heap(devices.begin(), devices.end(), [](const auto & a, const auto & b) { return a.second < b.second; });
|
||||
}
|
||||
|
||||
return devices[index].first;
|
||||
return get_reg().devices[index].first;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
||||
@@ -396,14 +386,14 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
|
||||
// Dynamic loading
|
||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||
return get_reg().load_backend(utf8_to_utf16(path), false);
|
||||
return get_reg().load_backend(path, false);
|
||||
}
|
||||
|
||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||
get_reg().unload_backend(reg, true);
|
||||
}
|
||||
|
||||
static std::wstring get_executable_path() {
|
||||
static std::filesystem::path get_executable_path() {
|
||||
#if defined(__APPLE__)
|
||||
// get executable path
|
||||
std::vector<char> path;
|
||||
@@ -415,15 +405,9 @@ static std::wstring get_executable_path() {
|
||||
}
|
||||
path.resize(size);
|
||||
}
|
||||
std::string base_path(path.data(), size);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('/');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return utf8_to_utf16(base_path + "/");
|
||||
|
||||
return std::filesystem::path(path.data()).parent_path();
|
||||
#elif defined(__linux__) || defined(__FreeBSD__)
|
||||
std::string base_path = ".";
|
||||
std::vector<char> path(1024);
|
||||
while (true) {
|
||||
// get executable path
|
||||
@@ -436,76 +420,56 @@ static std::wstring get_executable_path() {
|
||||
break;
|
||||
}
|
||||
if (len < (ssize_t) path.size()) {
|
||||
base_path = std::string(path.data(), len);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('/');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
break;
|
||||
return std::filesystem::path(path.data()).parent_path();
|
||||
}
|
||||
path.resize(path.size() * 2);
|
||||
}
|
||||
|
||||
return utf8_to_utf16(base_path + "/");
|
||||
#elif defined(_WIN32)
|
||||
std::vector<wchar_t> path(MAX_PATH);
|
||||
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
||||
if (len == 0) {
|
||||
return {};
|
||||
}
|
||||
std::wstring base_path(path.data(), len);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('\\');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return base_path + L"\\";
|
||||
|
||||
return std::filesystem::path(path.data()).parent_path();
|
||||
#else
|
||||
return {};
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::wstring backend_filename_prefix() {
|
||||
static std::string backend_filename_prefix() {
|
||||
#ifdef _WIN32
|
||||
return L"ggml-";
|
||||
return "ggml-";
|
||||
#else
|
||||
return L"libggml-";
|
||||
return "libggml-";
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::wstring backend_filename_suffix() {
|
||||
static std::string backend_filename_suffix() {
|
||||
#ifdef _WIN32
|
||||
return L".dll";
|
||||
return ".dll";
|
||||
#else
|
||||
return L".so";
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::wstring path_separator() {
|
||||
#ifdef _WIN32
|
||||
return L"\\";
|
||||
#else
|
||||
return L"/";
|
||||
return ".so";
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
||||
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||
// TODO: search system paths
|
||||
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
|
||||
std::vector<std::wstring> search_paths;
|
||||
namespace fs = std::filesystem;
|
||||
std::string file_prefix = backend_filename_prefix() + name + "-";
|
||||
std::vector<fs::path> search_paths;
|
||||
|
||||
if (user_search_path == nullptr) {
|
||||
search_paths.push_back(L"." + path_separator());
|
||||
search_paths.push_back(fs::current_path());
|
||||
search_paths.push_back(get_executable_path());
|
||||
} else {
|
||||
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
|
||||
search_paths.push_back(fs::u8path(user_search_path));
|
||||
}
|
||||
|
||||
int best_score = 0;
|
||||
std::wstring best_path;
|
||||
fs::path best_path;
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
for (const auto & search_path : search_paths) {
|
||||
if (!fs::exists(search_path)) {
|
||||
continue;
|
||||
@@ -514,31 +478,31 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
for (const auto & entry : dir_it) {
|
||||
try {
|
||||
if (entry.is_regular_file()) {
|
||||
std::wstring filename = entry.path().filename().wstring();
|
||||
std::wstring ext = entry.path().extension().wstring();
|
||||
std::string filename = entry.path().filename().string();
|
||||
std::string ext = entry.path().extension().string();
|
||||
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
||||
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
|
||||
dl_handle_ptr handle { dl_load_library(entry.path()) };
|
||||
if (!handle) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (!score_fn) {
|
||||
GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
int s = score_fn();
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
|
||||
if (s > best_score) {
|
||||
best_score = s;
|
||||
best_path = entry.path().wstring();
|
||||
best_path = entry.path();
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), e.what());
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, entry.path().string().c_str(), e.what());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -546,7 +510,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
if (best_score == 0) {
|
||||
// try to load the base backend
|
||||
for (const auto & search_path : search_paths) {
|
||||
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
|
||||
fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
|
||||
if (fs::exists(path)) {
|
||||
return get_reg().load_backend(path, silent);
|
||||
}
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
//go:build debug
|
||||
|
||||
package ggml
|
||||
|
||||
// #cgo CPPFLAGS: -DOLLAMA_DEBUG
|
||||
import "C"
|
||||
11
ml/nn/convolution.go
Normal file
11
ml/nn/convolution.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package nn
|
||||
|
||||
import "github.com/ollama/ollama/ml"
|
||||
|
||||
type Conv2D struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
}
|
||||
|
||||
func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
|
||||
return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
|
||||
}
|
||||
11
ml/nn/embedding.go
Normal file
11
ml/nn/embedding.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package nn
|
||||
|
||||
import "github.com/ollama/ollama/ml"
|
||||
|
||||
type Embedding struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
}
|
||||
|
||||
func (m *Embedding) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
|
||||
return m.Weight.Rows(ctx, hiddenState)
|
||||
}
|
||||
17
ml/nn/linear.go
Normal file
17
ml/nn/linear.go
Normal file
@@ -0,0 +1,17 @@
|
||||
package nn
|
||||
|
||||
import "github.com/ollama/ollama/ml"
|
||||
|
||||
type Linear struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
|
||||
t = m.Weight.Mulmat(ctx, t)
|
||||
if m.Bias != nil {
|
||||
t = t.Add(ctx, m.Bias)
|
||||
}
|
||||
|
||||
return t
|
||||
}
|
||||
22
ml/nn/normalization.go
Normal file
22
ml/nn/normalization.go
Normal file
@@ -0,0 +1,22 @@
|
||||
package nn
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
type LayerNorm struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *LayerNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
|
||||
return t.LayerNorm(ctx, m.Weight, m.Bias, eps)
|
||||
}
|
||||
|
||||
type RMSNorm struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
}
|
||||
|
||||
func (m *RMSNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
|
||||
return t.RMSNorm(ctx, m.Weight, eps)
|
||||
}
|
||||
255
model/model.go
Normal file
255
model/model.go
Normal file
@@ -0,0 +1,255 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"image"
|
||||
_ "image/jpeg"
|
||||
_ "image/png"
|
||||
"log/slog"
|
||||
"os"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
_ "golang.org/x/image/bmp"
|
||||
_ "golang.org/x/image/tiff"
|
||||
_ "golang.org/x/image/webp"
|
||||
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
_ "github.com/ollama/ollama/ml/backend"
|
||||
)
|
||||
|
||||
// Options contains the inputs for a model forward pass
|
||||
type Options struct {
|
||||
Inputs []int32
|
||||
Positions []int32
|
||||
Sequences []int
|
||||
Outputs []int32
|
||||
|
||||
Images []image.Image
|
||||
}
|
||||
|
||||
type config struct {
|
||||
Cache kvcache.Cache
|
||||
}
|
||||
|
||||
// Base implements the common fields and methods for all models
|
||||
type Base struct {
|
||||
b ml.Backend
|
||||
config
|
||||
}
|
||||
|
||||
// Backend returns the underlying backend that will run the model
|
||||
func (m *Base) Backend() ml.Backend {
|
||||
return m.b
|
||||
}
|
||||
|
||||
func (m *Base) Config() config {
|
||||
return m.config
|
||||
}
|
||||
|
||||
// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
|
||||
type Model interface {
|
||||
Forward(ml.Context, Options) (ml.Tensor, error)
|
||||
|
||||
Backend() ml.Backend
|
||||
Config() config
|
||||
}
|
||||
|
||||
var models = make(map[string]func(ml.Config) (Model, error))
|
||||
|
||||
// Register registers a model constructor for the given architecture
|
||||
func Register(name string, f func(ml.Config) (Model, error)) {
|
||||
if _, ok := models[name]; ok {
|
||||
panic("model: model already registered")
|
||||
}
|
||||
|
||||
models[name] = f
|
||||
}
|
||||
|
||||
// New initializes a new model instance with the provided configuration based on the metadata in the model file
|
||||
func New(modelPath string) (Model, error) {
|
||||
r, err := os.Open(modelPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
b, err := ml.NewBackend(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
arch := b.Config().Architecture()
|
||||
f, ok := models[arch]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("unsupported model architecture %q", arch)
|
||||
}
|
||||
|
||||
m, err := f(b.Config())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
base := Base{b: b, config: m.Config()}
|
||||
|
||||
v := reflect.ValueOf(m)
|
||||
v.Elem().Set(populateFields(base, v.Elem()))
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
|
||||
t := v.Type()
|
||||
|
||||
if t.Kind() == reflect.Struct {
|
||||
allNil := true
|
||||
for i := range t.NumField() {
|
||||
tt := t.Field(i).Type
|
||||
vv := v.Field(i)
|
||||
if !vv.CanSet() {
|
||||
continue
|
||||
}
|
||||
|
||||
// make a copy
|
||||
tagsCopy := tags
|
||||
if tag := t.Field(i).Tag.Get("gguf"); tag != "" {
|
||||
tagsCopy = append(tagsCopy, ParseTags(tag))
|
||||
}
|
||||
|
||||
if tt == reflect.TypeOf((*Base)(nil)).Elem() {
|
||||
vv.Set(reflect.ValueOf(base))
|
||||
} else if tt == reflect.TypeOf((*ml.Tensor)(nil)).Elem() {
|
||||
var fn func([]Tag) [][]string
|
||||
fn = func(tags []Tag) (values [][]string) {
|
||||
if len(tags) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
values = [][]string{{tags[0].Name}}
|
||||
for _, alt := range tags[0].Alternate {
|
||||
values = append(values, []string{alt})
|
||||
}
|
||||
|
||||
for i, value := range values {
|
||||
for _, rest := range fn(tags[1:]) {
|
||||
value = append(value, rest...)
|
||||
}
|
||||
|
||||
values[i] = value
|
||||
}
|
||||
|
||||
return values
|
||||
}
|
||||
|
||||
names := fn(tagsCopy)
|
||||
for _, name := range names {
|
||||
if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil {
|
||||
slog.Debug("found tensor", "", tensor)
|
||||
vv.Set(reflect.ValueOf(tensor))
|
||||
break
|
||||
}
|
||||
}
|
||||
} else if tt.Kind() == reflect.Pointer || tt.Kind() == reflect.Interface {
|
||||
setPointer(base, vv, tagsCopy)
|
||||
} else if tt.Kind() == reflect.Slice || tt.Kind() == reflect.Array {
|
||||
for i := range vv.Len() {
|
||||
vvv := vv.Index(i)
|
||||
if vvv.Kind() == reflect.Pointer || vvv.Kind() == reflect.Interface {
|
||||
setPointer(base, vvv, append(tagsCopy, Tag{Name: strconv.Itoa(i)}))
|
||||
} else {
|
||||
vvv.Set(populateFields(base, vvv, append(tagsCopy, Tag{Name: strconv.Itoa(i)})...))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !canNil(tt) || !vv.IsNil() {
|
||||
allNil = false
|
||||
}
|
||||
}
|
||||
|
||||
if allNil {
|
||||
return reflect.Zero(t)
|
||||
}
|
||||
}
|
||||
|
||||
return v
|
||||
}
|
||||
|
||||
func setPointer(base Base, v reflect.Value, tags []Tag) {
|
||||
vv := v
|
||||
if v.Kind() == reflect.Interface {
|
||||
if v.IsNil() {
|
||||
return
|
||||
}
|
||||
|
||||
vv = vv.Elem()
|
||||
}
|
||||
|
||||
vv = vv.Elem()
|
||||
if v.IsNil() {
|
||||
vv = reflect.New(v.Type().Elem()).Elem()
|
||||
}
|
||||
|
||||
if f := populateFields(base, vv, tags...); f.CanAddr() {
|
||||
v.Set(f.Addr())
|
||||
}
|
||||
}
|
||||
|
||||
type Tag struct {
|
||||
Name string
|
||||
Alternate []string
|
||||
}
|
||||
|
||||
func ParseTags(s string) (tag Tag) {
|
||||
parts := strings.Split(s, ",")
|
||||
if len(parts) > 0 {
|
||||
tag.Name = parts[0]
|
||||
|
||||
for _, part := range parts[1:] {
|
||||
if value, ok := strings.CutPrefix(part, "alt:"); ok {
|
||||
tag.Alternate = append(tag.Alternate, value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func canNil(t reflect.Type) bool {
|
||||
return t.Kind() == reflect.Chan ||
|
||||
t.Kind() == reflect.Func ||
|
||||
t.Kind() == reflect.Interface ||
|
||||
t.Kind() == reflect.Map ||
|
||||
t.Kind() == reflect.Pointer ||
|
||||
t.Kind() == reflect.Slice
|
||||
}
|
||||
|
||||
func Forward(ctx ml.Context, m Model, opts Options) (ml.Tensor, error) {
|
||||
if len(opts.Positions) != len(opts.Sequences) {
|
||||
return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(opts.Positions), len(opts.Sequences))
|
||||
}
|
||||
|
||||
if len(opts.Positions) < 1 {
|
||||
return nil, errors.New("batch size cannot be less than 1")
|
||||
}
|
||||
|
||||
cache := m.Config().Cache
|
||||
if cache != nil {
|
||||
err := cache.StartForward(ctx, opts.Positions, opts.Sequences)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
t, err := m.Forward(ctx, opts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ctx.Forward(t)
|
||||
ctx.Compute(t)
|
||||
|
||||
return t, nil
|
||||
}
|
||||
136
model/model_test.go
Normal file
136
model/model_test.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"slices"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/backend/ggml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
)
|
||||
|
||||
func TestParseTags(t *testing.T) {
|
||||
cases := []struct {
|
||||
value string
|
||||
want Tag
|
||||
}{
|
||||
{
|
||||
value: "output",
|
||||
want: Tag{
|
||||
Name: "output",
|
||||
},
|
||||
},
|
||||
{
|
||||
value: "output,alt:token_embd",
|
||||
want: Tag{
|
||||
Name: "output",
|
||||
Alternate: []string{
|
||||
"token_embd",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.value, func(t *testing.T) {
|
||||
got := ParseTags(tt.value)
|
||||
if diff := cmp.Diff(tt.want, got); diff != "" {
|
||||
t.Errorf("ParseTags() returned unexpected values (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
type fakeBackend struct {
|
||||
*ggml.Backend
|
||||
names []string
|
||||
}
|
||||
|
||||
type fakeTensor struct {
|
||||
*ggml.Tensor
|
||||
Name string
|
||||
}
|
||||
|
||||
func (m *fakeBackend) Get(name string) ml.Tensor {
|
||||
if slices.Contains(m.names, name) {
|
||||
return &fakeTensor{Name: name}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func TestPopulateFields(t *testing.T) {
|
||||
type fakeLayer struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_o"`
|
||||
}
|
||||
|
||||
type fakeModel struct {
|
||||
Input *nn.Embedding `gguf:"input"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output"`
|
||||
Layers [2]fakeLayer `gguf:"blk"`
|
||||
}
|
||||
|
||||
var m fakeModel
|
||||
v := reflect.ValueOf(&m)
|
||||
v.Elem().Set(populateFields(Base{b: &fakeBackend{
|
||||
names: []string{
|
||||
"input.weight",
|
||||
"blk.0.attn_q.weight",
|
||||
"blk.0.attn_k.weight",
|
||||
"blk.0.attn_v.weight",
|
||||
"blk.1.attn_q.weight",
|
||||
"blk.1.attn_k.weight",
|
||||
"blk.1.attn_v.weight",
|
||||
"output_norm.weight",
|
||||
"output.weight",
|
||||
},
|
||||
}}, v.Elem()))
|
||||
|
||||
if diff := cmp.Diff(fakeModel{
|
||||
Input: &nn.Embedding{Weight: &fakeTensor{Name: "input.weight"}},
|
||||
OutputNorm: &nn.RMSNorm{Weight: &fakeTensor{Name: "output_norm.weight"}},
|
||||
Output: &nn.Linear{Weight: &fakeTensor{Name: "output.weight"}},
|
||||
Layers: [2]fakeLayer{
|
||||
{
|
||||
Query: &nn.Linear{Weight: &fakeTensor{Name: "blk.0.attn_q.weight"}},
|
||||
Key: &nn.Linear{Weight: &fakeTensor{Name: "blk.0.attn_k.weight"}},
|
||||
Value: &nn.Linear{Weight: &fakeTensor{Name: "blk.0.attn_v.weight"}},
|
||||
},
|
||||
{
|
||||
Query: &nn.Linear{Weight: &fakeTensor{Name: "blk.1.attn_q.weight"}},
|
||||
Key: &nn.Linear{Weight: &fakeTensor{Name: "blk.1.attn_k.weight"}},
|
||||
Value: &nn.Linear{Weight: &fakeTensor{Name: "blk.1.attn_v.weight"}},
|
||||
},
|
||||
},
|
||||
}, m); diff != "" {
|
||||
t.Errorf("populateFields() set incorrect values (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPopulateFieldsAlternateName(t *testing.T) {
|
||||
type fakeModel struct {
|
||||
Input *nn.Embedding `gguf:"input"`
|
||||
Output *nn.Linear `gguf:"output,alt:input"`
|
||||
}
|
||||
|
||||
m := fakeModel{}
|
||||
v := reflect.ValueOf(&m)
|
||||
v.Elem().Set(populateFields(Base{b: &fakeBackend{
|
||||
names: []string{
|
||||
"input.weight",
|
||||
},
|
||||
}}, v.Elem()))
|
||||
|
||||
if diff := cmp.Diff(fakeModel{
|
||||
Input: &nn.Embedding{Weight: &fakeTensor{Name: "input.weight"}},
|
||||
Output: &nn.Linear{Weight: &fakeTensor{Name: "input.weight"}},
|
||||
}, m); diff != "" {
|
||||
t.Errorf("populateFields() set incorrect values (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
167
model/models/llama/model.go
Normal file
167
model/models/llama/model.go
Normal file
@@ -0,0 +1,167 @@
|
||||
package llama
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
eps, ropeBase, ropeScale float32
|
||||
ropeDim uint32
|
||||
}
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
*Options
|
||||
}
|
||||
|
||||
func New(c ml.Config) (model.Model, error) {
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Uints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
},
|
||||
),
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Options: &Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
ropeDim: c.Uint("rope.dimension_count"),
|
||||
},
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
||||
|
||||
return &m, nil
|
||||
}
|
||||
|
||||
type SelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
}
|
||||
|
||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
q := sa.Query.Forward(ctx, hiddenState)
|
||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
k := sa.Key.Forward(ctx, hiddenState)
|
||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
v := sa.Value.Forward(ctx, hiddenState)
|
||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
cache.Put(ctx, k, v)
|
||||
k, v, mask := cache.Get(ctx)
|
||||
|
||||
q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
kq := k.MulmatFullPrec(ctx, q)
|
||||
kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
kq = kq.Add(ctx, mask)
|
||||
kq = kq.Softmax(ctx)
|
||||
|
||||
kqv := v.Mulmat(ctx, kq)
|
||||
kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, kqv)
|
||||
}
|
||||
|
||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, m.Options.ropeBase, m.Options.ropeScale), nil
|
||||
}
|
||||
|
||||
type MLP struct {
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
}
|
||||
|
||||
func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
|
||||
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
|
||||
return mlp.Down.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
type Layer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
SelfAttention *SelfAttention
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *MLP
|
||||
}
|
||||
|
||||
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
|
||||
inputs, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
positions, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
|
||||
|
||||
for i, layer := range m.Layers {
|
||||
m.Cache.SetLayer(i)
|
||||
hiddenState = layer.Forward(ctx, hiddenState, positions, m.Cache, m.Options)
|
||||
}
|
||||
|
||||
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
||||
hiddenState = m.Output.Forward(ctx, hiddenState)
|
||||
|
||||
outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return hiddenState.Rows(ctx, outputs), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("llama", New)
|
||||
}
|
||||
109
model/models/mllama/model.go
Normal file
109
model/models/mllama/model.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
*VisionModel `gguf:"v,vision"`
|
||||
*TextModel
|
||||
|
||||
Projector *nn.Linear `gguf:"mm.0"`
|
||||
|
||||
ImageProcessor
|
||||
}
|
||||
|
||||
const (
|
||||
crossAttentionLayer = iota
|
||||
selfAttentionLayer
|
||||
)
|
||||
|
||||
func New(c ml.Config) (model.Model, error) {
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Uints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
},
|
||||
),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
VisionModel: newVisionModel(c),
|
||||
TextModel: newTextModel(c),
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewWrapperCache(kvcache.NewEncoderCache(), kvcache.NewCausalCache(m.TextModel.Shift))
|
||||
|
||||
return &m, nil
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
|
||||
var crossAttentionStates ml.Tensor
|
||||
if opts.Images != nil {
|
||||
f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(opts.Images[0])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pixelValues, err := ctx.FromFloatSlice(f32s,
|
||||
m.ImageProcessor.imageSize,
|
||||
m.ImageProcessor.imageSize,
|
||||
m.ImageProcessor.numChannels,
|
||||
m.ImageProcessor.maxNumTiles,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
aspectRatio, err := ctx.FromIntSlice([]int32{int32(aspectRatioID)}, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
positions := make([]int32, 1601)
|
||||
for i := range positions {
|
||||
positions[i] = int32(i)
|
||||
}
|
||||
|
||||
positionIDs, err := ctx.FromIntSlice(positions, len(positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
crossAttentionStates = m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
|
||||
crossAttentionStates = m.Projector.Forward(ctx, crossAttentionStates)
|
||||
}
|
||||
|
||||
inputs, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
positions, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// TODO: attention mask, cross attention mask
|
||||
hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache))
|
||||
|
||||
outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return hiddenState.Rows(ctx, outputs), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("mllama", New)
|
||||
}
|
||||
241
model/models/mllama/model_text.go
Normal file
241
model/models/mllama/model_text.go
Normal file
@@ -0,0 +1,241 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
)
|
||||
|
||||
type TextSelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
}
|
||||
|
||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
query := sa.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
key := sa.Key.Forward(ctx, hiddenState)
|
||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
value := sa.Value.Forward(ctx, hiddenState)
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
cache.Put(ctx, key, value)
|
||||
key, value, mask := cache.Get(ctx)
|
||||
|
||||
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
scores := key.MulmatFullPrec(ctx, query)
|
||||
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
scores = scores.Add(ctx, mask)
|
||||
scores = scores.Softmax(ctx)
|
||||
|
||||
attention := value.Mulmat(ctx, scores)
|
||||
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
// This will only get called for layers in the cache, which are just the self attention layers
|
||||
return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
|
||||
}
|
||||
|
||||
type TextMLP struct {
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
}
|
||||
|
||||
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextModelOptions) ml.Tensor {
|
||||
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
|
||||
return mlp.Down.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
type TextSelfAttentionDecoderLayer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
SelfAttention *TextSelfAttention
|
||||
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *TextMLP
|
||||
}
|
||||
|
||||
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type TextCrossAttention struct {
|
||||
QueryNorm *nn.RMSNorm `gguf:"cross_attn_q_norm"`
|
||||
Query *nn.Linear `gguf:"cross_attn_q_proj"`
|
||||
KeyNorm *nn.RMSNorm `gguf:"cross_attn_k_norm"`
|
||||
Key *nn.Linear `gguf:"cross_attn_k_proj"`
|
||||
Value *nn.Linear `gguf:"cross_attn_v_proj"`
|
||||
Output *nn.Linear `gguf:"cross_attn_o_proj"`
|
||||
}
|
||||
|
||||
func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
query := ca.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
query = ca.QueryNorm.Forward(ctx, query, opts.eps)
|
||||
|
||||
var key, value ml.Tensor
|
||||
if crossAttentionStates != nil {
|
||||
numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
|
||||
|
||||
key = ca.Key.Forward(ctx, crossAttentionStates)
|
||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
|
||||
key = ca.KeyNorm.Forward(ctx, key, opts.eps)
|
||||
|
||||
value = ca.Value.Forward(ctx, crossAttentionStates)
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
|
||||
|
||||
cache.Put(ctx, key, value)
|
||||
} else {
|
||||
key, value, _ = cache.Get(ctx)
|
||||
}
|
||||
|
||||
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
scores := key.Mulmat(ctx, query)
|
||||
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
scores = scores.Softmax(ctx)
|
||||
|
||||
attention := value.Mulmat(ctx, scores)
|
||||
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return ca.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type TextCrossAttentionDecoderLayer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
CrossAttention *TextCrossAttention
|
||||
AttentionGate ml.Tensor `gguf:"cross_attn_attn_gate"`
|
||||
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *TextMLP
|
||||
MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
|
||||
}
|
||||
|
||||
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, opts)
|
||||
hiddenState = hiddenState.Mul(ctx, d.AttentionGate.Tanh(ctx))
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
|
||||
hiddenState = hiddenState.Mul(ctx, d.MLPGate.Tanh(ctx))
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type TextDecoderLayer interface {
|
||||
Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
|
||||
}
|
||||
|
||||
type TextDecoder struct {
|
||||
Layers []TextDecoderLayer
|
||||
}
|
||||
|
||||
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
||||
for i, layer := range d.Layers {
|
||||
layerType := selfAttentionLayer
|
||||
if slices.Contains(opts.crossAttentionLayers, uint32(i)) {
|
||||
layerType = crossAttentionLayer
|
||||
}
|
||||
|
||||
cache.SetLayer(i)
|
||||
cache.SetLayerType(layerType)
|
||||
|
||||
if layerType == selfAttentionLayer || crossAttentionStates != nil || cache.UnderlyingCache().(*kvcache.EncoderCache).EncoderCached() {
|
||||
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, opts)
|
||||
}
|
||||
}
|
||||
|
||||
return hiddenState
|
||||
}
|
||||
|
||||
type TextModelOptions struct {
|
||||
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
||||
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
eps, ropeBase, ropeScale float32
|
||||
ropeDim uint32
|
||||
|
||||
crossAttentionLayers []uint32
|
||||
}
|
||||
|
||||
type TextModel struct {
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Transformer *TextDecoder `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output"`
|
||||
|
||||
*TextModelOptions
|
||||
}
|
||||
|
||||
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
|
||||
hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
|
||||
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
|
||||
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
||||
return m.Output.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
func newTextModel(c ml.Config) *TextModel {
|
||||
var decoderLayers []TextDecoderLayer
|
||||
for i := range c.Uint("block_count") {
|
||||
var textDecoderLayer TextDecoderLayer
|
||||
if slices.Contains(c.Uints("attention.cross_attention_layers"), i) {
|
||||
textDecoderLayer = &TextCrossAttentionDecoderLayer{}
|
||||
} else {
|
||||
textDecoderLayer = &TextSelfAttentionDecoderLayer{}
|
||||
}
|
||||
|
||||
decoderLayers = append(decoderLayers, textDecoderLayer)
|
||||
}
|
||||
|
||||
return &TextModel{
|
||||
Transformer: &TextDecoder{Layers: decoderLayers},
|
||||
TextModelOptions: &TextModelOptions{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
ropeDim: c.Uint("rope.dimension_count"),
|
||||
crossAttentionLayers: c.Uints("attention.cross_attention_layers"),
|
||||
},
|
||||
}
|
||||
}
|
||||
234
model/models/mllama/model_vision.go
Normal file
234
model/models/mllama/model_vision.go
Normal file
@@ -0,0 +1,234 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
)
|
||||
|
||||
var batchSize int = 1
|
||||
|
||||
type VisionSelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
|
||||
Gate ml.Tensor `gguf:"attn_gate"`
|
||||
}
|
||||
|
||||
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
query := sa.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
|
||||
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
|
||||
key := sa.Key.Forward(ctx, hiddenState)
|
||||
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
|
||||
value := sa.Value.Forward(ctx, hiddenState)
|
||||
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
scores := key.Mulmat(ctx, query)
|
||||
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
scores = scores.Softmax(ctx)
|
||||
|
||||
attention := value.Mulmat(ctx, scores)
|
||||
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
|
||||
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
|
||||
|
||||
hiddenState = sa.Output.Forward(ctx, attention)
|
||||
if sa.Gate != nil {
|
||||
hiddenState = hiddenState.Mul(ctx, sa.Gate)
|
||||
}
|
||||
|
||||
return hiddenState
|
||||
}
|
||||
|
||||
type VisionMLP struct {
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
|
||||
Gate ml.Tensor `gguf:"ffn_gate"`
|
||||
}
|
||||
|
||||
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
|
||||
hiddenState = mlp.Up.Forward(ctx, hiddenState)
|
||||
if mlp.Gate != nil {
|
||||
hiddenState = hiddenState.Mul(ctx, mlp.Gate)
|
||||
}
|
||||
|
||||
return hiddenState
|
||||
}
|
||||
|
||||
type VisionEncoderLayer struct {
|
||||
AttentionNorm *nn.LayerNorm `gguf:"ln1"`
|
||||
SelfAttention *VisionSelfAttention
|
||||
|
||||
MLPNorm *nn.LayerNorm `gguf:"ln2"`
|
||||
MLP *VisionMLP
|
||||
}
|
||||
|
||||
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
// self attention
|
||||
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
// feed forward
|
||||
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type VisionEncoder struct {
|
||||
Layers []VisionEncoderLayer
|
||||
}
|
||||
|
||||
func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []uint32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
|
||||
var intermediateHiddenStates []ml.Tensor
|
||||
for i, layer := range e.Layers {
|
||||
if slices.Contains(intermediateLayersIndices, uint32(i)) {
|
||||
intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int{1}, hiddenState.Shape()...)...))
|
||||
}
|
||||
|
||||
hiddenState = layer.Forward(ctx, hiddenState, opts)
|
||||
}
|
||||
|
||||
return hiddenState, intermediateHiddenStates
|
||||
}
|
||||
|
||||
type PrecomputedAspectRatioEmbedding struct {
|
||||
Embedding *nn.Embedding
|
||||
Gate ml.Tensor `gguf:"gate"`
|
||||
}
|
||||
|
||||
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
|
||||
embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
|
||||
if e.Gate != nil {
|
||||
embeddings = embeddings.Mul(ctx, e.Gate)
|
||||
}
|
||||
|
||||
return hiddenState.Add(ctx, embeddings)
|
||||
}
|
||||
|
||||
type PrecomputedPositionEmbedding struct {
|
||||
PositionEmbedding *nn.Embedding `gguf:"position_embd"`
|
||||
PositionEmbeddingGate ml.Tensor `gguf:"position_embd.gate"`
|
||||
|
||||
TilePositionEmbedding *nn.Embedding `gguf:"tile_position_embd"`
|
||||
TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"`
|
||||
}
|
||||
|
||||
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor {
|
||||
positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
|
||||
if e.PositionEmbeddingGate != nil {
|
||||
positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
|
||||
}
|
||||
|
||||
hiddenState = hiddenState.Add(ctx, positionEmbedding)
|
||||
|
||||
tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
|
||||
tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
|
||||
if e.TilePositionEmbeddingGate != nil {
|
||||
tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
|
||||
}
|
||||
|
||||
return hiddenState.Add(ctx, tilePositionEmbedding)
|
||||
}
|
||||
|
||||
type VisionModelOptions struct {
|
||||
hiddenSize, numHeads, numTiles int
|
||||
imageSize, patchSize int
|
||||
eps float32
|
||||
|
||||
intermediateLayersIndices []uint32
|
||||
}
|
||||
|
||||
type VisionModel struct {
|
||||
PatchEmbeddings *nn.Conv2D `gguf:"patch_embd"`
|
||||
|
||||
PreTilePositionEmbedding *PrecomputedAspectRatioEmbedding `gguf:"pre_tile_position_embd"`
|
||||
PostTilePositionEmbedding *PrecomputedAspectRatioEmbedding `gguf:"post_tile_position_embd"`
|
||||
PositionEmbedding *PrecomputedPositionEmbedding
|
||||
|
||||
PreLayerNorm *nn.LayerNorm `gguf:"pre_ln"`
|
||||
PostLayerNorm *nn.LayerNorm `gguf:"post_ln"`
|
||||
ClassEmbedding ml.Tensor `gguf:"class_embd"`
|
||||
|
||||
Transformer *VisionEncoder `gguf:"blk"`
|
||||
GlobalTransformer *VisionEncoder `gguf:"global.blk"`
|
||||
|
||||
*VisionModelOptions
|
||||
}
|
||||
|
||||
func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRatioIDs ml.Tensor) ml.Tensor {
|
||||
numPatches := (m.imageSize / m.patchSize) * (m.imageSize / m.patchSize)
|
||||
numPositions := numPatches
|
||||
if m.ClassEmbedding != nil {
|
||||
numPositions++
|
||||
}
|
||||
|
||||
hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
|
||||
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
|
||||
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||
|
||||
hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
|
||||
hiddenState = m.ClassEmbedding.Stack(ctx, 2, slices.Repeat([]ml.Tensor{m.ClassEmbedding}, m.numTiles-1)...).Concat(ctx, hiddenState, 1)
|
||||
|
||||
hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
|
||||
hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)
|
||||
|
||||
numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
|
||||
hiddenState = hiddenState.Pad(ctx, 0, numPaddingPatches, 0, 0)
|
||||
|
||||
hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), hiddenState.Dim(1)*hiddenState.Dim(2), batchSize)
|
||||
hiddenState, intermediateHiddenStates := m.Transformer.Forward(ctx, hiddenState, m.intermediateLayersIndices, m.VisionModelOptions)
|
||||
|
||||
hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
|
||||
|
||||
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
|
||||
hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
|
||||
|
||||
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
|
||||
hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)
|
||||
|
||||
hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
|
||||
hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
|
||||
hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)
|
||||
|
||||
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
|
||||
hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
|
||||
return hiddenState.Concat(ctx, hiddenStates, 0)
|
||||
}
|
||||
|
||||
func newVisionModel(c ml.Config) *VisionModel {
|
||||
return &VisionModel{
|
||||
Transformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count"))},
|
||||
GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},
|
||||
|
||||
VisionModelOptions: &VisionModelOptions{
|
||||
hiddenSize: int(c.Uint("vision.embedding_length")),
|
||||
numHeads: int(c.Uint("vision.attention.head_count")),
|
||||
numTiles: int(c.Uint("vision.max_num_tiles")),
|
||||
|
||||
imageSize: int(c.Uint("vision.image_size")),
|
||||
patchSize: int(c.Uint("vision.patch_size")),
|
||||
|
||||
eps: c.Float("vision.attention.layer_norm_epsilon"),
|
||||
|
||||
intermediateLayersIndices: c.Uints("vision.intermediate_layers_indices"),
|
||||
},
|
||||
}
|
||||
}
|
||||
240
model/models/mllama/process_image.go
Normal file
240
model/models/mllama/process_image.go
Normal file
@@ -0,0 +1,240 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"image"
|
||||
"image/color"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"golang.org/x/image/draw"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
type ImageProcessor struct {
|
||||
imageSize, numChannels, maxNumTiles int
|
||||
}
|
||||
|
||||
func newImageProcessor(c ml.Config) ImageProcessor {
|
||||
return ImageProcessor{
|
||||
imageSize: int(c.Uint("vision.image_size")),
|
||||
numChannels: int(c.Uint("vision.num_channels")),
|
||||
maxNumTiles: int(c.Uint("vision.max_num_tiles")),
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
|
||||
ratios := []image.Point{}
|
||||
|
||||
for w := range maxTiles {
|
||||
for h := range maxTiles {
|
||||
if (w+1)*(h+1) <= maxTiles {
|
||||
ratios = append(ratios, image.Point{w + 1, h + 1})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ratios
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) clip(a, a_min, a_max int) int {
|
||||
if a < a_min {
|
||||
return a_min
|
||||
} else if a > a_max {
|
||||
return a_max
|
||||
}
|
||||
|
||||
return a
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
||||
targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
|
||||
targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
|
||||
|
||||
scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
||||
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
||||
|
||||
var w, h int
|
||||
|
||||
if scaleWidth < scaleHeight {
|
||||
w = targetWidth
|
||||
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
||||
} else {
|
||||
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
||||
h = targetHeight
|
||||
}
|
||||
|
||||
return image.Point{w, h}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
||||
possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
|
||||
possibleCanvasSizes := []image.Point{}
|
||||
for _, pta := range possibleTileArrangements {
|
||||
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
||||
}
|
||||
|
||||
scales := []float64{}
|
||||
|
||||
for _, pcs := range possibleCanvasSizes {
|
||||
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
|
||||
scaleWidth := float64(pcs.X) / float64(imageSize.X)
|
||||
|
||||
if scaleWidth > scaleHeight {
|
||||
scales = append(scales, scaleHeight)
|
||||
} else {
|
||||
scales = append(scales, scaleWidth)
|
||||
}
|
||||
}
|
||||
|
||||
var minUpscale float64
|
||||
var maxDownscale float64
|
||||
var upscale bool
|
||||
|
||||
for _, s := range scales {
|
||||
if s > 1.0 {
|
||||
upscale = true
|
||||
if minUpscale == 0 {
|
||||
minUpscale = s
|
||||
} else {
|
||||
minUpscale = math.Min(minUpscale, s)
|
||||
}
|
||||
} else {
|
||||
maxDownscale = math.Max(maxDownscale, s)
|
||||
}
|
||||
}
|
||||
|
||||
selectedScale := maxDownscale
|
||||
if upscale {
|
||||
selectedScale = minUpscale
|
||||
}
|
||||
|
||||
var selectedCanvas image.Point
|
||||
for n, pcs := range possibleCanvasSizes {
|
||||
if scales[n] == selectedScale {
|
||||
// choose the smallest possible canvas
|
||||
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
|
||||
selectedCanvas = pcs
|
||||
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
|
||||
selectedCanvas = pcs
|
||||
}
|
||||
}
|
||||
}
|
||||
return selectedCanvas
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
||||
b := img.Bounds()
|
||||
width := b.Max.X - b.Min.X
|
||||
height := b.Max.Y - b.Min.Y
|
||||
tileHeight := height / numTilesSize.Y
|
||||
tileWidth := width / numTilesSize.X
|
||||
|
||||
images := []image.Image{}
|
||||
|
||||
for h := range numTilesSize.Y {
|
||||
for w := range numTilesSize.X {
|
||||
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
|
||||
images = append(images, img.(interface {
|
||||
SubImage(image.Rectangle) image.Image
|
||||
}).SubImage(rect))
|
||||
}
|
||||
}
|
||||
|
||||
return images
|
||||
}
|
||||
|
||||
// remove the "alpha" channel by drawing over a prefilled image
|
||||
//
|
||||
// remove the "alpha" channel by drawing over a prefilled image
|
||||
//
|
||||
//nolint:unused
|
||||
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
|
||||
dst := image.NewRGBA(img.Bounds())
|
||||
|
||||
white := color.RGBA{255, 255, 255, 255}
|
||||
draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
|
||||
draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
||||
b := img.Bounds()
|
||||
tileSize := outputSize.Y
|
||||
|
||||
canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
|
||||
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
||||
newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
|
||||
|
||||
// scaling choices:
|
||||
// NearestNeighbor fast, blocky output
|
||||
// ApproxBiLinear fast, medium quality
|
||||
// BiLinear slow, high quality
|
||||
// CatmullRom very slow, very high quality
|
||||
draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
|
||||
|
||||
return dst, aspectRatio
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
|
||||
paddedSize := image.Point{
|
||||
X: outputSize.X * aspectRatio.X,
|
||||
Y: outputSize.Y * aspectRatio.Y,
|
||||
}
|
||||
|
||||
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
|
||||
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
|
||||
subImages := p.splitToTiles(img, aspectRatio)
|
||||
|
||||
var pixelVals []float32
|
||||
|
||||
for _, subImg := range subImages {
|
||||
bounds := subImg.Bounds()
|
||||
var rVals, gVals, bVals []float32
|
||||
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
||||
for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
||||
c := subImg.At(x, y)
|
||||
r, g, b, _ := c.RGBA()
|
||||
rVal := float32(r>>8) / 255.0
|
||||
gVal := float32(g>>8) / 255.0
|
||||
bVal := float32(b>>8) / 255.0
|
||||
|
||||
rVal = (rVal - mean[0]) / std[0]
|
||||
gVal = (gVal - mean[1]) / std[1]
|
||||
bVal = (bVal - mean[2]) / std[2]
|
||||
|
||||
rVals = append(rVals, rVal)
|
||||
gVals = append(gVals, gVal)
|
||||
bVals = append(bVals, bVal)
|
||||
}
|
||||
}
|
||||
pixelVals = append(pixelVals, rVals...)
|
||||
pixelVals = append(pixelVals, gVals...)
|
||||
pixelVals = append(pixelVals, bVals...)
|
||||
}
|
||||
|
||||
return pixelVals
|
||||
}
|
||||
|
||||
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
|
||||
outputSize := image.Point{p.imageSize, p.imageSize}
|
||||
|
||||
// clip values
|
||||
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
|
||||
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
|
||||
|
||||
newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
|
||||
newImage = p.pad(newImage, outputSize, aspectRatio)
|
||||
|
||||
data := p.pack(newImage, aspectRatio, mean, std)
|
||||
aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
|
||||
return data, aspectRatioIndex, nil
|
||||
}
|
||||
6
model/models/models.go
Normal file
6
model/models/models.go
Normal file
@@ -0,0 +1,6 @@
|
||||
package models
|
||||
|
||||
import (
|
||||
_ "github.com/ollama/ollama/model/models/llama"
|
||||
_ "github.com/ollama/ollama/model/models/mllama"
|
||||
)
|
||||
313
model/process_text.go
Normal file
313
model/process_text.go
Normal file
@@ -0,0 +1,313 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"iter"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/dlclark/regexp2"
|
||||
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
|
||||
)
|
||||
|
||||
type Special int32
|
||||
|
||||
const (
|
||||
SpecialBOS Special = iota
|
||||
SpecialEOS
|
||||
)
|
||||
|
||||
type TextProcessor interface {
|
||||
Encode(string) ([]int32, error)
|
||||
Decode([]int32) (string, error)
|
||||
Is(int32, Special) bool
|
||||
}
|
||||
|
||||
type Vocabulary struct {
|
||||
Values []string
|
||||
Types []uint32
|
||||
Scores []uint32
|
||||
Merges []string
|
||||
|
||||
BOS, EOS int32
|
||||
|
||||
specialOnce sync.Once
|
||||
special []string
|
||||
|
||||
valuesOnce sync.Once
|
||||
values map[string]int32
|
||||
|
||||
mergeOnce sync.Once
|
||||
merge map[string]int32
|
||||
}
|
||||
|
||||
func (v *Vocabulary) Is(id int32, special Special) bool {
|
||||
switch special {
|
||||
case SpecialBOS:
|
||||
return id == v.BOS
|
||||
case SpecialEOS:
|
||||
return id == v.EOS
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func (v *Vocabulary) Encode(s string) int32 {
|
||||
v.valuesOnce.Do(func() {
|
||||
v.values = make(map[string]int32, len(v.Values))
|
||||
for i, value := range v.Values {
|
||||
v.values[value] = int32(i)
|
||||
}
|
||||
})
|
||||
|
||||
if id, ok := v.values[s]; ok {
|
||||
return id
|
||||
}
|
||||
|
||||
return -1
|
||||
}
|
||||
|
||||
func (v *Vocabulary) Decode(id int32) string {
|
||||
return v.Values[id]
|
||||
}
|
||||
|
||||
func (v *Vocabulary) SpecialVocabulary() []string {
|
||||
v.specialOnce.Do(func() {
|
||||
for i := range v.Values {
|
||||
if v.Types[i] == 3 {
|
||||
v.special = append(v.special, v.Values[i])
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return v.special
|
||||
}
|
||||
|
||||
func (v *Vocabulary) Merge(left, right string) int {
|
||||
v.mergeOnce.Do(func() {
|
||||
v.merge = make(map[string]int32, len(v.Merges))
|
||||
for i, merge := range v.Merges {
|
||||
v.merge[merge] = int32(i)
|
||||
}
|
||||
})
|
||||
|
||||
if id, ok := v.merge[left+" "+right]; ok {
|
||||
return int(id)
|
||||
}
|
||||
|
||||
return -1
|
||||
}
|
||||
|
||||
type BytePairEncoding struct {
|
||||
pre *regexp2.Regexp
|
||||
vocab *Vocabulary
|
||||
}
|
||||
|
||||
func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
|
||||
return BytePairEncoding{
|
||||
pre: regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
|
||||
vocab: vocab,
|
||||
}
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Is(id int32, special Special) bool {
|
||||
return bpe.vocab.Is(id, special)
|
||||
}
|
||||
|
||||
func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
|
||||
return func(yield func(string) bool) {
|
||||
for m, _ := bpe.pre.FindStringMatch(s); m != nil; m, _ = bpe.pre.FindNextMatch(m) {
|
||||
if !yield(m.String()) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fragment is a string fragment and their corresponding token IDs
|
||||
type fragment struct {
|
||||
value string
|
||||
ids []int32
|
||||
}
|
||||
|
||||
// pair is a pair of runes and its rank
|
||||
type pair struct {
|
||||
a, b int
|
||||
rank int
|
||||
value string
|
||||
}
|
||||
|
||||
type merge struct {
|
||||
p, n int
|
||||
runes []rune
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
|
||||
fragments := []fragment{{value: s}}
|
||||
for _, special := range bpe.vocab.SpecialVocabulary() {
|
||||
// TODO: process special tokens concurrently
|
||||
id := bpe.vocab.Encode(special)
|
||||
for i := 0; i < len(fragments); i++ {
|
||||
frag := fragments[i]
|
||||
if len(frag.ids) > 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var middle []fragment
|
||||
switch i := strings.Index(frag.value, special); {
|
||||
case i < 0:
|
||||
middle = append(middle, frag)
|
||||
case i > 0:
|
||||
middle = append(middle, fragment{value: frag.value[:i]})
|
||||
fallthrough
|
||||
default:
|
||||
middle = append(middle, fragment{value: special, ids: []int32{id}})
|
||||
if rest := frag.value[i+len(special):]; rest != "" {
|
||||
middle = append(middle, fragment{value: rest})
|
||||
}
|
||||
}
|
||||
|
||||
fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
|
||||
}
|
||||
}
|
||||
|
||||
var ids []int32
|
||||
for _, frag := range fragments {
|
||||
if len(frag.ids) > 0 {
|
||||
ids = append(ids, frag.ids...)
|
||||
slog.Debug("encoded", "text", frag.value, "ids", frag.ids, "special", true)
|
||||
continue
|
||||
}
|
||||
|
||||
for split := range bpe.split(frag.value) {
|
||||
// TODO: process splits concurrently
|
||||
var sb strings.Builder
|
||||
for _, b := range []byte(split) {
|
||||
r := rune(b)
|
||||
switch {
|
||||
case r == 0x00ad:
|
||||
r = 0x0143
|
||||
case r <= 0x0020:
|
||||
r = r + 0x0100
|
||||
case r >= 0x007e && r <= 0x00a0:
|
||||
r = r + 0x00a2
|
||||
}
|
||||
|
||||
sb.WriteRune(r)
|
||||
}
|
||||
|
||||
// short circuit if the fragment is in the vocabulary
|
||||
if id := bpe.vocab.Encode(sb.String()); id >= 0 {
|
||||
ids = append(ids, id)
|
||||
slog.Debug("encoded", "text", sb.String(), "ids", []int32{id})
|
||||
continue
|
||||
}
|
||||
|
||||
runes := []rune(sb.String())
|
||||
merges := make([]merge, len(runes))
|
||||
for r := range runes {
|
||||
merges[r] = merge{
|
||||
p: r - 1,
|
||||
n: r + 1,
|
||||
runes: []rune{runes[r]},
|
||||
}
|
||||
}
|
||||
|
||||
pairwise := func(a, b int) *pair {
|
||||
if a < 0 || b >= len(runes) {
|
||||
return nil
|
||||
}
|
||||
|
||||
left, right := string(merges[a].runes), string(merges[b].runes)
|
||||
rank := bpe.vocab.Merge(left, right)
|
||||
if rank < 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &pair{
|
||||
a: a,
|
||||
b: b,
|
||||
rank: rank,
|
||||
value: left + right,
|
||||
}
|
||||
}
|
||||
|
||||
pairs := heap.NewWith(func(i, j *pair) int {
|
||||
return cmp.Compare(i.rank, j.rank)
|
||||
})
|
||||
|
||||
for i := range len(runes) - 1 {
|
||||
if pair := pairwise(i, i+1); pair != nil {
|
||||
pairs.Push(pair)
|
||||
}
|
||||
}
|
||||
|
||||
for !pairs.Empty() {
|
||||
pair, _ := pairs.Pop()
|
||||
|
||||
left, right := merges[pair.a], merges[pair.b]
|
||||
if len(left.runes) == 0 || len(right.runes) == 0 ||
|
||||
string(left.runes)+string(right.runes) != pair.value {
|
||||
continue
|
||||
}
|
||||
|
||||
merges[pair.a].runes = append(left.runes, right.runes...)
|
||||
merges[pair.b].runes = nil
|
||||
|
||||
merges[pair.a].n = right.n
|
||||
if right.n < len(merges) {
|
||||
merges[right.n].p = pair.a
|
||||
}
|
||||
|
||||
if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
|
||||
pairs.Push(pair)
|
||||
}
|
||||
|
||||
if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
|
||||
pairs.Push(pair)
|
||||
}
|
||||
}
|
||||
|
||||
for _, merge := range merges {
|
||||
if len(merge.runes) > 0 {
|
||||
// TODO: handle the edge case where the rune isn't in the vocabulary
|
||||
if id := bpe.vocab.Encode(string(merge.runes)); id >= 0 {
|
||||
ids = append(ids, id)
|
||||
slog.Debug("encoded", "text", string(merge.runes), "ids", []int32{id})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ids, nil
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
|
||||
var sb strings.Builder
|
||||
for _, id := range ids {
|
||||
for _, r := range bpe.vocab.Decode(id) {
|
||||
switch {
|
||||
case r == 0x0100:
|
||||
// this produces 0x00 aka NULL
|
||||
continue
|
||||
case r == 0x0143:
|
||||
r = 0x00ad
|
||||
case r > 0x0100 && r <= 0x0120:
|
||||
r = r - 0x0100
|
||||
case r > 0x0120 && r <= 0x0142:
|
||||
r = r - 0x00a2
|
||||
}
|
||||
|
||||
// NOTE: not using WriteRune here because it writes the UTF-8
|
||||
// encoding of the rune which is _not_ what we want
|
||||
if err := sb.WriteByte(byte(r)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
slog.Debug("decoded", "ids", ids, "text", sb.String())
|
||||
return sb.String(), nil
|
||||
}
|
||||
254
model/process_text_test.go
Normal file
254
model/process_text_test.go
Normal file
@@ -0,0 +1,254 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func llama(t testing.TB) BytePairEncoding {
|
||||
t.Helper()
|
||||
|
||||
f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
vocab := make(map[string]int32)
|
||||
if err := json.NewDecoder(f).Decode(&vocab); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
types := make([]uint32, len(vocab))
|
||||
tokens := make([]string, len(vocab))
|
||||
for token, id := range vocab {
|
||||
tokens[id] = token
|
||||
types[id] = 1
|
||||
}
|
||||
|
||||
for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
|
||||
if _, ok := vocab[token]; !ok {
|
||||
tokens = append(tokens, token) //nolint:makezero
|
||||
types = append(types, 3) //nolint:makezero
|
||||
vocab[token] = int32(len(vocab))
|
||||
}
|
||||
}
|
||||
|
||||
f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
merges := make([]string, 0, 50000)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
if !strings.HasPrefix(scanner.Text(), "#") {
|
||||
merges = append(merges, scanner.Text())
|
||||
}
|
||||
}
|
||||
|
||||
return NewBytePairEncoding(
|
||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||
&Vocabulary{
|
||||
Values: tokens,
|
||||
Types: types,
|
||||
Merges: merges,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
func TestLlama(t *testing.T) {
|
||||
tokenizer := llama(t)
|
||||
|
||||
t.Run("simple", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ids, err := tokenizer.Encode("hello world")
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff([]int32{15339, 1917}, ids); diff != "" {
|
||||
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||
}
|
||||
|
||||
s, err := tokenizer.Decode([]int32{15339, 1917})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if s != "hello world" {
|
||||
t.Errorf("got %q, want hello world", s)
|
||||
}
|
||||
|
||||
ids, err = tokenizer.Encode("hello <|end_of_text|>")
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff([]int32{15339, 220, 128001}, ids); diff != "" {
|
||||
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("simple repeated", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := map[string][]int32{
|
||||
strings.Repeat("0", 1): {15},
|
||||
strings.Repeat("0", 2): {410},
|
||||
strings.Repeat("0", 3): {931},
|
||||
strings.Repeat("0", 4): {931, 15},
|
||||
strings.Repeat("0", 5): {931, 410},
|
||||
strings.Repeat("0", 6): {931, 931},
|
||||
strings.Repeat("0", 7): {931, 931, 15},
|
||||
strings.Repeat("0", 8): {931, 931, 410},
|
||||
strings.Repeat("0", 9): {931, 931, 931},
|
||||
strings.Repeat("0", 10): {931, 931, 931, 15},
|
||||
strings.Repeat("0", 11): {931, 931, 931, 410},
|
||||
strings.Repeat("0", 12): {931, 931, 931, 931},
|
||||
strings.Repeat("0", 13): {931, 931, 931, 931, 15},
|
||||
strings.Repeat("0", 14): {931, 931, 931, 931, 410},
|
||||
strings.Repeat("0", 15): {931, 931, 931, 931, 931},
|
||||
strings.Repeat("0", 16): {931, 931, 931, 931, 931, 15},
|
||||
strings.Repeat("0", 17): {931, 931, 931, 931, 931, 410},
|
||||
}
|
||||
|
||||
for s, want := range cases {
|
||||
ids, err := tokenizer.Encode(s)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(want, ids); diff != "" {
|
||||
t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("basic roundtrip", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []string{
|
||||
"hello",
|
||||
"hello ",
|
||||
"hello ",
|
||||
" hello",
|
||||
" hello ",
|
||||
" hello ",
|
||||
"hello world",
|
||||
"请考试我的软件!12345",
|
||||
}
|
||||
|
||||
for _, want := range cases {
|
||||
ids, err := tokenizer.Encode(want)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if got, err := tokenizer.Decode(ids); err != nil {
|
||||
t.Fatal(err)
|
||||
} else if got != want {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("special", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := map[string][]int32{
|
||||
"<|begin_of_text|>A B!": {128000, 32, 426, 0},
|
||||
"<|begin_of_text|>A<|end_of_text|>B!": {128000, 32, 128001, 33, 0},
|
||||
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!": {128000, 32, 128001, 33, 128000, 0},
|
||||
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
|
||||
}
|
||||
|
||||
for s, want := range cases {
|
||||
ids, err := tokenizer.Encode(s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(want, ids); diff != "" {
|
||||
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("split", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := map[string][]string{
|
||||
"Hello World!": {"Hello", " World", "!"},
|
||||
"I'm don't won't": {"I", "'m", " don", "'t", " won", "'t"},
|
||||
"In 2024 there are 366 days": {"In", " ", "202", "4", " there", " are", " ", "366", " days"},
|
||||
"Hello!! ...world": {"Hello", "!!", " ...", "world"},
|
||||
"Hello World": {"Hello", " ", " World"},
|
||||
"Hello\nWorld": {"Hello", "\n", "World"},
|
||||
"Hello, WORLD!! How's it going?": {"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
|
||||
}
|
||||
|
||||
for s, want := range cases {
|
||||
got := slices.Collect(tokenizer.split(s))
|
||||
if diff := cmp.Diff(want, got); diff != "" {
|
||||
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func BenchmarkBytePairEncoding(b *testing.B) {
|
||||
tokenizer := llama(b)
|
||||
bts, err := os.ReadFile(filepath.Join("testdata", "war-and-peace.txt"))
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
for i := range 8 {
|
||||
n := min(int(math.Pow10(i)), len(bts))
|
||||
bts := bts[:n]
|
||||
b.Run("encode"+strconv.Itoa(n), func(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_, err := tokenizer.Encode(string(bts))
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("decode"+strconv.Itoa(n), func(b *testing.B) {
|
||||
ids, err := tokenizer.Encode(string(bts))
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_, err := tokenizer.Decode(ids)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("split"+strconv.Itoa(n), func(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
slices.Collect(tokenizer.split(string(bts)))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
128002
model/testdata/llama3.2/encoder.json
vendored
Normal file
128002
model/testdata/llama3.2/encoder.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
280147
model/testdata/llama3.2/vocab.bpe
vendored
Normal file
280147
model/testdata/llama3.2/vocab.bpe
vendored
Normal file
File diff suppressed because it is too large
Load Diff
63845
model/testdata/war-and-peace.txt
vendored
Normal file
63845
model/testdata/war-and-peace.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@@ -20,6 +20,8 @@ import (
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
|
||||
var finishReasonToolCalls = "tool_calls"
|
||||
|
||||
type Error struct {
|
||||
Message string `json:"message"`
|
||||
Type string `json:"type"`
|
||||
@@ -266,7 +268,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
|
||||
}
|
||||
}
|
||||
|
||||
func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
|
||||
func toChunk(id string, r api.ChatResponse, toolCallSent bool) ChatCompletionChunk {
|
||||
toolCalls := toToolCalls(r.Message.ToolCalls)
|
||||
return ChatCompletionChunk{
|
||||
Id: id,
|
||||
@@ -279,6 +281,9 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
|
||||
Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls},
|
||||
FinishReason: func(reason string) *string {
|
||||
if len(reason) > 0 {
|
||||
if toolCallSent {
|
||||
return &finishReasonToolCalls
|
||||
}
|
||||
return &reason
|
||||
}
|
||||
return nil
|
||||
@@ -585,6 +590,7 @@ type ChatWriter struct {
|
||||
stream bool
|
||||
streamOptions *StreamOptions
|
||||
id string
|
||||
toolCallSent bool
|
||||
BaseWriter
|
||||
}
|
||||
|
||||
@@ -634,11 +640,14 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
|
||||
|
||||
// chat chunk
|
||||
if w.stream {
|
||||
c := toChunk(w.id, chatResponse)
|
||||
c := toChunk(w.id, chatResponse, w.toolCallSent)
|
||||
d, err := json.Marshal(c)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if !w.toolCallSent && len(c.Choices) > 0 && len(c.Choices[0].Delta.ToolCalls) > 0 {
|
||||
w.toolCallSent = true
|
||||
}
|
||||
|
||||
w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
|
||||
_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
|
||||
|
||||
@@ -19,7 +19,7 @@ import (
|
||||
"golang.org/x/text/encoding/unicode"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
func TestParseFileFile(t *testing.T) {
|
||||
@@ -769,7 +769,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) {
|
||||
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
|
||||
}
|
||||
|
||||
func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
|
||||
func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
|
||||
t.Helper()
|
||||
|
||||
f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf")
|
||||
@@ -778,7 +778,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if err := llm.WriteGGUF(f, kv, ti); err != nil {
|
||||
if err := ggml.WriteGGUF(f, kv, ti); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Calculate sha256 of file
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package progress
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"sync"
|
||||
@@ -13,7 +14,8 @@ type State interface {
|
||||
|
||||
type Progress struct {
|
||||
mu sync.Mutex
|
||||
w io.Writer
|
||||
// buffer output to minimize flickering on all terminals
|
||||
w *bufio.Writer
|
||||
|
||||
pos int
|
||||
|
||||
@@ -22,7 +24,7 @@ type Progress struct {
|
||||
}
|
||||
|
||||
func NewProgress(w io.Writer) *Progress {
|
||||
p := &Progress{w: w}
|
||||
p := &Progress{w: bufio.NewWriter(w)}
|
||||
go p.start()
|
||||
return p
|
||||
}
|
||||
@@ -48,11 +50,14 @@ func (p *Progress) Stop() bool {
|
||||
stopped := p.stop()
|
||||
if stopped {
|
||||
fmt.Fprint(p.w, "\n")
|
||||
p.w.Flush()
|
||||
}
|
||||
return stopped
|
||||
}
|
||||
|
||||
func (p *Progress) StopAndClear() bool {
|
||||
defer p.w.Flush()
|
||||
|
||||
fmt.Fprint(p.w, "\033[?25l")
|
||||
defer fmt.Fprint(p.w, "\033[?25h")
|
||||
|
||||
@@ -81,20 +86,24 @@ func (p *Progress) render() {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
|
||||
defer p.w.Flush()
|
||||
|
||||
// eliminate flickering on terminals that support synchronized output
|
||||
fmt.Fprint(p.w, "\033[?2026h")
|
||||
defer fmt.Fprint(p.w, "\033[?2026l")
|
||||
|
||||
fmt.Fprint(p.w, "\033[?25l")
|
||||
defer fmt.Fprint(p.w, "\033[?25h")
|
||||
|
||||
// clear already rendered progress lines
|
||||
for i := range p.pos {
|
||||
if i > 0 {
|
||||
fmt.Fprint(p.w, "\033[A")
|
||||
}
|
||||
fmt.Fprint(p.w, "\033[2K\033[1G")
|
||||
// move the cursor back to the beginning
|
||||
for range p.pos - 1 {
|
||||
fmt.Fprint(p.w, "\033[A")
|
||||
}
|
||||
fmt.Fprint(p.w, "\033[1G")
|
||||
|
||||
// render progress lines
|
||||
for i, state := range p.states {
|
||||
fmt.Fprint(p.w, state.String())
|
||||
fmt.Fprint(p.w, state.String(), "\033[K")
|
||||
if i < len(p.states)-1 {
|
||||
fmt.Fprint(p.w, "\n")
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
package runner
|
||||
package common
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func findStop(sequence string, stops []string) (bool, string) {
|
||||
func FindStop(sequence string, stops []string) (bool, string) {
|
||||
for _, stop := range stops {
|
||||
if strings.Contains(sequence, stop) {
|
||||
return true, stop
|
||||
@@ -14,7 +14,7 @@ func findStop(sequence string, stops []string) (bool, string) {
|
||||
return false, ""
|
||||
}
|
||||
|
||||
func containsStopSuffix(sequence string, stops []string) bool {
|
||||
func ContainsStopSuffix(sequence string, stops []string) bool {
|
||||
for _, stop := range stops {
|
||||
for i := 1; i <= len(stop); i++ {
|
||||
if strings.HasSuffix(sequence, stop[:i]) {
|
||||
@@ -29,7 +29,7 @@ func containsStopSuffix(sequence string, stops []string) bool {
|
||||
// truncateStop removes the provided stop string from pieces,
|
||||
// returning the partial pieces with stop removed, including truncating
|
||||
// the last piece if required (and signalling if this was the case)
|
||||
func truncateStop(pieces []string, stop string) ([]string, bool) {
|
||||
func TruncateStop(pieces []string, stop string) ([]string, bool) {
|
||||
joined := strings.Join(pieces, "")
|
||||
|
||||
index := strings.Index(joined, stop)
|
||||
@@ -65,7 +65,7 @@ func truncateStop(pieces []string, stop string) ([]string, bool) {
|
||||
return result, tokenTruncated
|
||||
}
|
||||
|
||||
func incompleteUnicode(token string) bool {
|
||||
func IncompleteUnicode(token string) bool {
|
||||
incomplete := false
|
||||
|
||||
// check if there is incomplete UTF-8 character at the end
|
||||
@@ -1,4 +1,4 @@
|
||||
package runner
|
||||
package common
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
@@ -52,7 +52,7 @@ func TestTruncateStop(t *testing.T) {
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, resultTrunc := truncateStop(tt.pieces, tt.stop)
|
||||
result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
|
||||
if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
|
||||
t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
|
||||
}
|
||||
@@ -120,7 +120,7 @@ func TestIncompleteUnicode(t *testing.T) {
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := incompleteUnicode(tt.input)
|
||||
result := IncompleteUnicode(tt.input)
|
||||
if result != tt.expected {
|
||||
t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package runner
|
||||
package llamarunner
|
||||
|
||||
import (
|
||||
"errors"
|
||||
@@ -1,4 +1,4 @@
|
||||
package runner
|
||||
package llamarunner
|
||||
|
||||
import (
|
||||
"testing"
|
||||
@@ -1,4 +1,4 @@
|
||||
package runner
|
||||
package llamarunner
|
||||
|
||||
import (
|
||||
"errors"
|
||||
@@ -1,4 +1,4 @@
|
||||
package runner
|
||||
package llamarunner
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
@@ -1,4 +1,4 @@
|
||||
package runner
|
||||
package llamarunner
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -24,6 +24,7 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/llama"
|
||||
"github.com/ollama/ollama/runner/common"
|
||||
)
|
||||
|
||||
// input is an element of the prompt to process, either
|
||||
@@ -498,12 +499,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
||||
seq.pendingResponses = append(seq.pendingResponses, piece)
|
||||
sequence := strings.Join(seq.pendingResponses, "")
|
||||
|
||||
if ok, stop := findStop(sequence, seq.stop); ok {
|
||||
if ok, stop := common.FindStop(sequence, seq.stop); ok {
|
||||
slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
|
||||
|
||||
var tokenTruncated bool
|
||||
origLen := len(seq.pendingResponses)
|
||||
seq.pendingResponses, tokenTruncated = truncateStop(seq.pendingResponses, stop)
|
||||
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
|
||||
newLen := len(seq.pendingResponses)
|
||||
|
||||
// Update the cache based on the tokens that will be returned:
|
||||
@@ -524,11 +525,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
||||
continue
|
||||
}
|
||||
|
||||
if containsStopSuffix(sequence, seq.stop) {
|
||||
if common.ContainsStopSuffix(sequence, seq.stop) {
|
||||
continue
|
||||
}
|
||||
|
||||
if incompleteUnicode(sequence) {
|
||||
if common.IncompleteUnicode(sequence) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -844,8 +845,6 @@ func (s *Server) loadModel(
|
||||
threads int,
|
||||
multiUserCache bool,
|
||||
) {
|
||||
llama.BackendInit()
|
||||
|
||||
var err error
|
||||
s.model, err = llama.LoadModelFromFile(mpath, params)
|
||||
if err != nil {
|
||||
@@ -885,9 +884,6 @@ func (s *Server) loadModel(
|
||||
}
|
||||
|
||||
func Execute(args []string) error {
|
||||
if args[0] == "runner" {
|
||||
args = args[1:]
|
||||
}
|
||||
fs := flag.NewFlagSet("runner", flag.ExitOnError)
|
||||
mpath := fs.String("model", "", "Path to model binary file")
|
||||
ppath := fs.String("mmproj", "", "Path to projector binary file")
|
||||
@@ -934,6 +930,8 @@ func Execute(args []string) error {
|
||||
})
|
||||
slog.SetDefault(slog.New(handler))
|
||||
slog.Info("starting go runner")
|
||||
|
||||
llama.BackendInit()
|
||||
slog.Info("system", "info", llama.PrintSystemInfo(), "threads", *threads)
|
||||
|
||||
server := &Server{
|
||||
280
runner/ollamarunner/cache.go
Normal file
280
runner/ollamarunner/cache.go
Normal file
@@ -0,0 +1,280 @@
|
||||
package ollamarunner
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type InputCache struct {
|
||||
// context window size (per slot)
|
||||
numCtx int32
|
||||
|
||||
// does the cache store data or do we need to always send the full input?
|
||||
// note that when enabled is false the underlying cache may either be nil
|
||||
// or a non-nil dummy that doesn't actually store anything
|
||||
enabled bool
|
||||
|
||||
// individual KV caches
|
||||
slots []InputCacheSlot
|
||||
|
||||
// optimize cache eviction for multiple users
|
||||
multiUserCache bool
|
||||
|
||||
cache kvcache.Cache
|
||||
}
|
||||
|
||||
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) {
|
||||
if kvSize/int32(numSlots) < 1 {
|
||||
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
|
||||
}
|
||||
|
||||
slots := make([]InputCacheSlot, numSlots)
|
||||
|
||||
for i := range slots {
|
||||
slots[i] = InputCacheSlot{
|
||||
Id: i,
|
||||
Inputs: make([]input, 0),
|
||||
}
|
||||
}
|
||||
|
||||
cache := model.Config().Cache
|
||||
if cache != nil {
|
||||
cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), kvSize)
|
||||
}
|
||||
|
||||
return &InputCache{
|
||||
numCtx: kvSize / int32(numSlots),
|
||||
enabled: cache != nil,
|
||||
slots: slots,
|
||||
multiUserCache: multiUserCache,
|
||||
cache: cache,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func kvCacheTypeFromStr(s string) ml.DType {
|
||||
switch s {
|
||||
case "q8_0":
|
||||
panic("kv cache quantization not yet implemented")
|
||||
case "q4_0":
|
||||
panic("kv cache quantization not yet implemented")
|
||||
default:
|
||||
return ml.DTypeF16
|
||||
}
|
||||
}
|
||||
|
||||
func (c *InputCache) Close() {
|
||||
c.cache.Close()
|
||||
}
|
||||
|
||||
// Locking: Operations on InputCacheSlot (including finding one
|
||||
// through LoadCacheSlot) require a lock to be be held that serializes
|
||||
// these operations with each other and processBatch
|
||||
|
||||
type InputCacheSlot struct {
|
||||
// Index in the KV cache
|
||||
Id int
|
||||
|
||||
// Inputs that are stored in the KV cache
|
||||
Inputs []input
|
||||
|
||||
// is this cache actively being processed as part of a sequence?
|
||||
InUse bool
|
||||
|
||||
// last time this cache was used (as of start of processing)
|
||||
lastUsed time.Time
|
||||
}
|
||||
|
||||
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
|
||||
var slot *InputCacheSlot
|
||||
var numPast int32
|
||||
var err error
|
||||
|
||||
// In single-user scenarios, the longest cache slot works fine for getting good input
|
||||
// cache hit rates and it keeps the footprint of the cache small, which improves throughput.
|
||||
// For multiple users, the "best" cache slot produces better input cache hit rates
|
||||
// at the cost of worse performance when we miss the input cache.
|
||||
if !c.multiUserCache {
|
||||
slot, numPast, err = c.findLongestCacheSlot(prompt)
|
||||
} else {
|
||||
slot, numPast, err = c.findBestCacheSlot(prompt)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
if !cachePrompt {
|
||||
numPast = 0
|
||||
}
|
||||
|
||||
slot.InUse = true
|
||||
slot.lastUsed = time.Now()
|
||||
|
||||
if numPast == int32(len(prompt)) {
|
||||
// Leave one input to sample so we can get a response
|
||||
numPast--
|
||||
}
|
||||
|
||||
if c.cache != nil {
|
||||
err = c.cache.Remove(slot.Id, numPast, math.MaxInt32)
|
||||
if err != nil {
|
||||
// Some models don't support partial erasure
|
||||
err = c.cache.Remove(slot.Id, 0, math.MaxInt32)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
numPast = 0
|
||||
}
|
||||
}
|
||||
|
||||
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
|
||||
"used", numPast, "remaining", int32(len(prompt))-numPast)
|
||||
|
||||
prompt = prompt[numPast:]
|
||||
slot.Inputs = slot.Inputs[:numPast]
|
||||
|
||||
return slot, prompt, nil
|
||||
}
|
||||
|
||||
func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
|
||||
longest := int32(-1)
|
||||
var longestSlot *InputCacheSlot
|
||||
|
||||
for i, s := range c.slots {
|
||||
if s.InUse {
|
||||
continue
|
||||
}
|
||||
|
||||
count := countCommonPrefix(s.Inputs, prompt)
|
||||
if count > longest {
|
||||
longest = count
|
||||
longestSlot = &c.slots[i]
|
||||
}
|
||||
}
|
||||
|
||||
if longestSlot == nil {
|
||||
return nil, 0, errors.New("no available cache slots")
|
||||
}
|
||||
|
||||
return longestSlot, longest, nil
|
||||
}
|
||||
|
||||
func (c *InputCache) findBestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
|
||||
oldest := time.Now()
|
||||
var oldestSlot *InputCacheSlot
|
||||
|
||||
longest := int32(-1)
|
||||
var longestSlot *InputCacheSlot
|
||||
|
||||
for i, s := range c.slots {
|
||||
count := countCommonPrefix(s.Inputs, prompt)
|
||||
if count > longest {
|
||||
longest = count
|
||||
longestSlot = &c.slots[i]
|
||||
}
|
||||
|
||||
if s.lastUsed.Compare(oldest) < 0 && !s.InUse {
|
||||
oldest = s.lastUsed
|
||||
oldestSlot = &c.slots[i]
|
||||
}
|
||||
}
|
||||
|
||||
if longest == int32(len(longestSlot.Inputs)) && !longestSlot.InUse {
|
||||
return longestSlot, longest, nil
|
||||
}
|
||||
|
||||
if oldestSlot.InUse {
|
||||
return nil, 0, errors.New("no available cache slots")
|
||||
}
|
||||
|
||||
if len(oldestSlot.Inputs) != 0 {
|
||||
slog.Debug("evicting cache slot", "id", oldestSlot.Id, "inputs", len(oldestSlot.Inputs),
|
||||
"used", oldestSlot.lastUsed)
|
||||
}
|
||||
|
||||
if longest > 0 && longestSlot != oldestSlot {
|
||||
slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
|
||||
len(longestSlot.Inputs))
|
||||
oldestSlot.Inputs = make([]input, longest)
|
||||
copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
|
||||
if c.cache != nil {
|
||||
c.cache.CopyPrefix(longestSlot.Id, oldestSlot.Id, longest)
|
||||
}
|
||||
}
|
||||
|
||||
return oldestSlot, longest, nil
|
||||
}
|
||||
|
||||
func countCommonPrefix(a []input, b []input) int32 {
|
||||
var count int32
|
||||
|
||||
for i := range a {
|
||||
if i >= len(b) {
|
||||
break
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(a[i], b[i]) {
|
||||
break
|
||||
}
|
||||
|
||||
count++
|
||||
}
|
||||
|
||||
return count
|
||||
}
|
||||
|
||||
func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
|
||||
targetFree := (c.numCtx - numKeep) / 2
|
||||
targetFree = max(targetFree, 1)
|
||||
|
||||
currentFree := c.numCtx - inputLen
|
||||
discard := targetFree - currentFree
|
||||
|
||||
if discard < 0 {
|
||||
discard = 0
|
||||
}
|
||||
|
||||
return discard
|
||||
}
|
||||
|
||||
// Frees up space in the KV cache by deleting the oldest half of history and shifting
|
||||
// the newest half into that space (saving numKeep inputs at the beginning).
|
||||
//
|
||||
// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
|
||||
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
|
||||
if numKeep >= c.numCtx {
|
||||
return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
|
||||
}
|
||||
|
||||
inputLen := int32(len(slot.Inputs))
|
||||
discard := c.ShiftDiscard(inputLen, numKeep)
|
||||
|
||||
if discard <= 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
|
||||
"keep", numKeep, "discard", discard)
|
||||
|
||||
// TODO (jessegross): KV cache removal can fail for certain types of models
|
||||
if c.cache != nil {
|
||||
err := c.cache.Remove(slot.Id, numKeep, numKeep+discard)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v): %w", slot.Id, numKeep, discard, err)
|
||||
}
|
||||
}
|
||||
|
||||
for i := numKeep + discard; i < inputLen; i++ {
|
||||
slot.Inputs[i-discard] = slot.Inputs[i]
|
||||
}
|
||||
slot.Inputs = slot.Inputs[:inputLen-discard]
|
||||
|
||||
return nil
|
||||
}
|
||||
291
runner/ollamarunner/cache_test.go
Normal file
291
runner/ollamarunner/cache_test.go
Normal file
@@ -0,0 +1,291 @@
|
||||
package ollamarunner
|
||||
|
||||
import (
|
||||
"image"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestCountCommon(t *testing.T) {
|
||||
imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
|
||||
imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
|
||||
imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
t1 []input
|
||||
t2 []input
|
||||
expected int32
|
||||
}{
|
||||
{
|
||||
name: "Equal",
|
||||
t1: []input{{token: 1}, {token: 2}, {token: 3}},
|
||||
t2: []input{{token: 1}, {token: 2}, {token: 3}},
|
||||
expected: 3,
|
||||
},
|
||||
{
|
||||
name: "Prefix",
|
||||
t1: []input{{token: 1}},
|
||||
t2: []input{{token: 1}, {token: 2}, {token: 3}},
|
||||
expected: 1,
|
||||
},
|
||||
{
|
||||
name: "Image Prefix",
|
||||
t1: []input{{image: imgA}},
|
||||
t2: []input{{image: imgA}, {image: imgB}, {image: imgC}},
|
||||
expected: 1,
|
||||
},
|
||||
{
|
||||
name: "Mixed",
|
||||
t1: []input{{token: 1}, {image: imgA}},
|
||||
t2: []input{{token: 1}, {image: imgA}, {token: 5}},
|
||||
expected: 2,
|
||||
},
|
||||
{
|
||||
name: "Empty",
|
||||
t1: []input{},
|
||||
t2: []input{{token: 1}, {token: 2}, {token: 3}},
|
||||
expected: 0,
|
||||
},
|
||||
{
|
||||
name: "Both Empty",
|
||||
t1: []input{},
|
||||
t2: []input{},
|
||||
expected: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := countCommonPrefix(tt.t1, tt.t2)
|
||||
if result != tt.expected {
|
||||
t.Errorf("countCommonPrefix(%v, %v): have %v; want %v", tt.t1, tt.t2, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindCacheSlot(t *testing.T) {
|
||||
type expected struct {
|
||||
result int
|
||||
len int32
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cache InputCache
|
||||
prompt []input
|
||||
longest expected
|
||||
best expected
|
||||
}{
|
||||
{
|
||||
name: "Empty",
|
||||
cache: InputCache{slots: []InputCacheSlot{
|
||||
{
|
||||
Id: 0,
|
||||
Inputs: []input{},
|
||||
InUse: false,
|
||||
lastUsed: time.Time{},
|
||||
},
|
||||
{
|
||||
Id: 1,
|
||||
Inputs: []input{},
|
||||
InUse: false,
|
||||
lastUsed: time.Time{},
|
||||
},
|
||||
}},
|
||||
prompt: []input{{token: 1}},
|
||||
longest: expected{result: 0, len: 0},
|
||||
best: expected{result: 0, len: 0},
|
||||
},
|
||||
{
|
||||
name: "Extend",
|
||||
cache: InputCache{slots: []InputCacheSlot{
|
||||
{
|
||||
Id: 0,
|
||||
Inputs: []input{{token: 1}},
|
||||
InUse: false,
|
||||
lastUsed: time.Now().Add(-time.Second),
|
||||
},
|
||||
{
|
||||
Id: 1,
|
||||
Inputs: []input{{token: 1}, {token: 2}},
|
||||
InUse: false,
|
||||
lastUsed: time.Now().Add(-2 * time.Second),
|
||||
},
|
||||
}},
|
||||
prompt: []input{{token: 1}, {token: 2}},
|
||||
longest: expected{result: 1, len: 2},
|
||||
best: expected{result: 1, len: 2},
|
||||
},
|
||||
{
|
||||
name: "New",
|
||||
cache: InputCache{slots: []InputCacheSlot{
|
||||
{
|
||||
Id: 0,
|
||||
Inputs: []input{{token: 1}, {token: 2}},
|
||||
InUse: false,
|
||||
lastUsed: time.Now().Add(-time.Second),
|
||||
},
|
||||
{
|
||||
Id: 1,
|
||||
Inputs: []input{},
|
||||
InUse: false,
|
||||
lastUsed: time.Time{},
|
||||
},
|
||||
}},
|
||||
prompt: []input{{token: 2}},
|
||||
longest: expected{result: 0, len: 0},
|
||||
best: expected{result: 1, len: 0},
|
||||
},
|
||||
{
|
||||
name: "Fork",
|
||||
cache: InputCache{
|
||||
slots: []InputCacheSlot{
|
||||
{
|
||||
Id: 0,
|
||||
Inputs: []input{{token: 1}, {token: 2}},
|
||||
InUse: false,
|
||||
lastUsed: time.Now().Add(-time.Second),
|
||||
},
|
||||
{
|
||||
Id: 1,
|
||||
Inputs: []input{},
|
||||
InUse: false,
|
||||
lastUsed: time.Time{},
|
||||
},
|
||||
},
|
||||
},
|
||||
prompt: []input{{token: 1}},
|
||||
longest: expected{result: 0, len: 1},
|
||||
best: expected{result: 1, len: 1},
|
||||
},
|
||||
{
|
||||
name: "Evict",
|
||||
cache: InputCache{slots: []InputCacheSlot{
|
||||
{
|
||||
Id: 0,
|
||||
Inputs: []input{{token: 1}},
|
||||
InUse: false,
|
||||
lastUsed: time.Now().Add(-time.Second),
|
||||
},
|
||||
{
|
||||
Id: 1,
|
||||
Inputs: []input{{token: 1}, {token: 2}},
|
||||
InUse: false,
|
||||
lastUsed: time.Now().Add(-2 * time.Second),
|
||||
},
|
||||
}},
|
||||
prompt: []input{{token: 2}, {token: 3}},
|
||||
longest: expected{result: 0, len: 0},
|
||||
best: expected{result: 1, len: 0},
|
||||
},
|
||||
{
|
||||
name: "In use",
|
||||
cache: InputCache{slots: []InputCacheSlot{
|
||||
{
|
||||
Id: 0,
|
||||
Inputs: []input{{token: 1}, {token: 2}},
|
||||
InUse: true,
|
||||
lastUsed: time.Now().Add(-time.Second),
|
||||
},
|
||||
{
|
||||
Id: 1,
|
||||
Inputs: []input{{token: 1}},
|
||||
InUse: false,
|
||||
lastUsed: time.Now().Add(-2 * time.Second),
|
||||
},
|
||||
}},
|
||||
prompt: []input{{token: 1}, {token: 2}},
|
||||
longest: expected{result: 1, len: 1},
|
||||
best: expected{result: 1, len: 2},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run("Longest-"+tt.name, func(t *testing.T) {
|
||||
result, resultLen, err := tt.cache.findLongestCacheSlot(tt.prompt)
|
||||
if err != nil {
|
||||
t.Errorf("findLongestCacheSlot: err %v", err)
|
||||
} else if result.Id != tt.longest.result || resultLen != tt.longest.len {
|
||||
t.Errorf("findLongestCacheSlot: slot have %v, want %v len have %v, want %v",
|
||||
result.Id, tt.longest.result, resultLen, tt.longest.len)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run("Best-"+tt.name, func(t *testing.T) {
|
||||
result, resultLen, err := tt.cache.findBestCacheSlot(tt.prompt)
|
||||
if err != nil {
|
||||
t.Errorf("findBestCacheSlot: err %v", err)
|
||||
} else if result.Id != tt.best.result || resultLen != tt.best.len {
|
||||
t.Errorf("findBestCacheSlot: slot have %v, want %v len have %v, want %v",
|
||||
result.Id, tt.best.result, resultLen, tt.best.len)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestShiftDiscard(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
numCtx int32
|
||||
numKeep int32
|
||||
inputLen int32
|
||||
expected int32
|
||||
}{
|
||||
{
|
||||
name: "Shift",
|
||||
numCtx: 2048,
|
||||
numKeep: 5,
|
||||
inputLen: 2048,
|
||||
expected: 1021,
|
||||
},
|
||||
{
|
||||
name: "Max Keep",
|
||||
numCtx: 2048,
|
||||
numKeep: 2047,
|
||||
inputLen: 2048,
|
||||
expected: 1,
|
||||
},
|
||||
{
|
||||
name: "No Keep",
|
||||
numCtx: 2048,
|
||||
numKeep: 0,
|
||||
inputLen: 2048,
|
||||
expected: 1024,
|
||||
},
|
||||
{
|
||||
name: "Truncate",
|
||||
numCtx: 2048,
|
||||
numKeep: 5,
|
||||
inputLen: 5000,
|
||||
expected: 3973,
|
||||
},
|
||||
{
|
||||
name: "Truncate Keep",
|
||||
numCtx: 2048,
|
||||
numKeep: 2047,
|
||||
inputLen: 5000,
|
||||
expected: 2953,
|
||||
},
|
||||
{
|
||||
name: "No Op",
|
||||
numCtx: 2048,
|
||||
numKeep: 5,
|
||||
inputLen: 512,
|
||||
expected: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
c := InputCache{numCtx: tt.numCtx}
|
||||
result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
|
||||
if result != tt.expected {
|
||||
t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
946
runner/ollamarunner/runner.go
Normal file
946
runner/ollamarunner/runner.go
Normal file
@@ -0,0 +1,946 @@
|
||||
package ollamarunner
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"image"
|
||||
"log"
|
||||
"log/slog"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/sync/semaphore"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/runner/common"
|
||||
"github.com/ollama/ollama/sample"
|
||||
|
||||
_ "github.com/ollama/ollama/model/models"
|
||||
)
|
||||
|
||||
// input is an element of the prompt to process, either a token or an image
|
||||
type input struct {
|
||||
token int32
|
||||
|
||||
image image.Image
|
||||
}
|
||||
|
||||
type Sequence struct {
|
||||
// batch index
|
||||
iBatch int
|
||||
|
||||
// prompt inputs left to evaluate
|
||||
inputs []input
|
||||
|
||||
// inputs that have been added to a batch but not yet submitted to Forward
|
||||
pendingInputs []input
|
||||
|
||||
// tokens that have been generated but not returned yet (e.g. for stop sequences)
|
||||
pendingResponses []string
|
||||
|
||||
// input cache being used by this sequence
|
||||
cache *InputCacheSlot
|
||||
|
||||
// channel to send responses over
|
||||
responses chan string
|
||||
|
||||
// channel to stop decoding (such as if the remote connection is closed)
|
||||
quit chan bool
|
||||
|
||||
// number of tokens to predict
|
||||
numPredict int
|
||||
|
||||
// set of samplers to run on generated logits
|
||||
samplers []sample.Sampler
|
||||
|
||||
// channel to send back the embedding if embedding only
|
||||
embedding chan []float32
|
||||
|
||||
// stop sequences
|
||||
stop []string
|
||||
|
||||
// number of inputs to keep at the beginning when shifting context window
|
||||
numKeep int32
|
||||
|
||||
// true if an embedding are to be returned instead of text generation
|
||||
embeddingOnly bool
|
||||
|
||||
doneReason string
|
||||
|
||||
// Metrics
|
||||
startProcessingTime time.Time
|
||||
startGenerationTime time.Time
|
||||
numPredicted int
|
||||
numPromptInputs int
|
||||
}
|
||||
|
||||
type NewSequenceParams struct {
|
||||
numPredict int
|
||||
stop []string
|
||||
numKeep int32
|
||||
samplers []sample.Sampler
|
||||
embedding bool
|
||||
}
|
||||
|
||||
func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
|
||||
s.ready.Wait()
|
||||
|
||||
startTime := time.Now()
|
||||
|
||||
inputs, err := s.inputs(prompt, images)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to process inputs: %w", err)
|
||||
} else if len(inputs) == 0 {
|
||||
return nil, errors.New("no input provided")
|
||||
}
|
||||
|
||||
if params.numKeep < 0 {
|
||||
params.numKeep = int32(len(inputs))
|
||||
}
|
||||
|
||||
// Ensure that at least 1 input can be discarded during shift
|
||||
params.numKeep = min(params.numKeep, s.cache.numCtx-1)
|
||||
|
||||
if int32(len(inputs)) > s.cache.numCtx {
|
||||
discard := int32(len(inputs)) - s.cache.numCtx
|
||||
newInputs := inputs[:params.numKeep]
|
||||
newInputs = append(newInputs, inputs[params.numKeep+discard:]...)
|
||||
|
||||
slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", params.numKeep, "new", len(newInputs))
|
||||
inputs = newInputs
|
||||
}
|
||||
|
||||
// TODO(jessegross): Ingest cached history for grammar
|
||||
|
||||
return &Sequence{
|
||||
inputs: inputs,
|
||||
numPromptInputs: len(inputs),
|
||||
startProcessingTime: startTime,
|
||||
numPredict: params.numPredict,
|
||||
pendingResponses: make([]string, 0),
|
||||
responses: make(chan string, 100),
|
||||
quit: make(chan bool, 1),
|
||||
embedding: make(chan []float32, 1),
|
||||
samplers: params.samplers,
|
||||
embeddingOnly: params.embedding,
|
||||
stop: params.stop,
|
||||
numKeep: params.numKeep,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// inputs processes the prompt and images into a list of inputs
|
||||
// by splitting the prompt on [img-<n>] tags, tokenizing text and
|
||||
// decoding images
|
||||
func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
|
||||
var inputs []input
|
||||
var parts []string
|
||||
var matches [][]string
|
||||
|
||||
// TODO(jessegross): This can sometimes trigger for matching text in the
|
||||
// user's prompt. We previously tried to avoid it by only looking for images
|
||||
// on image models. We don't have a clear indication now but it would be better
|
||||
// to properly escape it in any case.
|
||||
re := regexp.MustCompile(`\[img-(\d+)\]`)
|
||||
parts = re.Split(prompt, -1)
|
||||
matches = re.FindAllStringSubmatch(prompt, -1)
|
||||
|
||||
for i, part := range parts {
|
||||
// text - tokenize
|
||||
tokens, err := s.model.(model.TextProcessor).Encode(part)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, t := range tokens {
|
||||
inputs = append(inputs, input{token: t})
|
||||
}
|
||||
|
||||
// image - decode and store
|
||||
if i < len(matches) {
|
||||
n, _ := strconv.Atoi(matches[i][1])
|
||||
|
||||
imageIndex := -1
|
||||
for j := range images {
|
||||
if images[j].ID == n {
|
||||
imageIndex = j
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if imageIndex < 0 {
|
||||
return nil, fmt.Errorf("invalid image index: %d", n)
|
||||
}
|
||||
|
||||
image, _, err := image.Decode(bytes.NewReader(images[imageIndex].Data))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
inputs = append(inputs, input{image: image})
|
||||
}
|
||||
}
|
||||
|
||||
return inputs, nil
|
||||
}
|
||||
|
||||
type Server struct {
|
||||
// is the server ready to process requests?
|
||||
// protects access to model and image
|
||||
ready sync.WaitGroup
|
||||
|
||||
// loaded model
|
||||
model model.Model
|
||||
|
||||
// status for external health reporting - loading, ready to serve, etc.
|
||||
status ServerStatus
|
||||
|
||||
// current progress on loading the model
|
||||
progress float32
|
||||
|
||||
// number of simultaneous requests to handle
|
||||
parallel int
|
||||
|
||||
// maximum number of elements in a batch (per sequence)
|
||||
// TODO (jmorganca): make this n_batch
|
||||
batchSize int
|
||||
|
||||
// protects access to everything below this line
|
||||
// this is context state needed for decoding
|
||||
mu sync.Mutex
|
||||
|
||||
// indicates that data is ready for processing
|
||||
cond *sync.Cond
|
||||
|
||||
// the list of simultaneous sequences being evaluated
|
||||
seqs []*Sequence
|
||||
|
||||
// seqs can have a maximum of parallel entries, which
|
||||
// is enfoced by seqSem
|
||||
seqsSem *semaphore.Weighted
|
||||
|
||||
// KV cache
|
||||
cache *InputCache
|
||||
|
||||
// next sequence for prompt processing to avoid starvation
|
||||
nextSeq int
|
||||
}
|
||||
|
||||
func (s *Server) allNil() bool {
|
||||
for _, item := range s.seqs {
|
||||
if item != nil {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func flushPending(seq *Sequence) bool {
|
||||
joined := strings.Join(seq.pendingResponses, "")
|
||||
seq.pendingResponses = []string{}
|
||||
|
||||
// Check if there are any partial UTF-8 characters remaining.
|
||||
// We already check and queue as we are generating but some may
|
||||
// still make it here:
|
||||
// - Sequence is ending, e.g. generation limit has been hit
|
||||
// - Invalid characters in the middle of a string
|
||||
// This is a stricter check to ensure we never output invalid Unicode.
|
||||
for !utf8.ValidString(joined) {
|
||||
joined = joined[:len(joined)-1]
|
||||
}
|
||||
|
||||
if len(joined) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
select {
|
||||
case seq.responses <- joined:
|
||||
return true
|
||||
case <-seq.quit:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) removeSequence(seqIndex int, reason string) {
|
||||
seq := s.seqs[seqIndex]
|
||||
|
||||
flushPending(seq)
|
||||
seq.doneReason = reason
|
||||
close(seq.responses)
|
||||
close(seq.embedding)
|
||||
seq.cache.InUse = false
|
||||
s.seqs[seqIndex] = nil
|
||||
s.seqsSem.Release(1)
|
||||
}
|
||||
|
||||
func (s *Server) run(ctx context.Context) {
|
||||
s.ready.Wait()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
err := s.processBatch()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) processBatch() error {
|
||||
s.mu.Lock()
|
||||
for s.allNil() {
|
||||
s.cond.Wait() // Wait until an item is added
|
||||
}
|
||||
defer s.mu.Unlock()
|
||||
|
||||
var options model.Options
|
||||
imgSeq := -1
|
||||
|
||||
seqIdx := s.nextSeq - 1
|
||||
for range s.seqs {
|
||||
seqIdx = (seqIdx + 1) % len(s.seqs)
|
||||
seq := s.seqs[seqIdx]
|
||||
|
||||
if seq == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// if past the num predict limit
|
||||
if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
|
||||
s.removeSequence(seqIdx, "limit")
|
||||
continue
|
||||
}
|
||||
|
||||
if !s.cache.enabled {
|
||||
seq.inputs = append(seq.cache.Inputs, seq.inputs...)
|
||||
seq.cache.Inputs = []input{}
|
||||
}
|
||||
|
||||
for i, input := range seq.inputs {
|
||||
if int32(len(seq.cache.Inputs)+len(seq.pendingInputs)+1) > s.cache.numCtx {
|
||||
if len(seq.pendingInputs) == 0 {
|
||||
err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if i >= s.batchSize {
|
||||
break
|
||||
}
|
||||
|
||||
// TODO(jessegross): Image inputs need to be rethought - it's
|
||||
// it doesn't work well for different types of models or multiple sequences
|
||||
if input.image != nil {
|
||||
if len(seq.pendingInputs) != len(options.Images) {
|
||||
break
|
||||
}
|
||||
|
||||
if imgSeq != seqIdx && imgSeq != -1 {
|
||||
s.nextSeq = seqIdx
|
||||
break
|
||||
}
|
||||
|
||||
imgSeq = seqIdx
|
||||
options.Images = append(options.Images, input.image)
|
||||
seq.pendingInputs = append(seq.pendingInputs, input)
|
||||
continue
|
||||
}
|
||||
|
||||
options.Inputs = append(options.Inputs, input.token)
|
||||
options.Positions = append(options.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
|
||||
options.Sequences = append(options.Sequences, seq.cache.Id)
|
||||
|
||||
seq.iBatch = len(options.Outputs)
|
||||
if i+1 == len(seq.inputs) {
|
||||
options.Outputs = append(options.Outputs, int32(len(options.Inputs)-1))
|
||||
}
|
||||
seq.pendingInputs = append(seq.pendingInputs, input)
|
||||
}
|
||||
|
||||
seq.inputs = seq.inputs[len(seq.pendingInputs):]
|
||||
}
|
||||
|
||||
if len(options.Inputs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
ctx := s.model.Backend().NewContext()
|
||||
defer ctx.Close()
|
||||
|
||||
modelOutput, err := model.Forward(ctx, s.model, options)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to decode batch: %w", err)
|
||||
}
|
||||
|
||||
f32s := modelOutput.Floats()
|
||||
|
||||
// TODO(jessegross): This will no longer be necessary once the sampling interface takes f32s
|
||||
logits := make([]float64, len(f32s))
|
||||
for i, f32 := range f32s {
|
||||
logits[i] = float64(f32)
|
||||
}
|
||||
|
||||
for i, seq := range s.seqs {
|
||||
if seq == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// After calling Forward, pending inputs are now in the cache
|
||||
if len(seq.pendingInputs) > 0 {
|
||||
seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
|
||||
seq.pendingInputs = []input{}
|
||||
}
|
||||
|
||||
// don't sample prompt processing
|
||||
if len(seq.inputs) != 0 {
|
||||
if !s.cache.enabled {
|
||||
return errors.New("caching disabled but unable to fit entire input in a batch")
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
seq.numPredicted++
|
||||
if seq.numPredicted == 1 {
|
||||
seq.startGenerationTime = time.Now()
|
||||
}
|
||||
|
||||
// if done processing the prompt, generate an embedding and return
|
||||
if seq.embeddingOnly {
|
||||
// TODO(jessegross): Embedding support
|
||||
s.removeSequence(i, "")
|
||||
continue
|
||||
}
|
||||
|
||||
// sample a token
|
||||
vocabSize := len(f32s) / len(options.Outputs)
|
||||
tokens, err := sample.Sample(logits[seq.iBatch*vocabSize:(seq.iBatch+1)*vocabSize], seq.samplers...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// TODO(jessegross): Sampler will output a single int32 in the future
|
||||
token := int32(tokens[0])
|
||||
|
||||
// if it's an end of sequence token, break
|
||||
if s.model.(model.TextProcessor).Is(token, model.SpecialEOS) {
|
||||
// TODO (jmorganca): we should send this back
|
||||
// as it's important for the /api/generate context
|
||||
// seq.responses <- piece
|
||||
|
||||
s.removeSequence(i, "stop")
|
||||
continue
|
||||
}
|
||||
|
||||
piece, err := s.model.(model.TextProcessor).Decode([]int32{token})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
seq.inputs = []input{{token: token}}
|
||||
|
||||
seq.pendingResponses = append(seq.pendingResponses, piece)
|
||||
sequence := strings.Join(seq.pendingResponses, "")
|
||||
|
||||
if ok, stop := common.FindStop(sequence, seq.stop); ok {
|
||||
slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
|
||||
|
||||
var tokenTruncated bool
|
||||
origLen := len(seq.pendingResponses)
|
||||
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
|
||||
newLen := len(seq.pendingResponses)
|
||||
|
||||
// Update the cache based on the tokens that will be returned:
|
||||
// - We have 1 token more than is currently in the cache because
|
||||
// the last one generated wasn't submitted to Decode
|
||||
// - Remove any stop sequences that we stripped out
|
||||
// - If truncateStop removed a portion of a token, drop that
|
||||
// - As defense-in-depth, if truncatedToken didn't find a stop token
|
||||
// remove the extra one that we added to the cache len
|
||||
tokenLen := len(seq.cache.Inputs) + 1
|
||||
tokenLen -= origLen - newLen
|
||||
if tokenTruncated || origLen == newLen {
|
||||
tokenLen--
|
||||
}
|
||||
seq.cache.Inputs = seq.cache.Inputs[:tokenLen]
|
||||
|
||||
s.removeSequence(i, "stop")
|
||||
continue
|
||||
}
|
||||
|
||||
if common.ContainsStopSuffix(sequence, seq.stop) {
|
||||
continue
|
||||
}
|
||||
|
||||
if common.IncompleteUnicode(sequence) {
|
||||
continue
|
||||
}
|
||||
|
||||
if !flushPending(seq) {
|
||||
s.removeSequence(i, "connection")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO (jmorganca): use structs from the api package to avoid duplication
|
||||
// this way the api acts as a proxy instead of using a different api for the
|
||||
// runner
|
||||
type Options struct {
|
||||
api.Runner
|
||||
|
||||
NumKeep int `json:"n_keep"`
|
||||
Seed int `json:"seed"`
|
||||
NumPredict int `json:"n_predict"`
|
||||
TopK int `json:"top_k"`
|
||||
TopP float32 `json:"top_p"`
|
||||
MinP float32 `json:"min_p"`
|
||||
TypicalP float32 `json:"typical_p"`
|
||||
RepeatLastN int `json:"repeat_last_n"`
|
||||
Temperature float32 `json:"temperature"`
|
||||
RepeatPenalty float32 `json:"repeat_penalty"`
|
||||
PresencePenalty float32 `json:"presence_penalty"`
|
||||
FrequencyPenalty float32 `json:"frequency_penalty"`
|
||||
Mirostat int `json:"mirostat"`
|
||||
MirostatTau float32 `json:"mirostat_tau"`
|
||||
MirostatEta float32 `json:"mirostat_eta"`
|
||||
Stop []string `json:"stop"`
|
||||
}
|
||||
|
||||
type ImageData struct {
|
||||
Data []byte `json:"data"`
|
||||
ID int `json:"id"`
|
||||
AspectRatioID int `json:"aspect_ratio_id"`
|
||||
}
|
||||
|
||||
type CompletionRequest struct {
|
||||
Prompt string `json:"prompt"`
|
||||
Images []ImageData `json:"image_data"`
|
||||
Grammar string `json:"grammar"`
|
||||
CachePrompt bool `json:"cache_prompt"`
|
||||
|
||||
Options
|
||||
}
|
||||
|
||||
type Timings struct {
|
||||
PredictedN int `json:"predicted_n"`
|
||||
PredictedMS float64 `json:"predicted_ms"`
|
||||
PromptN int `json:"prompt_n"`
|
||||
PromptMS float64 `json:"prompt_ms"`
|
||||
}
|
||||
|
||||
type CompletionResponse struct {
|
||||
Content string `json:"content"`
|
||||
Stop bool `json:"stop"`
|
||||
|
||||
Model string `json:"model,omitempty"`
|
||||
Prompt string `json:"prompt,omitempty"`
|
||||
StoppedLimit bool `json:"stopped_limit,omitempty"`
|
||||
PredictedN int `json:"predicted_n,omitempty"`
|
||||
PredictedMS float64 `json:"predicted_ms,omitempty"`
|
||||
PromptN int `json:"prompt_n,omitempty"`
|
||||
PromptMS float64 `json:"prompt_ms,omitempty"`
|
||||
|
||||
Timings Timings `json:"timings"`
|
||||
}
|
||||
|
||||
func getSamplers(_ CompletionRequest) []sample.Sampler {
|
||||
// TODO(jessegross): Waiting for sampling code
|
||||
|
||||
/*samplingParams.TopK = req.TopK
|
||||
samplingParams.TopP = req.TopP
|
||||
samplingParams.MinP = req.MinP
|
||||
samplingParams.TypicalP = req.TypicalP
|
||||
samplingParams.Temp = req.Temperature
|
||||
samplingParams.RepeatLastN = req.RepeatLastN
|
||||
samplingParams.PenaltyRepeat = req.RepeatPenalty
|
||||
samplingParams.PenaltyFreq = req.FrequencyPenalty
|
||||
samplingParams.PenaltyPresent = req.PresencePenalty
|
||||
samplingParams.Mirostat = req.Mirostat
|
||||
samplingParams.MirostatTau = req.MirostatTau
|
||||
samplingParams.MirostatEta = req.MirostatEta
|
||||
samplingParams.Seed = uint32(req.Seed)
|
||||
samplingParams.Grammar = req.Grammar*/
|
||||
|
||||
return []sample.Sampler{sample.Greedy()}
|
||||
}
|
||||
|
||||
func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||
var req CompletionRequest
|
||||
req.Options = Options(api.DefaultOptions())
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "Bad request", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Set the headers to indicate streaming
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.Header().Set("Transfer-Encoding", "chunked")
|
||||
|
||||
flusher, ok := w.(http.Flusher)
|
||||
if !ok {
|
||||
http.Error(w, "Streaming not supported", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
|
||||
numPredict: req.NumPredict,
|
||||
stop: req.Stop,
|
||||
numKeep: int32(req.NumKeep),
|
||||
samplers: getSamplers(req),
|
||||
embedding: false,
|
||||
})
|
||||
if err != nil {
|
||||
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure there is a place to put the sequence, released when removed from s.seqs
|
||||
if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
slog.Info("aborting completion request due to client closing the connection")
|
||||
} else {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
s.mu.Lock()
|
||||
found := false
|
||||
for i, sq := range s.seqs {
|
||||
if sq == nil {
|
||||
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
|
||||
if err != nil {
|
||||
s.mu.Unlock()
|
||||
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
s.seqs[i] = seq
|
||||
s.cond.Signal()
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
s.mu.Unlock()
|
||||
|
||||
if !found {
|
||||
http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-r.Context().Done():
|
||||
close(seq.quit)
|
||||
return
|
||||
case content, ok := <-seq.responses:
|
||||
if ok {
|
||||
if err := json.NewEncoder(w).Encode(&CompletionResponse{
|
||||
Content: content,
|
||||
}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
close(seq.quit)
|
||||
return
|
||||
}
|
||||
|
||||
flusher.Flush()
|
||||
} else {
|
||||
// Send the final response
|
||||
if err := json.NewEncoder(w).Encode(&CompletionResponse{
|
||||
Stop: true,
|
||||
StoppedLimit: seq.doneReason == "limit",
|
||||
Timings: Timings{
|
||||
PromptN: seq.numPromptInputs,
|
||||
PromptMS: float64(seq.startGenerationTime.Sub(seq.startProcessingTime).Milliseconds()),
|
||||
PredictedN: seq.numPredicted,
|
||||
PredictedMS: float64(time.Since(seq.startGenerationTime).Milliseconds()),
|
||||
},
|
||||
}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode final response: %v", err), http.StatusInternalServerError)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type EmbeddingRequest struct {
|
||||
Content string `json:"content"`
|
||||
CachePrompt bool `json:"cache_prompt"`
|
||||
}
|
||||
|
||||
type EmbeddingResponse struct {
|
||||
Embedding []float32 `json:"embedding"`
|
||||
}
|
||||
|
||||
func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
|
||||
var req EmbeddingRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, fmt.Sprintf("bad request: %s", err), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
slog.Debug("embedding request", "content", req.Content)
|
||||
|
||||
seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
|
||||
if err != nil {
|
||||
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure there is a place to put the sequence, released when removed from s.seqs
|
||||
if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
slog.Info("aborting embeddings request due to client closing the connection")
|
||||
} else {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
s.mu.Lock()
|
||||
found := false
|
||||
for i, sq := range s.seqs {
|
||||
if sq == nil {
|
||||
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
|
||||
if err != nil {
|
||||
s.mu.Unlock()
|
||||
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
s.seqs[i] = seq
|
||||
s.cond.Signal()
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
s.mu.Unlock()
|
||||
|
||||
if !found {
|
||||
http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
embedding := <-seq.embedding
|
||||
|
||||
if err := json.NewEncoder(w).Encode(&EmbeddingResponse{
|
||||
Embedding: embedding,
|
||||
}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
}
|
||||
}
|
||||
|
||||
type HealthResponse struct {
|
||||
Status string `json:"status"`
|
||||
Progress float32 `json:"progress"`
|
||||
}
|
||||
|
||||
type ServerStatus int
|
||||
|
||||
const (
|
||||
ServerStatusReady ServerStatus = iota
|
||||
ServerStatusLoadingModel
|
||||
ServerStatusError
|
||||
)
|
||||
|
||||
func (s ServerStatus) ToString() string {
|
||||
switch s {
|
||||
case ServerStatusReady:
|
||||
return "ok"
|
||||
case ServerStatusLoadingModel:
|
||||
return "loading model"
|
||||
default:
|
||||
return "server error"
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) health(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
if err := json.NewEncoder(w).Encode(&HealthResponse{
|
||||
Status: s.status.ToString(),
|
||||
Progress: s.progress,
|
||||
}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
}
|
||||
}
|
||||
|
||||
type multiLPath []string
|
||||
|
||||
func (m *multiLPath) Set(value string) error {
|
||||
*m = append(*m, value)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *multiLPath) String() string {
|
||||
return strings.Join(*m, ", ")
|
||||
}
|
||||
|
||||
func (s *Server) loadModel(
|
||||
mpath string,
|
||||
lpath multiLPath,
|
||||
parallel int,
|
||||
kvCacheType string,
|
||||
kvSize int,
|
||||
multiUserCache bool,
|
||||
) {
|
||||
var err error
|
||||
s.model, err = model.New(mpath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
|
||||
|
||||
// TODO(jessegross): LoRA loading
|
||||
if lpath.String() != "" {
|
||||
panic("loras are not yet implemented")
|
||||
}
|
||||
|
||||
s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, multiUserCache)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if !s.cache.enabled && parallel > 1 {
|
||||
parallel = 1
|
||||
slog.Warn("model does not support caching, disabling parallel processing")
|
||||
}
|
||||
|
||||
s.parallel = parallel
|
||||
s.seqs = make([]*Sequence, s.parallel)
|
||||
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
|
||||
|
||||
s.status = ServerStatusReady
|
||||
s.ready.Done()
|
||||
}
|
||||
|
||||
func Execute(args []string) error {
|
||||
fs := flag.NewFlagSet("runner", flag.ExitOnError)
|
||||
mpath := fs.String("model", "", "Path to model binary file")
|
||||
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
|
||||
batchSize := fs.Int("batch-size", 512, "Batch size")
|
||||
_ = fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
|
||||
_ = fs.Int("main-gpu", 0, "Main GPU")
|
||||
_ = fs.Bool("flash-attn", false, "Enable flash attention")
|
||||
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
|
||||
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
|
||||
port := fs.Int("port", 8080, "Port to expose the server on")
|
||||
_ = fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
|
||||
verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
|
||||
_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
|
||||
_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
|
||||
_ = fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
|
||||
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
|
||||
|
||||
var lpaths multiLPath
|
||||
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
|
||||
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(fs.Output(), "Runner usage\n")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
level := slog.LevelInfo
|
||||
if *verbose {
|
||||
level = slog.LevelDebug
|
||||
}
|
||||
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||
Level: level,
|
||||
AddSource: true,
|
||||
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
|
||||
if attr.Key == slog.SourceKey {
|
||||
source := attr.Value.Any().(*slog.Source)
|
||||
source.File = filepath.Base(source.File)
|
||||
}
|
||||
return attr
|
||||
},
|
||||
})
|
||||
slog.SetDefault(slog.New(handler))
|
||||
slog.Info("starting ollama engine")
|
||||
|
||||
server := &Server{
|
||||
batchSize: *batchSize,
|
||||
status: ServerStatusLoadingModel,
|
||||
}
|
||||
|
||||
// TODO(jessegross): Parameters that need to be implemented:
|
||||
// n-gpu-layers
|
||||
// main-gpu
|
||||
// flash-attn
|
||||
// threads
|
||||
// no-mmap
|
||||
// mlock
|
||||
// tensor-split
|
||||
|
||||
/*var tensorSplitFloats []float32
|
||||
if *tensorSplit != "" {
|
||||
stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
|
||||
|
||||
tensorSplitFloats = make([]float32, 0, len(stringFloats))
|
||||
for _, s := range stringFloats {
|
||||
f, _ := strconv.ParseFloat(s, 32)
|
||||
tensorSplitFloats = append(tensorSplitFloats, float32(f))
|
||||
}
|
||||
}*/
|
||||
|
||||
server.ready.Add(1)
|
||||
go server.loadModel(*mpath, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
|
||||
|
||||
server.cond = sync.NewCond(&server.mu)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
go server.run(ctx)
|
||||
|
||||
addr := "127.0.0.1:" + strconv.Itoa(*port)
|
||||
listener, err := net.Listen("tcp", addr)
|
||||
if err != nil {
|
||||
fmt.Println("Listen error:", err)
|
||||
cancel()
|
||||
return err
|
||||
}
|
||||
defer listener.Close()
|
||||
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/embedding", server.embeddings)
|
||||
mux.HandleFunc("/completion", server.completion)
|
||||
mux.HandleFunc("/health", server.health)
|
||||
|
||||
httpServer := http.Server{
|
||||
Handler: mux,
|
||||
}
|
||||
|
||||
log.Println("Server listening on", addr)
|
||||
if err := httpServer.Serve(listener); err != nil {
|
||||
log.Fatal("server error:", err)
|
||||
return err
|
||||
}
|
||||
|
||||
cancel()
|
||||
return nil
|
||||
}
|
||||
24
runner/runner.go
Normal file
24
runner/runner.go
Normal file
@@ -0,0 +1,24 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/runner/llamarunner"
|
||||
"github.com/ollama/ollama/runner/ollamarunner"
|
||||
)
|
||||
|
||||
func Execute(args []string) error {
|
||||
if args[0] == "runner" {
|
||||
args = args[1:]
|
||||
}
|
||||
|
||||
var newRunner bool
|
||||
if args[0] == "--ollama-engine" {
|
||||
args = args[1:]
|
||||
newRunner = true
|
||||
}
|
||||
|
||||
if newRunner {
|
||||
return ollamarunner.Execute(args)
|
||||
} else {
|
||||
return llamarunner.Execute(args)
|
||||
}
|
||||
}
|
||||
13
sample/greedy.go
Normal file
13
sample/greedy.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package sample
|
||||
|
||||
import "gonum.org/v1/gonum/floats"
|
||||
|
||||
type greedy struct{}
|
||||
|
||||
func Greedy() Sampler {
|
||||
return greedy{}
|
||||
}
|
||||
|
||||
func (s greedy) Sample(t []float64) ([]float64, error) {
|
||||
return []float64{float64(floats.MaxIdx(t))}, nil
|
||||
}
|
||||
74
sample/sample.go
Normal file
74
sample/sample.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package sample
|
||||
|
||||
import (
|
||||
"slices"
|
||||
|
||||
"gonum.org/v1/gonum/floats"
|
||||
"gonum.org/v1/gonum/stat/sampleuv"
|
||||
)
|
||||
|
||||
type Sampler interface {
|
||||
Sample([]float64) ([]float64, error)
|
||||
}
|
||||
|
||||
type Temperature float64
|
||||
|
||||
func (s Temperature) Sample(t []float64) ([]float64, error) {
|
||||
floats.Div(t, slices.Repeat([]float64{float64(s)}, len(t)))
|
||||
return t, nil
|
||||
}
|
||||
|
||||
type softmax struct{}
|
||||
|
||||
func Softmax() Sampler {
|
||||
return softmax{}
|
||||
}
|
||||
|
||||
func (softmax) Sample(t []float64) ([]float64, error) {
|
||||
return t, nil
|
||||
}
|
||||
|
||||
type TopK int
|
||||
|
||||
func (s TopK) Sample(t []float64) ([]float64, error) {
|
||||
return t, nil
|
||||
}
|
||||
|
||||
type TopP float32
|
||||
|
||||
func (s TopP) Sample(t []float64) ([]float64, error) {
|
||||
return t, nil
|
||||
}
|
||||
|
||||
type MinP float32
|
||||
|
||||
func (s MinP) Sample(t []float64) ([]float64, error) {
|
||||
return t, nil
|
||||
}
|
||||
|
||||
type weighed struct{}
|
||||
|
||||
func Weighed() Sampler {
|
||||
return weighed{}
|
||||
}
|
||||
|
||||
func (s weighed) Sample(t []float64) ([]float64, error) {
|
||||
w := sampleuv.NewWeighted(t, nil)
|
||||
if v, ok := w.Take(); ok {
|
||||
return []float64{float64(v)}, nil
|
||||
}
|
||||
|
||||
return t, nil
|
||||
}
|
||||
|
||||
func Sample(floats []float64, samplers ...Sampler) ([]float64, error) {
|
||||
var err error
|
||||
for _, sampler := range samplers {
|
||||
floats, err = sampler.Sample(floats)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return floats, nil
|
||||
}
|
||||
@@ -21,8 +21,8 @@ import (
|
||||
"github.com/ollama/ollama/convert"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llama"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/errtypes"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
@@ -205,7 +205,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
ct := llm.DetectGGMLType(buf)
|
||||
ct := ggml.DetectContentType(buf)
|
||||
if ct == "gguf" {
|
||||
return "gguf"
|
||||
}
|
||||
@@ -271,11 +271,11 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ggml, _, err := llm.DecodeGGML(bin, 0)
|
||||
f, _, err := ggml.Decode(bin, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
layers := []*layerGGML{{layer, ggml}}
|
||||
layers := []*layerGGML{{layer, f}}
|
||||
|
||||
if !isAdapter {
|
||||
return detectChatTemplate(layers)
|
||||
@@ -283,13 +283,13 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
|
||||
return layers, nil
|
||||
}
|
||||
|
||||
func kvFromLayers(baseLayers []*layerGGML) (llm.KV, error) {
|
||||
func kvFromLayers(baseLayers []*layerGGML) (ggml.KV, error) {
|
||||
for _, l := range baseLayers {
|
||||
if l.GGML != nil {
|
||||
return l.KV(), nil
|
||||
}
|
||||
}
|
||||
return llm.KV{}, fmt.Errorf("no base model was found")
|
||||
return ggml.KV{}, fmt.Errorf("no base model was found")
|
||||
}
|
||||
|
||||
func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, fn func(resp api.ProgressResponse)) (err error) {
|
||||
@@ -306,7 +306,7 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
|
||||
if layer.GGML != nil {
|
||||
quantType := strings.ToUpper(cmp.Or(r.Quantize, r.Quantization))
|
||||
if quantType != "" && layer.GGML.Name() == "gguf" && layer.MediaType == "application/vnd.ollama.image.model" {
|
||||
want, err := llm.ParseFileType(quantType)
|
||||
want, err := ggml.ParseFileType(quantType)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -403,7 +403,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
|
||||
ft := layer.GGML.KV().FileType()
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)})
|
||||
|
||||
want, err := llm.ParseFileType(quantizeType)
|
||||
want, err := ggml.ParseFileType(quantizeType)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -433,13 +433,13 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ggml, _, err := llm.DecodeGGML(temp, 0)
|
||||
f, _, err := ggml.Decode(temp, 0)
|
||||
if err != nil {
|
||||
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &layerGGML{newLayer, ggml}, nil
|
||||
return &layerGGML{newLayer, f}, nil
|
||||
}
|
||||
|
||||
func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
|
||||
@@ -475,7 +475,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
|
||||
|
||||
var offset int64
|
||||
for offset < stat.Size() {
|
||||
ggml, n, err := llm.DecodeGGML(blob, 0)
|
||||
f, n, err := ggml.Decode(blob, 0)
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else if err != nil {
|
||||
@@ -483,9 +483,9 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
|
||||
}
|
||||
|
||||
mediatype := "application/vnd.ollama.image.model"
|
||||
if ggml.KV().Kind() == "adapter" {
|
||||
if f.KV().Kind() == "adapter" {
|
||||
mediatype = "application/vnd.ollama.image.adapter"
|
||||
} else if _, ok := ggml.KV()[fmt.Sprintf("%s.vision.block_count", ggml.KV().Architecture())]; ok || ggml.KV().Kind() == "projector" {
|
||||
} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
|
||||
mediatype = "application/vnd.ollama.image.projector"
|
||||
}
|
||||
|
||||
@@ -506,7 +506,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
|
||||
}
|
||||
}
|
||||
|
||||
layers = append(layers, &layerGGML{layer, ggml})
|
||||
layers = append(layers, &layerGGML{layer, f})
|
||||
offset = n
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
@@ -78,21 +78,21 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
|
||||
for _, cap := range caps {
|
||||
switch cap {
|
||||
case CapabilityCompletion:
|
||||
f, err := os.Open(m.ModelPath)
|
||||
r, err := os.Open(m.ModelPath)
|
||||
if err != nil {
|
||||
slog.Error("couldn't open model file", "error", err)
|
||||
continue
|
||||
}
|
||||
defer f.Close()
|
||||
defer r.Close()
|
||||
|
||||
// TODO(mxyng): decode the GGML into model to avoid doing this multiple times
|
||||
ggml, _, err := llm.DecodeGGML(f, 0)
|
||||
f, _, err := ggml.Decode(r, 0)
|
||||
if err != nil {
|
||||
slog.Error("couldn't decode ggml", "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok {
|
||||
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
|
||||
errs = append(errs, errCapabilityCompletion)
|
||||
}
|
||||
case CapabilityTools:
|
||||
|
||||
@@ -15,7 +15,7 @@ import (
|
||||
"text/template/parse"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
@@ -24,7 +24,7 @@ var intermediateBlobs map[string]string = make(map[string]string)
|
||||
|
||||
type layerGGML struct {
|
||||
Layer
|
||||
*llm.GGML
|
||||
*ggml.GGML
|
||||
}
|
||||
|
||||
func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
|
||||
@@ -64,12 +64,12 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
|
||||
}
|
||||
defer blob.Close()
|
||||
|
||||
ggml, _, err := llm.DecodeGGML(blob, 0)
|
||||
f, _, err := ggml.Decode(blob, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
layers = append(layers, &layerGGML{layer, ggml})
|
||||
layers = append(layers, &layerGGML{layer, f})
|
||||
default:
|
||||
layers = append(layers, &layerGGML{layer, nil})
|
||||
}
|
||||
@@ -118,7 +118,7 @@ func detectContentType(r io.Reader) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
|
||||
if contentType := ggml.DetectContentType(b.Bytes()); contentType != "" {
|
||||
return contentType, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -10,8 +10,9 @@ import (
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/model/mllama"
|
||||
"github.com/ollama/ollama/model/models/mllama"
|
||||
"github.com/ollama/ollama/template"
|
||||
)
|
||||
|
||||
@@ -92,26 +93,33 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
var imgData llm.ImageData
|
||||
|
||||
if isMllama {
|
||||
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
if envconfig.NewEngine() {
|
||||
imgData = llm.ImageData{
|
||||
ID: len(images),
|
||||
Data: i,
|
||||
}
|
||||
} else {
|
||||
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
buf := new(bytes.Buffer)
|
||||
err = binary.Write(buf, binary.LittleEndian, data)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
buf := new(bytes.Buffer)
|
||||
err = binary.Write(buf, binary.LittleEndian, data)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
ar, ok := opts["aspectRatioIndex"].(int)
|
||||
if !ok {
|
||||
return "", nil, fmt.Errorf("missing aspect ratio for image")
|
||||
}
|
||||
ar, ok := opts["aspectRatioIndex"].(int)
|
||||
if !ok {
|
||||
return "", nil, fmt.Errorf("missing aspect ratio for image")
|
||||
}
|
||||
|
||||
imgData = llm.ImageData{
|
||||
ID: len(images),
|
||||
Data: buf.Bytes(),
|
||||
AspectRatioID: ar,
|
||||
imgData = llm.ImageData{
|
||||
ID: len(images),
|
||||
Data: buf.Bytes(),
|
||||
AspectRatioID: ar,
|
||||
}
|
||||
}
|
||||
imgPrompt = "<|image|>"
|
||||
} else {
|
||||
|
||||
@@ -30,8 +30,9 @@ import (
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/model/mllama"
|
||||
"github.com/ollama/ollama/model/models/mllama"
|
||||
"github.com/ollama/ollama/openai"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/errtypes"
|
||||
@@ -202,7 +203,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
|
||||
images := make([]llm.ImageData, len(req.Images))
|
||||
for i := range req.Images {
|
||||
if isMllama {
|
||||
if isMllama && !envconfig.NewEngine() {
|
||||
data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
|
||||
if err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
|
||||
@@ -860,7 +861,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func getKVData(digest string, verbose bool) (llm.KV, error) {
|
||||
func getKVData(digest string, verbose bool) (ggml.KV, error) {
|
||||
maxArraySize := 0
|
||||
if verbose {
|
||||
maxArraySize = -1
|
||||
|
||||
@@ -19,12 +19,12 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
var stream bool = false
|
||||
|
||||
func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
|
||||
func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
|
||||
t.Helper()
|
||||
t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
|
||||
|
||||
@@ -36,7 +36,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if err := llm.WriteGGUF(f, kv, ti); err != nil {
|
||||
if err := ggml.WriteGGUF(f, kv, ti); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Calculate sha256 of file
|
||||
@@ -672,7 +672,7 @@ func TestCreateDetectTemplate(t *testing.T) {
|
||||
var s Server
|
||||
|
||||
t.Run("matched", func(t *testing.T) {
|
||||
_, digest := createBinFile(t, llm.KV{
|
||||
_, digest := createBinFile(t, ggml.KV{
|
||||
"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
|
||||
}, nil)
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
@@ -45,8 +46,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
|
||||
return
|
||||
}
|
||||
|
||||
func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
|
||||
return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
|
||||
return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
|
||||
return mock, nil
|
||||
}
|
||||
}
|
||||
@@ -76,7 +77,7 @@ func TestGenerateChat(t *testing.T) {
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
@@ -88,7 +89,7 @@ func TestGenerateChat(t *testing.T) {
|
||||
|
||||
go s.sched.Run(context.TODO())
|
||||
|
||||
_, digest := createBinFile(t, llm.KV{
|
||||
_, digest := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.block_count": uint32(1),
|
||||
"llama.context_length": uint32(8192),
|
||||
@@ -98,7 +99,7 @@ func TestGenerateChat(t *testing.T) {
|
||||
"tokenizer.ggml.tokens": []string{""},
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, []llm.Tensor{
|
||||
}, []ggml.Tensor{
|
||||
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
@@ -154,10 +155,10 @@ func TestGenerateChat(t *testing.T) {
|
||||
})
|
||||
|
||||
t.Run("missing capabilities chat", func(t *testing.T) {
|
||||
_, digest := createBinFile(t, llm.KV{
|
||||
_, digest := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "bert",
|
||||
"bert.pooling_type": uint32(0),
|
||||
}, []llm.Tensor{})
|
||||
}, []ggml.Tensor{})
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Model: "bert",
|
||||
Files: map[string]string{"bert.gguf": digest},
|
||||
@@ -612,7 +613,7 @@ func TestGenerate(t *testing.T) {
|
||||
getGpuFn: discover.GetGPUInfo,
|
||||
getCpuFn: discover.GetCPUInfo,
|
||||
reschedDelay: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
|
||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
|
||||
// add small delay to simulate loading
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{
|
||||
@@ -624,7 +625,7 @@ func TestGenerate(t *testing.T) {
|
||||
|
||||
go s.sched.Run(context.TODO())
|
||||
|
||||
_, digest := createBinFile(t, llm.KV{
|
||||
_, digest := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.block_count": uint32(1),
|
||||
"llama.context_length": uint32(8192),
|
||||
@@ -634,7 +635,7 @@ func TestGenerate(t *testing.T) {
|
||||
"tokenizer.ggml.tokens": []string{""},
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, []llm.Tensor{
|
||||
}, []ggml.Tensor{
|
||||
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
@@ -686,10 +687,10 @@ func TestGenerate(t *testing.T) {
|
||||
})
|
||||
|
||||
t.Run("missing capabilities generate", func(t *testing.T) {
|
||||
_, digest := createBinFile(t, llm.KV{
|
||||
_, digest := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "bert",
|
||||
"bert.pooling_type": uint32(0),
|
||||
}, []llm.Tensor{})
|
||||
}, []ggml.Tensor{})
|
||||
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Model: "bert",
|
||||
|
||||
@@ -21,7 +21,7 @@ import (
|
||||
"unicode"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/openai"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
"github.com/ollama/ollama/version"
|
||||
@@ -654,8 +654,8 @@ func TestShow(t *testing.T) {
|
||||
|
||||
var s Server
|
||||
|
||||
_, digest1 := createBinFile(t, llm.KV{"general.architecture": "test"}, nil)
|
||||
_, digest2 := createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil)
|
||||
_, digest1 := createBinFile(t, ggml.KV{"general.architecture": "test"}, nil)
|
||||
_, digest2 := createBinFile(t, ggml.KV{"general.type": "projector", "general.architecture": "clip"}, nil)
|
||||
|
||||
createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Name: "show-model",
|
||||
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
@@ -41,8 +42,8 @@ type Scheduler struct {
|
||||
loaded map[string]*runnerRef
|
||||
loadedMu sync.Mutex
|
||||
|
||||
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
|
||||
newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
|
||||
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
|
||||
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
|
||||
getGpuFn func() discover.GpuInfoList
|
||||
getCpuFn func() discover.GpuInfoList
|
||||
reschedDelay time.Duration
|
||||
@@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
|
||||
}()
|
||||
}
|
||||
|
||||
func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
|
||||
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
|
||||
if numParallel < 1 {
|
||||
numParallel = 1
|
||||
}
|
||||
@@ -417,12 +418,12 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL
|
||||
if req.sessionDuration != nil {
|
||||
sessionDuration = req.sessionDuration.Duration
|
||||
}
|
||||
llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
|
||||
llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
|
||||
if err != nil {
|
||||
// some older models are not compatible with newer versions of llama.cpp
|
||||
// show a generalized compatibility error until there is a better way to
|
||||
// check for model compatibility
|
||||
if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
|
||||
if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
|
||||
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
|
||||
}
|
||||
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
|
||||
@@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool {
|
||||
// If the model can not be fit fully within the available GPU(s) nil is returned
|
||||
// If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
|
||||
// opts.NumCtx accordingly
|
||||
func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
|
||||
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
|
||||
var estimatedVRAM uint64
|
||||
|
||||
var numParallelToTry []int
|
||||
@@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
if !envconfig.SchedSpread() {
|
||||
for _, g := range sgl {
|
||||
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
||||
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
||||
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
|
||||
*numParallel = p
|
||||
return []discover.GpuInfo{g}
|
||||
@@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
|
||||
// Now try all the GPUs
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
||||
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
|
||||
*numParallel = p
|
||||
return sgl
|
||||
@@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
|
||||
}
|
||||
|
||||
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
|
||||
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
|
||||
func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
|
||||
if *numParallel <= 0 {
|
||||
*numParallel = 1
|
||||
req.opts.NumCtx = req.origNumCtx
|
||||
@@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.
|
||||
var bestEstimate uint64
|
||||
var bestFit int
|
||||
for i, gl := range byLibrary {
|
||||
_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
|
||||
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
|
||||
if estimatedVRAM > bestEstimate {
|
||||
bestEstimate = estimatedVRAM
|
||||
bestFit = i
|
||||
@@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) {
|
||||
|
||||
// If other runners are loaded, make sure the pending request will fit in system memory
|
||||
// If not, pick a runner to unload, else return nil and the request can be loaded
|
||||
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
|
||||
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
|
||||
slog.Debug("evaluating if CPU model load will fit in available system memory")
|
||||
estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
|
||||
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts)
|
||||
if estimate.TotalSize <= gpus[0].FreeMemory {
|
||||
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
|
||||
return nil
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"github.com/ollama/ollama/app/lifecycle"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
@@ -37,7 +38,7 @@ func TestLoad(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
|
||||
defer done()
|
||||
s := InitScheduler(ctx)
|
||||
var ggml *llm.GGML // value not used in tests
|
||||
var f *ggml.GGML // value not used in tests
|
||||
req := &LlmRequest{
|
||||
ctx: ctx,
|
||||
model: &Model{ModelPath: "foo"},
|
||||
@@ -47,11 +48,11 @@ func TestLoad(t *testing.T) {
|
||||
sessionDuration: &api.Duration{Duration: 2 * time.Second},
|
||||
}
|
||||
// Fail to load model first
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
return nil, errors.New("something failed to load model blah")
|
||||
}
|
||||
gpus := discover.GpuInfoList{}
|
||||
s.load(req, ggml, gpus, 0)
|
||||
s.load(req, f, gpus, 0)
|
||||
require.Empty(t, req.successCh)
|
||||
require.Len(t, req.errCh, 1)
|
||||
s.loadedMu.Lock()
|
||||
@@ -61,10 +62,10 @@ func TestLoad(t *testing.T) {
|
||||
require.Contains(t, err.Error(), "this model may be incompatible")
|
||||
|
||||
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
return server, nil
|
||||
}
|
||||
s.load(req, ggml, gpus, 0)
|
||||
s.load(req, f, gpus, 0)
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
require.NoError(t, err)
|
||||
@@ -78,7 +79,7 @@ func TestLoad(t *testing.T) {
|
||||
|
||||
req.model.ModelPath = "dummy_model_path"
|
||||
server.waitResp = errors.New("wait failure")
|
||||
s.load(req, ggml, gpus, 0)
|
||||
s.load(req, f, gpus, 0)
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
require.Contains(t, err.Error(), "wait failure")
|
||||
@@ -99,10 +100,10 @@ type reqBundle struct {
|
||||
ctxDone func()
|
||||
srv *mockLlm
|
||||
req *LlmRequest
|
||||
ggml *llm.GGML
|
||||
f *ggml.GGML
|
||||
}
|
||||
|
||||
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
return scenario.srv, nil
|
||||
}
|
||||
|
||||
@@ -115,7 +116,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
|
||||
require.NoError(t, err)
|
||||
defer f.Close()
|
||||
|
||||
require.NoError(t, llm.WriteGGUF(f, llm.KV{
|
||||
require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.context_length": uint32(32),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
@@ -125,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
|
||||
"tokenizer.ggml.tokens": []string{" "},
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, []llm.Tensor{
|
||||
}, []ggml.Tensor{
|
||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
}))
|
||||
@@ -133,7 +134,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
|
||||
|
||||
fname := f.Name()
|
||||
model := &Model{Name: modelName, ModelPath: fname}
|
||||
b.ggml, err = llm.LoadModel(model.ModelPath, 0)
|
||||
b.f, err = llm.LoadModel(model.ModelPath, 0)
|
||||
require.NoError(t, err)
|
||||
|
||||
if duration == nil {
|
||||
@@ -174,7 +175,7 @@ func TestRequestsSameModelSameRequest(t *testing.T) {
|
||||
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
|
||||
b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
|
||||
b.req.model = a.req.model
|
||||
b.ggml = a.ggml
|
||||
b.f = a.f
|
||||
|
||||
s.newServerFn = a.newServer
|
||||
slog.Info("a")
|
||||
@@ -218,7 +219,7 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) {
|
||||
b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
|
||||
tmpModel := *a.req.model
|
||||
b.req.model = &tmpModel
|
||||
b.ggml = a.ggml
|
||||
b.f = a.f
|
||||
|
||||
s.newServerFn = a.newServer
|
||||
slog.Info("a")
|
||||
@@ -419,13 +420,13 @@ func TestExpireRunner(t *testing.T) {
|
||||
sessionDuration: &api.Duration{Duration: 2 * time.Minute},
|
||||
}
|
||||
|
||||
var ggml *llm.GGML
|
||||
var f *ggml.GGML
|
||||
gpus := discover.GpuInfoList{}
|
||||
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
return server, nil
|
||||
}
|
||||
s.load(req, ggml, gpus, 0)
|
||||
s.load(req, f, gpus, 0)
|
||||
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
@@ -729,9 +730,9 @@ func TestHomogeneousGPUs(t *testing.T) {
|
||||
}
|
||||
s.getCpuFn = getCpuFn
|
||||
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
require.Len(t, gpus, 1)
|
||||
return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
|
||||
return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
|
||||
}
|
||||
slog.Info("a")
|
||||
s.pendingReqCh <- a.req
|
||||
|
||||
@@ -14,7 +14,7 @@ import (
|
||||
"github.com/google/go-cmp/cmp"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
func TestNamed(t *testing.T) {
|
||||
@@ -33,7 +33,7 @@ func TestNamed(t *testing.T) {
|
||||
|
||||
for k, v := range ss {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
kv := llm.KV{"tokenizer.chat_template": v}
|
||||
kv := ggml.KV{"tokenizer.chat_template": v}
|
||||
s := kv.ChatTemplate()
|
||||
r, err := Named(s)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user