Compare commits

2 Commits: 2d7e8e82ab, c8245f3ef3
fs/config.go (new file)
@@ -0,0 +1,13 @@

package fs

type Config interface {
    Architecture() string
    String(string, ...string) string
    Uint(string, ...uint32) uint32
    Float(string, ...float32) float32
    Bool(string, ...bool) bool

    Strings(string, ...[]string) []string
    Uints(string, ...[]uint32) []uint32
    Floats(string, ...[]float32) []float32
}
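As a reading aid, here is a minimal sketch of how a constructor might consume this interface. Each getter takes a key plus an optional default; presumably the zero value comes back when the key is absent and no default is given. The epsilon key is illustrative; block_count does appear in constructors later in this diff.

package main

import (
    "fmt"

    "github.com/ollama/ollama/fs"
)

// describe is a hypothetical helper, not part of this change.
func describe(c fs.Config) string {
    blocks := c.Uint("block_count")                          // no default: assume zero value if absent
    eps := c.Float("attention.layer_norm_rms_epsilon", 1e-6) // illustrative key with a default
    return fmt.Sprintf("%s: %d blocks, eps=%g", c.Architecture(), blocks, eps)
}

func main() {} // describe would be called with the KV of a decoded model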
fs/fs.go (new file)
@@ -0,0 +1,88 @@

package fs

import (
    "fmt"
    "io"
    "log/slog"
    "os"

    "github.com/ollama/ollama/fs/ggml"
)

type DType int

type Model struct {
    KV      Config
    Tensors map[string]TensorReader
}

func (m Model) LogValue() slog.Value {
    return slog.GroupValue(
        slog.String("architecture", m.KV.Architecture()),
    )
}

type Tensor interface {
    Name() string
    Shape() []int
    DType() DType
    Size() int
}

type TensorReader interface {
    Tensor
    io.Reader
}

type shimTensorReader struct {
    internal *ggml.Tensor
    *io.SectionReader
}

func (t *shimTensorReader) Name() string {
    return t.internal.Name
}

func (t *shimTensorReader) Shape() []int {
    shape := make([]int, len(t.internal.Shape))
    for i, s := range t.internal.Shape {
        shape[i] = int(s)
    }

    return shape
}

func (t *shimTensorReader) Size() int {
    return int(t.internal.Size())
}

func (t *shimTensorReader) DType() DType {
    return DType(t.internal.Kind)
}

func ReadFrom(f *os.File) (*Model, error) {
    bts, err := io.ReadAll(io.NewSectionReader(f, 0, 4))
    if err != nil {
        return nil, err
    }

    switch ggml.DetectContentType(bts[:4]) {
    case "gguf":
        c, _, err := ggml.Decode(f, -1)
        if err != nil {
            return nil, err
        }

        tensors := make(map[string]TensorReader, len(c.Tensors().Items()))
        for _, t := range c.Tensors().Items() {
            tensors[t.Name] = &shimTensorReader{
                internal:      t,
                SectionReader: io.NewSectionReader(f, int64(c.Tensors().Offset+t.Offset), int64(t.Size())),
            }
        }

        return &Model{KV: c.KV(), Tensors: tensors}, nil
    default:
        return nil, fmt.Errorf("unsupported file type")
    }
}
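For orientation: ReadFrom sniffs the first four bytes, decodes the GGUF metadata, and wraps each tensor's data section in an io.SectionReader, so every entry in Model.Tensors is simultaneously metadata and a readable byte stream. A sketch of the intended call path (file path illustrative):

package main

import (
    "log"
    "log/slog"
    "os"

    "github.com/ollama/ollama/fs"
)

func main() {
    f, err := os.Open("model.gguf") // illustrative path
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    m, err := fs.ReadFrom(f)
    if err != nil {
        log.Fatal(err)
    }

    slog.Info("loaded", "model", m) // Model implements slog.LogValuer
    for name, tr := range m.Tensors {
        slog.Info("tensor", "name", name, "size", tr.Size()) // tr is also an io.Reader
    }
}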
(modified file)
@@ -5,6 +5,7 @@ import (
 	"slices"
 	"testing"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model/input"
 )
@@ -373,7 +374,7 @@ func TestCanResume(t *testing.T) {
 
 type testBackend struct{}
 
-func (b *testBackend) Config() ml.Config {
+func (b *testBackend) Config() fs.Config {
 	panic("not implemented")
 }
 
(modified file)
@@ -9,22 +9,12 @@ import (
 	"slices"
 	"strconv"
 	"strings"
+
+	"github.com/ollama/ollama/fs"
 )
 
-type Config interface {
-	Architecture() string
-	String(string, ...string) string
-	Uint(string, ...uint32) uint32
-	Float(string, ...float32) float32
-	Bool(string, ...bool) bool
-
-	Strings(string, ...[]string) []string
-	Uints(string, ...[]uint32) []uint32
-	Floats(string, ...[]float32) []float32
-}
-
 type Backend interface {
-	Config() Config
+	Config() fs.Config
 	Get(name string) Tensor
 	NewContext() Context
 	NewContextSize(size int) Context
(modified file)
@@ -9,7 +9,9 @@ package ggml
 import "C"
 
 import (
+	"bytes"
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -19,12 +21,14 @@ import (
 	"slices"
 	"strconv"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"unicode"
 	"unsafe"
 
 	"github.com/ollama/ollama/format"
-	fs "github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/fs"
+	fsggml "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 	"golang.org/x/sync/errgroup"
@@ -41,7 +45,7 @@ func devices() []*C.struct_ggml_backend_device {
 }
 
 type Backend struct {
-	meta    *fs.GGML
+	meta    *fsggml.GGML
 	sched   *C.struct_ggml_backend_sched
 	tensors map[string]*C.struct_ggml_tensor
 
@@ -58,7 +62,7 @@ type Backend struct {
 }
 
 func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
-	meta, n, err := fs.Decode(r, -1)
+	meta, n, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -182,7 +186,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 	maxTensors += blocks * 2
 
 	type tensor struct {
-		source *fs.Tensor
+		source *fsggml.Tensor
 		target string
 	}
 
@@ -298,6 +302,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 
 	var doneBytes atomic.Uint64
 	totalBytes := uint64(n) - meta.Tensors().Offset
+	pool := sync.Pool{
+		New: func() any {
+			return new(bytes.Buffer)
+		},
+	}
 
 	g, ctx := errgroup.WithContext(ctx)
 	g.SetLimit(runtime.GOMAXPROCS(0))
@@ -319,19 +328,32 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		}
 
 		sr := io.NewSectionReader(r, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
-		bts := make([]byte, 128*format.KibiByte)
+		// bts := make([]byte, 128*format.KibiByte)
 
 		var s uint64
 		for s < t.Size() {
-			n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
-			if err != nil {
+			b := pool.Get().(*bytes.Buffer)
+			b.Reset()
+
+			// n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
+			// if err != nil {
+			// 	return err
+			// }
+			n, err := io.CopyN(b, sr, 256*format.KibiByte)
+			if n > 0 {
+			} else if errors.Is(err, io.EOF) {
+				break
+			} else if err != nil {
 				return err
 			}
 
+			bts := b.Bytes()
 			for _, tt := range tts {
 				C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
 			}
 
+			pool.Put(b)
+
 			s += uint64(n)
 
 			if params.Progress != nil {
@@ -413,7 +435,7 @@ func init() {
 	ml.RegisterBackend("ggml", New)
 }
 
-func (b *Backend) Config() ml.Config {
+func (b *Backend) Config() fs.Config {
 	return b.meta.KV()
 }
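The loading-loop change above swaps a fixed 128 KiB scratch slice for pooled bytes.Buffers filled by io.CopyN in 256 KiB chunks, letting concurrent goroutines share buffers instead of each allocating its own. The pattern in isolation, as a self-contained sketch (not code from this diff):

package main

import (
    "bytes"
    "errors"
    "fmt"
    "io"
    "strings"
    "sync"
)

// copyChunks drains r through pooled buffers, handing each chunk to sink.
func copyChunks(r io.Reader, chunk int64, sink func([]byte)) error {
    pool := sync.Pool{New: func() any { return new(bytes.Buffer) }}
    for {
        b := pool.Get().(*bytes.Buffer)
        b.Reset()

        n, err := io.CopyN(b, r, chunk)
        if n > 0 {
            sink(b.Bytes())
        }
        pool.Put(b)

        if errors.Is(err, io.EOF) {
            return nil // short final chunk was already handed to sink
        } else if err != nil {
            return err
        }
    }
}

func main() {
    err := copyChunks(strings.NewReader("0123456789"), 4, func(p []byte) {
        fmt.Printf("chunk %q\n", p)
    })
    fmt.Println("err:", err)
}

Note that sink must not retain its argument past the call, since the backing buffer is returned to the pool.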
ml/backend/ggml/ggml2.go (new file)
@@ -0,0 +1,273 @@

package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
import "C"

import (
    "bytes"
    "context"
    "errors"
    "io"
    "log/slog"
    "runtime"
    "sync"
    "unsafe"

    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/fs"
    "github.com/ollama/ollama/ml"
    ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
    "golang.org/x/sync/errgroup"
)

type backend struct {
    gpus, cpus []*C.struct_ggml_backend_device
    bufts      map[*C.struct_ggml_backend_device][]*C.struct_ggml_backend_buffer_type
    ctxs       map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context
    bbs        map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_backend_buffer
    readers    map[*C.struct_ggml_tensor]io.Reader
    reserved   map[*C.struct_ggml_context]uint64

    onceScheduler sync.Once
    scheduler     *scheduler
}

var _ ml.Backend2 = (*backend)(nil)

func New2() (ml.Backend2, error) {
    ggml.OnceLoad()

    var cpus, accels, gpus []*C.struct_ggml_backend_device
    for i := range C.ggml_backend_dev_count() {
        d := C.ggml_backend_dev_get(C.size_t(i))
        switch C.ggml_backend_dev_type(d) {
        case C.GGML_BACKEND_DEVICE_TYPE_CPU:
            // only the first cpu device should be used
            if len(cpus) > 0 {
                continue
            }

            cpus = append(cpus, d)
        case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
            accels = append(accels, d)
        case C.GGML_BACKEND_DEVICE_TYPE_GPU:
            gpus = append(gpus, d)
        }
    }

    bufts := make(map[*C.struct_ggml_backend_device][]*C.struct_ggml_backend_buffer_type)

    cpu := C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)
    for _, d := range append(accels, cpus...) {
        bufts[cpu] = append(bufts[cpu], C.ggml_backend_dev_buffer_type(d))
    }

    for _, d := range gpus {
        bufts[d] = append(bufts[d], append([]*C.struct_ggml_backend_buffer_type{C.ggml_backend_dev_buffer_type(d)}, bufts[cpu]...)...)
    }

    return &backend{
        // merge accels and cpus
        gpus:     gpus,
        cpus:     append(accels, cpus...),
        bufts:    bufts,
        ctxs:     make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context, len(bufts)),
        bbs:      make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_backend_buffer, len(bufts)),
        readers:  make(map[*C.struct_ggml_tensor]io.Reader),
        reserved: make(map[*C.struct_ggml_context]uint64),
    }, nil
}

func (b *backend) Close() {
}

func (b *backend) NewContext() ml.Context {
    return &Context{
        b: &Backend{
            input:  b.bufts[b.cpus[0]][0],
            output: b.bufts[b.cpus[0]][0],
            layers: func() map[int]*C.struct_ggml_backend_buffer_type {
                m := make(map[int]*C.struct_ggml_backend_buffer_type)
                for i := range 100 {
                    m[i] = b.bufts[b.gpus[0]][0]
                }
                return m
            }(),
            sched: func() *C.struct_ggml_backend_sched {
                return b.Scheduler().(*scheduler).s
            }(),
            maxGraphNodes: 8192,
        },
        ctx: C.ggml_init(C.struct_ggml_init_params{
            mem_size: C.ggml_tensor_overhead() * C.size_t(4000),
            no_alloc: true,
        }),
        buft:          b.bufts[b.cpus[0]][0],
        maxGraphNodes: 8192,
    }
}

func (b *backend) Get(tensorReader fs.TensorReader, preferredDevice ml.Device) ml.Tensor {
    var ctx *C.struct_ggml_context

    var devices []*C.struct_ggml_backend_device
    if preferredDevice == ml.GPU {
        devices = b.gpus
    }

    for _, d := range append(devices, b.cpus...) {
        var free, total C.size_t
        C.ggml_backend_dev_memory(d, &free, &total)

        for _, buft := range b.bufts[d] {
            if _, ok := b.ctxs[buft]; !ok {
                b.ctxs[buft] = C.ggml_init(C.struct_ggml_init_params{
                    mem_size: C.ggml_tensor_overhead() * C.size_t(1000),
                    no_alloc: true,
                })
            }

            ctx = b.ctxs[buft]
            if free > 0 && b.reserved[ctx]+uint64(tensorReader.Size()) >= uint64(free) {
                slog.Info("no space available", "device", C.GoString(C.ggml_backend_dev_name(d)), "free", format.HumanBytes2(uint64(free)), "total", format.HumanBytes2(uint64(total)), "reserve", format.HumanBytes2(b.reserved[ctx]), "size", format.HumanBytes2(uint64(tensorReader.Size())))
                continue
            }

            cname := C.CString(tensorReader.Name())
            defer C.free(unsafe.Pointer(cname))

            if t := C.ggml_get_tensor(ctx, cname); t != nil {
                slog.Info("using existing tensor in buffer type", "name", tensorReader.Name(), "buffer_type", C.GoString(C.ggml_backend_buft_name(buft)))
                return &Tensor{t: t}
            }

            shape := make([]C.int64_t, len(tensorReader.Shape()))
            for i, s := range tensorReader.Shape() {
                shape[i] = C.int64_t(s)
            }

            t := C.ggml_new_tensor(ctx, uint32(tensorReader.DType()), C.int(len(tensorReader.Shape())), unsafe.SliceData(shape))
            C.ggml_set_name(t, cname)

            b.readers[t] = tensorReader
            b.reserved[ctx] += uint64(tensorReader.Size())

            slog.Info("creating new tensor in buffer type", "name", tensorReader.Name(), "buffer_type", C.GoString(C.ggml_backend_buft_name(buft)), "reserve", format.HumanBytes2(b.reserved[ctx]))
            return &Tensor{t: t}
        }
    }

    panic("no device available")
}

func (b *backend) LoadAll(ctx context.Context) error {
    // allocate buffers for each context
    for buft, ctx := range b.ctxs {
        if C.ggml_get_first_tensor(ctx) == nil {
            continue
        }

        bb := C.ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft)
        C.ggml_backend_buffer_set_usage(bb, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
        b.bbs[buft] = bb
    }

    for _, bb := range b.bbs {
        slog.Info("", "buffer.size", C.ggml_backend_buffer_get_size(bb), "buffer.usage", C.ggml_backend_buffer_get_usage(bb))
    }

    pool := sync.Pool{
        New: func() any {
            return new(bytes.Buffer)
        },
    }

    g, ctx := errgroup.WithContext(context.Background())
    g.SetLimit(runtime.GOMAXPROCS(0))
    for t, r := range b.readers {
        g.Go(func() error {
            var s uint64

            for {
                b := pool.Get().(*bytes.Buffer)
                b.Reset()

                n, err := io.CopyN(b, r, 32*format.KibiByte)
                if n > 0 {
                } else if errors.Is(err, io.EOF) {
                    break
                } else if err != nil {
                    return err
                }

                C.ggml_backend_tensor_set(t, unsafe.Pointer(&b.Bytes()[0]), C.size_t(s), C.size_t(n))
                pool.Put(b)
            }

            return nil
        })
    }

    go func() {
        <-ctx.Done()
        g.Go(func() error {
            return ctx.Err()
        })
    }()

    return g.Wait()
}

type scheduler struct {
    s *C.struct_ggml_backend_sched
}

var (
    _ ml.Scheduler = (*scheduler)(nil)
    _ ml.Reserver  = (*scheduler)(nil)
)

func (b *backend) Scheduler() ml.Scheduler {
    b.onceScheduler.Do(func() {
        devices := append(b.gpus, b.cpus...)
        backends := make([]C.ggml_backend_t, len(devices))
        bufts := make([]C.ggml_backend_buffer_type_t, len(devices))
        for i, device := range devices {
            backend := C.ggml_backend_dev_init(device, nil)
            buft := C.ggml_backend_get_default_buffer_type(backend)
            if d := C.ggml_backend_get_device(backend); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(b.gpus) > 0 {
                if hbt := C.ggml_backend_dev_host_buffer_type(b.gpus[0]); hbt != nil {
                    buft = hbt
                }
            }

            slog.Info("scheduler", "backend", C.GoString(C.ggml_backend_name(backend)), "buffer_type", C.GoString(C.ggml_backend_buft_name(buft)))
            backends[i] = backend
            bufts[i] = buft
        }

        maxGraphNodes := max(8192, 1)
        b.scheduler = &scheduler{
            s: C.ggml_backend_sched_new(
                unsafe.SliceData(backends),
                unsafe.SliceData(bufts),
                C.int(len(backends)),
                C.size_t(maxGraphNodes),
                C._Bool(len(b.gpus) > 1),
            ),
        }
    })

    return b.scheduler
}

func (s scheduler) Schedule() {
}

func (s scheduler) Reserve() {
}
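Pieced together with fs.ReadFrom and the ml interfaces below, the intended flow appears to be: parse the file, declare each tensor through Get (preferring GPU placement, with CPU fallback when a device is full), then stream all weights concurrently with LoadAll. A hypothetical wiring, not code from this diff:

package main

import (
    "context"
    "log"
    "os"

    "github.com/ollama/ollama/fs"
    "github.com/ollama/ollama/ml"
    "github.com/ollama/ollama/ml/backend/ggml"
)

func main() {
    f, err := os.Open("model.gguf") // illustrative path
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    m, err := fs.ReadFrom(f)
    if err != nil {
        log.Fatal(err)
    }

    b, err := ggml.New2()
    if err != nil {
        log.Fatal(err)
    }
    defer b.Close()

    for _, tr := range m.Tensors {
        b.Get(tr, ml.GPU) // declare placement for each tensor
    }

    if err := b.LoadAll(context.Background()); err != nil { // stream the weights
        log.Fatal(err)
    }
}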
ml/backend2.go (new file)
@@ -0,0 +1,25 @@

package ml

import (
    "context"

    "github.com/ollama/ollama/fs"
)

type Device int

const (
    CPU Device = iota
    GPU
)

type Backend2 interface {
    Close()

    NewContext() Context

    Scheduler() Scheduler

    Get(fs.TensorReader, Device) Tensor
    LoadAll(context.Context) error
}
ml/scheduler.go (new file)
@@ -0,0 +1,11 @@

package ml

// Scheduler is an interface that can be implemented by a Backend to schedule resources.
type Scheduler interface {
    Schedule()
}

// Reserver is an optional interface that can be implemented by a Scheduler to reserve resources for the compute graph.
type Reserver interface {
    Reserve()
}
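Reserver being optional means a caller upgrades with a type assertion rather than a required method; model/model2.go below uses exactly this shape:

// s is any ml.Scheduler; only schedulers that opt in reserve resources.
if r, ok := s.(ml.Reserver); ok {
    r.Reserve()
}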
(modified file)
@@ -16,7 +16,8 @@ import (
 	_ "golang.org/x/image/tiff"
 	_ "golang.org/x/image/webp"
 
-	fs "github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/fs"
+	fsggml "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	_ "github.com/ollama/ollama/ml/backend"
@@ -83,10 +84,10 @@ func (m *Base) Config() config {
 	return m.config
 }
 
-var models = make(map[string]func(ml.Config) (Model, error))
+var models = make(map[string]func(fs.Config) (Model, error))
 
 // Register registers a model constructor for the given architecture
-func Register(name string, f func(ml.Config) (Model, error)) {
+func Register(name string, f func(fs.Config) (Model, error)) {
 	if _, ok := models[name]; ok {
 		panic("model: model already registered")
 	}
@@ -131,14 +132,14 @@ func NewTextProcessor(s string) (TextProcessor, error) {
 		return nil, err
 	}
 	defer r.Close()
-	meta, _, err := fs.Decode(r, -1)
+	meta, _, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
 	return getTextProcessor(meta.KV())
 }
 
-func getTextProcessor(kv fs.KV) (TextProcessor, error) {
+func getTextProcessor(kv fsggml.KV) (TextProcessor, error) {
 	arch := kv.Architecture()
 	f, ok := models[arch]
 	if !ok {
@@ -255,16 +256,23 @@ func setPointer(base Base, v reflect.Value, tags []Tag) {
 type Tag struct {
 	Name      string
 	Alternate []string
+	Root      bool
+	Device    ml.Device
 }
 
 func ParseTags(s string) (tag Tag) {
 	parts := strings.Split(s, ",")
 	if len(parts) > 0 {
 		tag.Name = parts[0]
+		tag.Device = ml.GPU
+
 		for _, part := range parts[1:] {
 			if value, ok := strings.CutPrefix(part, "alt:"); ok {
 				tag.Alternate = append(tag.Alternate, value)
+			} else if value, ok := strings.CutPrefix(part, "root:"); ok {
+				tag.Root, _ = strconv.ParseBool(value)
+			} else if part == "cpu" {
+				tag.Device = ml.CPU
 			}
 		}
 	}
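With the new fields, a gguf struct tag now encodes the lookup name, alternates, root anchoring, and a device preference. Expected results for tag strings that appear elsewhere in this diff, assuming the parsing above:

t := ParseTags("output,alt:token_embd")
// t.Name == "output", t.Alternate == []string{"token_embd"}, t.Device == ml.GPU (the default)

t = ParseTags("rope_freqs.weight,root:true")
// t.Name == "rope_freqs.weight", t.Root == true

t = ParseTags("token_embd,cpu")
// t.Name == "token_embd", t.Device == ml.CPU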
model/model2.go (new file)
@@ -0,0 +1,139 @@

package model

import (
    "fmt"
    "reflect"
    "strconv"
    "strings"

    "github.com/ollama/ollama/fs"
    "github.com/ollama/ollama/ml"
)

type Model2 struct {
    ml.Backend2
    Model
}

func New2(cfg *fs.Model, b ml.Backend2) (*Model2, error) {
    fn, ok := models[cfg.KV.Architecture()]
    if !ok {
        return nil, fmt.Errorf("unsupported model architecture %q", cfg.KV.Architecture())
    }

    m, err := fn(cfg.KV)
    if err != nil {
        return nil, err
    }

    // TODO: load tensors from the model into the backend
    v := reflect.ValueOf(m)
    v.Elem().Set(temp(b, cfg.Tensors, v.Elem()))

    if r, ok := b.Scheduler().(ml.Reserver); ok {
        // TODO: build a graph of the model and reserve the necessary resources
        r.Reserve()
    }

    return &Model2{b, m}, nil
}

func temp(b ml.Backend2, tensors map[string]fs.TensorReader, v reflect.Value, tags ...Tag) reflect.Value {
    t := v.Type()
    if t.Kind() != reflect.Struct {
        return v
    }

    allNil := true
    for i := range t.NumField() {
        tt := t.Field(i).Type
        vv := v.Field(i)
        if !vv.CanSet() {
            continue
        }

        tagsCopy := tags
        if s := t.Field(i).Tag.Get("gguf"); s != "" {
            tag := ParseTags(s)
            if tag.Root {
                tagsCopy = []Tag{tag}
            } else {
                tagsCopy = append(tagsCopy, ParseTags(s))
            }
        }

        switch {
        case tt == reflect.TypeOf((*ml.Tensor)(nil)).Elem():
            var permute func([]Tag) [][]string
            permute = func(tags []Tag) (values [][]string) {
                if len(tags) < 1 {
                    return nil
                }

                values = [][]string{{tags[0].Name}}
                for _, alt := range tags[0].Alternate {
                    values = append(values, []string{alt})
                }

                for i, value := range values {
                    for _, rest := range permute(tags[1:]) {
                        value = append(value, rest...)
                    }

                    values[i] = value
                }

                return values
            }

            names := permute(tagsCopy)
            for _, name := range names {
                if tensor, ok := tensors[strings.Join(name, ".")]; ok {
                    vv.Set(reflect.ValueOf(b.Get(tensor, tags[0].Device)))
                    break
                }
            }
        case tt.Kind() == reflect.Pointer || tt.Kind() == reflect.Interface:
            setPointer2(b, tensors, vv, tagsCopy)
        case tt.Kind() == reflect.Slice || tt.Kind() == reflect.Array:
            for i := vv.Len() - 1; i >= 0; i-- {
                vvv := vv.Index(i)
                if vvv.Kind() == reflect.Pointer || vvv.Kind() == reflect.Interface {
                    setPointer2(b, tensors, vvv, append(tagsCopy, Tag{Name: strconv.Itoa(i)}))
                } else {
                    vvv.Set(temp(b, tensors, vvv, append(tagsCopy, Tag{Name: strconv.Itoa(i)})...))
                }
            }
        }

        if !canNil(tt) || !vv.IsNil() {
            allNil = false
        }
    }

    if allNil {
        return reflect.Zero(t)
    }

    return v
}

func setPointer2(b ml.Backend2, tensors map[string]fs.TensorReader, v reflect.Value, tags []Tag) {
    vv := v
    if v.Kind() == reflect.Interface {
        if v.IsNil() {
            return
        }

        vv = vv.Elem()
    }

    vv = vv.Elem()
    if v.IsNil() {
        vv = reflect.New(v.Type().Elem()).Elem()
    }

    if f := temp(b, tensors, vv, tags...); f.CanAddr() {
        v.Set(f.Addr())
    }
}
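To make the reflection walk concrete: temp accumulates one Tag per nesting level, and permute expands the chain (plus alternates) into candidate dotted names matched against the tensor map. For a layer field the expansion would run like this (tag names taken from models in this diff; the trace itself is illustrative):

// Layers []Layer `gguf:"blk"`, element 0, field tagged `gguf:"attn_k"`
tags := []Tag{{Name: "blk"}, {Name: "0"}, {Name: "attn_k"}}
// permute(tags) -> [][]string{{"blk", "0", "attn_k"}}
// strings.Join(name, ".") -> "blk.0.attn_k", the key looked up in cfg.Tensors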
(modified file)
@@ -7,7 +7,8 @@ import (
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
-	fs "github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/fs"
+	fsggml "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/backend/ggml"
 	"github.com/ollama/ollama/ml/nn"
@@ -139,7 +140,7 @@ func TestPopulateFieldsAlternateName(t *testing.T) {
 }
 
 func TestGetTextProcessor(t *testing.T) {
-	tp, err := getTextProcessor(fs.KV{})
+	tp, err := getTextProcessor(fsggml.KV{})
 	if err == nil {
 		t.Error("expected error")
 	} else if !strings.Contains(err.Error(), "unsupported model architecture") {
@@ -148,10 +149,10 @@ func TestGetTextProcessor(t *testing.T) {
 		t.Error("expected nil tp")
 	}
 
-	models["dummy"] = func(ml.Config) (Model, error) {
+	models["dummy"] = func(fs.Config) (Model, error) {
 		return notTextProcessorModel{}, nil
 	}
-	tp, err = getTextProcessor(fs.KV{"general.architecture": "dummy"})
+	tp, err = getTextProcessor(fsggml.KV{"general.architecture": "dummy"})
 	if err == nil {
 		t.Error("expected error")
 	} else if !strings.Contains(err.Error(), "not a TextProcessor") {
(modified file)
@@ -3,6 +3,7 @@ package gemma2
 import (
 	"math"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
@@ -35,7 +36,7 @@ const (
 	gemma27BLayerCount = 46
 )
 
-func New(c ml.Config) (model.Model, error) {
+func New(c fs.Config) (model.Model, error) {
 	m := Model{
 		SentencePieceModel: model.NewSentencePieceModel(
 			&model.Vocabulary{
(modified file)
@@ -6,6 +6,7 @@ import (
 	"math"
 	"slices"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
@@ -52,7 +53,7 @@ func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, i
 	return visionOutputs
 }
 
-func New(c ml.Config) (model.Model, error) {
+func New(c fs.Config) (model.Model, error) {
 	m := Model{
 		SentencePieceModel: model.NewSentencePieceModel(
 			&model.Vocabulary{
(modified file)
@@ -3,6 +3,7 @@ package gemma3
 import (
 	"math"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
@@ -40,7 +41,7 @@ const (
 	cacheTypeCausal
 )
 
-func newTextModel(c ml.Config) *TextModel {
+func newTextModel(c fs.Config) *TextModel {
 	numBlocks := int(c.Uint("block_count"))
 
 	m := TextModel{
(modified file)
@@ -3,6 +3,7 @@ package gemma3
 import (
 	"math"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 )
@@ -111,7 +112,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 	return hiddenState
 }
 
-func newVisionModel(c ml.Config) *VisionModel {
+func newVisionModel(c fs.Config) *VisionModel {
 	return &VisionModel{
 		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
 		VisionModelOptions: &VisionModelOptions{
(modified file)
@@ -3,7 +3,7 @@ package gemma3
 import (
 	"image"
 
-	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/model/imageproc"
 )
 
@@ -11,7 +11,7 @@ type ImageProcessor struct {
 	imageSize, patchSize, numChannels int
 }
 
-func newImageProcessor(c ml.Config) ImageProcessor {
+func newImageProcessor(c fs.Config) ImageProcessor {
 	return ImageProcessor{
 		imageSize: int(c.Uint("vision.image_size")),
 		patchSize: int(c.Uint("vision.patch_size")),
(modified file)
@@ -5,6 +5,7 @@ import (
 	"math"
 	"strings"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
@@ -22,7 +23,7 @@ type Model struct {
 	model.Base
 	model.BytePairEncoding
 
-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	TokenEmbedding *nn.Embedding `gguf:"token_embd,cpu"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
 	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
@@ -30,7 +31,7 @@ type Model struct {
 	*Options
 }
 
-func New(c ml.Config) (model.Model, error) {
+func New(c fs.Config) (model.Model, error) {
 	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
 		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
 	}
@@ -60,7 +61,7 @@ func New(c ml.Config) (model.Model, error) {
 		},
 	}
 
-	m.Cache = kvcache.NewCausalCache(m.Shift)
+	// m.Cache = kvcache.NewCausalCache(m.Shift)
 
 	return &m, nil
 }
@@ -70,7 +71,7 @@ type SelfAttention struct {
 	Key         *nn.Linear `gguf:"attn_k"`
 	Value       *nn.Linear `gguf:"attn_v"`
 	Output      *nn.Linear `gguf:"attn_output"`
-	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
+	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight,root:true"`
 }
 
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
@@ -90,7 +91,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
 
 	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
-	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
+	kqv := nn.Attention(ctx, q, k, v, scaleFactor, nil)
 	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
 
 	return sa.Output.Forward(ctx, kqv)
@@ -153,7 +154,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 
 	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
+		// m.Cache.SetLayer(i)
 
 		var lastLayerOutputs ml.Tensor
 		if i == len(m.Layers)-1 {
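The root:true change above interacts with the walk in model/model2.go: SelfAttention sits under Layers `gguf:"blk"`, so without it the candidate name would be blk.N.rope_freqs.weight. With tag.Root set, temp resets the accumulated chain (tagsCopy = []Tag{tag}), so the bare name rope_freqs.weight is tried instead, which appears to match how GGUF files store that tensor at the top level.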
(modified file)
@@ -8,6 +8,7 @@ import (
 	"image"
 	"slices"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
@@ -32,7 +33,7 @@ const (
 	selfAttentionLayer
 )
 
-func New(c ml.Config) (model.Model, error) {
+func New(c fs.Config) (model.Model, error) {
 	// Verify unified config
 	if c.Uint("vision.block_count") == 0 {
 		return nil, fmt.Errorf("non-unified vision model not supported")
(modified file)
@@ -4,6 +4,7 @@ import (
 	"math"
 	"slices"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
@@ -220,7 +221,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask
 	return m.Output.Forward(ctx, hiddenState)
 }
 
-func newTextModel(c ml.Config) *TextModel {
+func newTextModel(c fs.Config) *TextModel {
 	var decoderLayers []TextDecoderLayer
 	for i := range c.Uint("block_count") {
 		var textDecoderLayer TextDecoderLayer
(modified file)
@@ -4,6 +4,7 @@ import (
 	"math"
 	"slices"
 
+	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 )
@@ -213,7 +214,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
 	return hiddenState.Concat(ctx, hiddenStates, 0)
 }
 
-func newVisionModel(c ml.Config) *VisionModel {
+func newVisionModel(c fs.Config) *VisionModel {
 	return &VisionModel{
 		Transformer:       &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count"))},
 		GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},
(modified file)
@@ -8,14 +8,14 @@ import (
 
 	"golang.org/x/image/draw"
 
-	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/fs"
 )
 
 type ImageProcessor struct {
 	imageSize, numChannels, maxNumTiles int
 }
 
-func newImageProcessor(c ml.Config) ImageProcessor {
+func newImageProcessor(c fs.Config) ImageProcessor {
 	return ImageProcessor{
 		imageSize:   int(c.Uint("vision.image_size")),
 		numChannels: int(c.Uint("vision.num_channels")),