ggla checkin

commit 2e055e3af8 (parent 9f32c634ae)
convert/convert_adapter.go | 51 (new file)
@@ -0,0 +1,51 @@
package convert

import (
	"strings"

	"github.com/ollama/ollama/llm"
)

type adapter struct {
	Parameters
}

var _ Converter = (*adapter)(nil)

func (p *adapter) KV(t *Tokenizer) llm.KV {
	// todo - need a way to pass these in
	kv := llm.KV{
		"r":     uint32(8),
		"alpha": uint32(16),
	}
	return kv
}

func (p *adapter) Tensors(ts []Tensor) []*llm.Tensor {
	var out []*llm.Tensor
	for _, t := range ts {
		name := p.tensorName(t.Name())

		out = append(out, &llm.Tensor{
			Name:     name,
			Kind:     t.Kind(),
			Shape:    t.Shape(),
			WriterTo: t,
		})
	}

	return out
}

func (p *adapter) tensorName(n string) string {
	return strings.NewReplacer(
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q.weight",
		"self_attn.k_proj", "attn_k.weight",
		"self_attn.v_proj", "attn_v.weight",
		"self_attn.o_proj", "attn_output.weight",
		"lora_a", "loraA",
		"lora_b", "loraB",
		".npy", "",
	).Replace(n)
}
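To see what tensorName produces, here is a tiny standalone sketch using a subset of the rules above (the sample key is hypothetical, shaped like the npz adapter keys the test data suggests):

package main

import (
	"fmt"
	"strings"
)

func main() {
	r := strings.NewReplacer(
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q.weight",
		"lora_a", "loraA",
		".npy", "",
	)
	// prints: blk.0.attn_q.weight.loraA
	fmt.Println(r.Replace("model.layers.0.self_attn.q_proj.lora_a.npy"))
}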
convert/convert_test.go
@@ -1,8 +1,10 @@
 package convert
 
 import (
+	"bytes"
 	"crypto/sha256"
 	"encoding/json"
+	"errors"
 	"flag"
 	"fmt"
 	"io"
@@ -14,6 +16,7 @@ import (
 	"testing"
 
+	"github.com/ollama/ollama/llm"
 	"github.com/stretchr/testify/assert"
 	"golang.org/x/exp/maps"
 )
 
@@ -123,3 +126,72 @@ func TestConvertFull(t *testing.T) {
 		})
 	}
 }
+
+func TestConvertNPZ(t *testing.T) {
+	cases := []string{
+		"adapters.npz",
+	}
+
+	for _, fn := range cases {
+		ts, err := parseNPZ(filepath.Join("testdata", fn))
+		assert.Nil(t, err)
+		assert.Equal(t, len(ts), 16*2*2) // 16 layers, 2 tensors, 2 loras
+
+		a := adapter{}
+
+		for _, m := range ts {
+			at := m.(adapterTensor)
+			assert.Equal(t, at.path, filepath.Join("testdata", fn))
+			assert.Equal(t, at.dtype, "F32") // only float32s supported
+			assert.Equal(t, len(at.tensorBase.shape), 2)
+		}
+
+		var ws io.WriteSeeker = &memWriter{}
+		err = llm.WriteGGLA(ws, a.KV(nil), a.Tensors(ts))
+		assert.Nil(t, err)
+
+		mw := ws.(*memWriter)
+		slog.Info(fmt.Sprintf("buffer len = %d", len(mw.buf)))
+		rs := bytes.NewReader(mw.buf)
+		ggml, _, err := llm.DecodeGGML(rs, len(mw.buf))
+		assert.Nil(t, err)
+		assert.NotNil(t, ggml)
+	}
+}
+
+type memWriter struct {
+	buf []byte
+	pos int
+}
+
+func (m *memWriter) Write(p []byte) (n int, err error) {
+	minCap := m.pos + len(p)
+	if minCap > cap(m.buf) {
+		buf2 := make([]byte, len(m.buf), minCap+len(p)) // add some extra headroom
+		copy(buf2, m.buf)
+		m.buf = buf2
+	}
+	if minCap > len(m.buf) {
+		m.buf = m.buf[:minCap]
+	}
+	copy(m.buf[m.pos:], p)
+	m.pos += len(p)
+	return len(p), nil
+}
+
+func (m *memWriter) Seek(offset int64, whence int) (int64, error) {
+	newPos, offs := 0, int(offset)
+	switch whence {
+	case io.SeekStart:
+		newPos = offs
+	case io.SeekCurrent:
+		newPos = m.pos + offs
+	case io.SeekEnd:
+		newPos = len(m.buf) + offs
+	}
+	if newPos < 0 {
+		return 0, errors.New("negative result pos")
+	}
+	m.pos = newPos
+	return int64(newPos), nil
+}
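memWriter is a minimal in-memory io.WriteSeeker so the test can round-trip WriteGGLA output through DecodeGGML without touching disk. A small standalone sketch of its overwrite-in-place behavior, assuming the memWriter type defined above:

package main

import (
	"fmt"
	"io"
)

func main() {
	var ws io.WriteSeeker = &memWriter{} // the test helper defined above
	ws.Write([]byte("hello world"))
	ws.Seek(0, io.SeekStart)  // rewind to the start
	ws.Write([]byte("HELLO")) // overwrites in place; buffer length stays 11
	fmt.Printf("%s\n", ws.(*memWriter).buf) // prints: HELLO world
}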
convert/reader_npz.go | 128 (new file)
@@ -0,0 +1,128 @@
package convert

import (
	"encoding/binary"
	"fmt"
	"io"
	"log/slog"
	"strings"

	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"
	"github.com/sbinet/npyio/npz"
)

type adapterTensor struct {
	path  string
	dtype string
	*tensorBase
}

func parseNPZ(fn string) ([]Tensor, error) {
	var ts []Tensor

	f, err := npz.Open(fn)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	for _, name := range f.Keys() {
		slog.Info(fmt.Sprintf("reading layer '%s'", name))
		h := f.Header(name)

		shape := make([]uint64, 2)
		for cnt, v := range h.Descr.Shape {
			// llama.cpp expects the loraB layer to be reversed
			if strings.Contains(name, "lora_b") {
				shape[len(shape)-cnt-1] = uint64(v)
			} else {
				shape[cnt] = uint64(v)
			}
		}

		dtypeMap := map[string]string{
			"<f2": "F16",
			"<f4": "F32",
		}
		dtype, ok := dtypeMap[h.Descr.Type]
		if !ok {
			return nil, fmt.Errorf("unknown type '%s' for '%s'", h.Descr.Type, name)
		}

		ts = append(ts, adapterTensor{
			path:  fn,
			dtype: dtype,
			tensorBase: &tensorBase{
				name:  name,
				shape: shape,
			},
		})
	}
	return ts, nil
}

func (t adapterTensor) Kind() uint32 {
	switch t.dtype {
	case "F32":
		return 0
	case "F16":
		return 1
	}
	return 0
}

func (t adapterTensor) WriteTo(w io.Writer) (int64, error) {
	f, err := npz.Open(t.path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	switch t.dtype {
	case "F32":
		var f32s []float32
		err = f.Read(t.tensorBase.name, &f32s)
		if err != nil {
			return 0, err
		}

		// ggla expects the loraB to be transposed
		if strings.Contains(t.tensorBase.name, "lora_b") {
			f32s, err = transpose(f32s, t.tensorBase.shape)
			if err != nil {
				return 0, err
			}
		}

		return 0, binary.Write(w, binary.LittleEndian, f32s)
	default:
		return 0, fmt.Errorf("unknown data type: %s", t.dtype)
	}
}

func transpose(f32s []float32, shape []uint64) ([]float32, error) {
	if len(shape) != 2 {
		return nil, fmt.Errorf("only 2 dimensions supported for transpose")
	}

	// the shape is already backward for loraB
	n := tensor.New(tensor.WithShape(int(shape[1]), int(shape[0])), tensor.WithBacking(f32s))
	if err := n.T(1, 0); err != nil {
		return nil, err
	}
	if err := n.Transpose(); err != nil {
		return nil, err
	}
	ts, err := native.SelectF32(n, 1)
	if err != nil {
		return nil, err
	}
	f32s = make([]float32, 0)
	for _, t := range ts {
		f32s = append(f32s, t...)
	}
	return f32s, nil
}
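The transpose above leans on the pdevine/tensor package, but semantically it just flattens the transpose of a row-major matrix. A dependency-free sketch of the same operation (a hypothetical helper, not the commit's implementation):

// transposeNaive returns the flattened transpose of a row-major
// rows x cols matrix, e.g. [1 2 3 / 4 5 6] (2x3) -> [1 4 / 2 5 / 3 6] (3x2).
func transposeNaive(src []float32, rows, cols int) []float32 {
	out := make([]float32, len(src))
	for i := 0; i < rows; i++ {
		for j := 0; j < cols; j++ {
			out[j*rows+i] = src[i*cols+j]
		}
	}
	return out
}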
convert/testdata/adapters.npz | BIN (vendored, new file)
Binary file not shown.
go.mod | 3
@@ -21,6 +21,7 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
+	github.com/sbinet/npyio v0.9.0
 )
 
 require (
@@ -71,7 +72,7 @@ require (
 	golang.org/x/net v0.25.0 // indirect
 	golang.org/x/sys v0.20.0
 	golang.org/x/term v0.20.0
-	golang.org/x/text v0.15.0 // indirect
+	golang.org/x/text v0.15.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
go.sum | 2
@@ -171,6 +171,8 @@ github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUA
 github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
+github.com/sbinet/npyio v0.9.0 h1:A7h8OyYsOsc+NPRtynRMSf70xSgATZNpamNp8nQ8Tjc=
+github.com/sbinet/npyio v0.9.0/go.mod h1:vgjQEMRTS9aMS9GdXhr+5jounCmGqjDO2JI+IpSokns=
 github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
 github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
 github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
llm/ggla.go | 98
@@ -1,9 +1,12 @@
 package llm
 
 import (
+	"bytes"
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
+	"log/slog"
 	"slices"
 )
 
@@ -16,6 +19,7 @@ func (c *containerGGLA) Name() string {
 }
 
 func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
+	slog.Info("decoding ggla")
 	if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {
 		return nil, err
 	}
@@ -58,7 +62,7 @@ func (llm *ggla) Tensors() Tensors {
 	}
 }
 
-func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
+func (llm *ggla) decode(rs io.ReadSeeker) error {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
 		return err
@@ -87,12 +91,6 @@ func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
 		return err
 	}
 
-	defer func() {
-		if errors.Is(retErr, io.EOF) {
-			retErr = io.ErrUnexpectedEOF
-		}
-	}()
-
 	var namesize uint32
 	if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
 		return err
@@ -123,13 +121,14 @@ func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
 	}
 
 	t.Name = string(name)
+	slog.Info(fmt.Sprintf("%s: [%d, %d] k=%d", t.Name, t.Shape[0], t.Shape[1], t.Kind))
 
 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
 		return err
 	}
 
-	if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
+	if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
 		return err
 	}
 
@@ -146,4 +145,87 @@ func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
 
 	llm.tensors = append(llm.tensors, &t)
 	}
 	return nil
 }
+
+func WriteGGLA(ws io.WriteSeeker, kv KV, ts []*Tensor) error {
+	slog.Debug("writing ggla")
+	if err := binary.Write(ws, binary.LittleEndian, []byte("algg")); err != nil {
+		return err
+	}
+
+	if err := binary.Write(ws, binary.LittleEndian, uint32(1)); err != nil {
+		return err
+	}
+
+	var r uint32
+	var alpha uint32
+	var ok bool
+
+	if r, ok = kv["r"].(uint32); !ok {
+		r = 8
+	}
+
+	if err := binary.Write(ws, binary.LittleEndian, r); err != nil {
+		return err
+	}
+
+	if alpha, ok = kv["alpha"].(uint32); !ok {
+		alpha = 16
+	}
+
+	if err := binary.Write(ws, binary.LittleEndian, alpha); err != nil {
+		return err
+	}
+
+	for _, t := range ts {
+		dims := 0
+		for cnt := 0; cnt < len(t.Shape); cnt++ {
+			if t.Shape[cnt] > 0 {
+				dims++
+			}
+		}
+
+		if err := binary.Write(ws, binary.LittleEndian, uint32(dims)); err != nil {
+			return err
+		}
+
+		if err := binary.Write(ws, binary.LittleEndian, uint32(len(t.Name))); err != nil {
+			return err
+		}
+
+		if err := binary.Write(ws, binary.LittleEndian, t.Kind); err != nil {
+			return err
+		}
+
+		for cnt := 0; cnt < dims; cnt++ {
+			if err := binary.Write(ws, binary.LittleEndian, uint32(t.Shape[dims-1-cnt])); err != nil {
+				return err
+			}
+		}
+
+		if err := binary.Write(ws, binary.LittleEndian, []byte(t.Name)); err != nil {
+			return err
+		}
+
+		offset, err := ws.Seek(0, io.SeekCurrent)
+		if err != nil {
+			return err
+		}
+
+		var alignment int32 = 32
+		pad := gglaPadding(int32(offset), alignment)
+		if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(pad))); err != nil {
+			return err
+		}
+
+		if _, err := t.WriteTo(ws); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func gglaPadding(offset, align int32) int32 {
+	return (align - offset%align) % align
+}
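Both sides of this change compute the same 32-byte alignment: the reader rounds the current offset up with (offset+31)&-32, while the writer pads with gglaPadding(offset, 32) zero bytes. A standalone sketch checking that the two agree:

package main

import "fmt"

func gglaPadding(offset, align int32) int32 {
	return (align - offset%align) % align
}

func main() {
	for _, offset := range []int32{0, 1, 31, 32, 33, 100} {
		pad := gglaPadding(offset, 32)
		rounded := (offset + 31) & -32 // reader-side rounding
		fmt.Printf("offset=%d pad=%d agree=%v\n", offset, pad, offset+pad == rounded)
	}
}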
llm/patches/08-lora.diff | 79 (new file)
@@ -0,0 +1,79 @@
diff --git a/llama.cpp b/llama.cpp
index 61948751..d54fc537 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -15940,6 +15940,20 @@ static int llama_apply_lora_from_file_internal(
         return 1;
     }
 
+    // show tensor data
+    auto show_tensor = [](std::string name, ggml_tensor *t) {
+        LLAMA_LOG_INFO("%s\n", name.c_str());
+
+        for(int i=0; i<3; i++) {
+            for(int j=0; j<3; j++) {
+                float v = ggml_get_f32_nd(t, i, j, 0, 0);
+                LLAMA_LOG_INFO("%.8f ", v);
+            }
+            LLAMA_LOG_INFO(" ...\n");
+        }
+        LLAMA_LOG_INFO(" ...\n");
+    };
+
     // load tensor data
     auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
         read_buf.resize(ggml_nbytes(tensor));
@@ -15950,6 +15964,9 @@ static int llama_apply_lora_from_file_internal(
         load_tensor(metaA, loraA);
         load_tensor(metaB, loraB);
 
+        show_tensor(base_name + ".loraA", loraA);
+        show_tensor(base_name + ".loraB", loraB);
+
         // load base model tensor data
         if (ml) {
             ml->load_data_for(base_t);
@@ -15964,8 +15981,10 @@ static int llama_apply_lora_from_file_internal(
         }
 
         if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+            LLAMA_LOG_ERROR("%s: incompatible tensors: base [%lld, %lld] loraA [%lld, %lld] loraB [%lld, %lld]\n", __func__,
+                base_t->ne[0], base_t->ne[1],
+                loraA->ne[0], loraA->ne[1],
+                loraB->ne[0], loraB->ne[1]);
             ggml_free(lora_ctx);
             ggml_backend_buffer_free(lora_buf);
             ggml_backend_free(backend_cpu);
@@ -15973,15 +15992,19 @@ static int llama_apply_lora_from_file_internal(
         }
 
         auto build_lora_graph = [&]() {
-            // w = w + BA*s
-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            // Wlora = Worig + scaling * BA
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
             ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
-                BA = ggml_scale(lora_ctx, BA, scaling);
+                //BA = ggml_scale(lora_ctx, BA, scaling);
+                BA = ggml_scale(lora_ctx, BA, 20.0);
                 ggml_set_name(BA, "BA_scaled");
             }
 
+            // transpose matrix before we add
+            BA = ggml_cont(lora_ctx, ggml_transpose(lora_ctx, BA));
+
             ggml_tensor * r;
             r = ggml_add_inplace(lora_ctx, base_t, BA);
             ggml_set_name(r, "r_add");
@@ -16009,6 +16032,7 @@ static int llama_apply_lora_from_file_internal(
         }
 
         ggml_backend_graph_compute(backend_cpu, gf);
+        show_tensor("Result " + base_name, r);
 
         ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
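For context on what the patched build_lora_graph computes: LoRA replaces W with W + s·(B·A), where A is r×m, B is n×r, and s is conventionally alpha/r (the patch appears to hardcode 20.0 temporarily in place of the scaling variable while debugging). A dependency-free Go sketch of that update on row-major matrices (illustrative only, not the commit's code; shapes are assumptions):

// applyLoRA adds s * (B x A) to W in place.
// W is n x m, A is r x m, B is n x r, all row-major.
func applyLoRA(W, A, B []float32, n, m, r int, alpha float32) {
	s := alpha / float32(r) // conventional LoRA scaling factor
	for i := 0; i < n; i++ {
		for j := 0; j < m; j++ {
			var dot float32
			for k := 0; k < r; k++ {
				dot += B[i*r+k] * A[k*m+j]
			}
			W[i*m+j] += s * dot
		}
	}
}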