diff --git a/convert/convert_adapter.go b/convert/convert_adapter.go
new file mode 100644
index 000000000..6f27fd445
--- /dev/null
+++ b/convert/convert_adapter.go
@@ -0,0 +1,51 @@
+package convert
+
+import (
+    "strings"
+
+    "github.com/ollama/ollama/llm"
+)
+
+type adapter struct {
+    Parameters
+}
+
+var _ Converter = (*adapter)(nil)
+
+func (p *adapter) KV(t *Tokenizer) llm.KV {
+    // todo - need a way to pass these in
+    kv := llm.KV{
+        "r":     uint32(8),
+        "alpha": uint32(16),
+    }
+    return kv
+}
+
+func (p *adapter) Tensors(ts []Tensor) []*llm.Tensor {
+    var out []*llm.Tensor
+    for _, t := range ts {
+        name := p.tensorName(t.Name())
+
+        out = append(out, &llm.Tensor{
+            Name:     name,
+            Kind:     t.Kind(),
+            Shape:    t.Shape(),
+            WriterTo: t,
+        })
+    }
+
+    return out
+}
+
+func (p *adapter) tensorName(n string) string {
+    return strings.NewReplacer(
+        "model.layers", "blk",
+        "self_attn.q_proj", "attn_q.weight",
+        "self_attn.k_proj", "attn_k.weight",
+        "self_attn.v_proj", "attn_v.weight",
+        "self_attn.o_proj", "attn_output.weight",
+        "lora_a", "loraA",
+        "lora_b", "loraB",
+        ".npy", "",
+    ).Replace(n)
+}
diff --git a/convert/convert_test.go b/convert/convert_test.go
index 0fbd436f5..c4fd5dbd5 100644
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -1,8 +1,10 @@
 package convert
 
 import (
+    "bytes"
     "crypto/sha256"
     "encoding/json"
+    "errors"
     "flag"
     "fmt"
     "io"
@@ -14,6 +16,7 @@ import (
     "testing"
 
     "github.com/ollama/ollama/llm"
+    "github.com/stretchr/testify/assert"
     "golang.org/x/exp/maps"
 )
 
@@ -123,3 +126,72 @@ func TestConvertFull(t *testing.T) {
         })
     }
 }
+
+func TestConvertNPZ(t *testing.T) {
+    cases := []string{
+        "adapters.npz",
+    }
+
+    for _, fn := range cases {
+        ts, err := parseNPZ(filepath.Join("testdata", fn))
+        assert.Nil(t, err)
+        assert.Equal(t, len(ts), 16*2*2) // 16 layers, 2 tensors, 2 loras
+
+        a := adapter{}
+
+        for _, m := range ts {
+            at := m.(adapterTensor)
+            assert.Equal(t, at.path, filepath.Join("testdata", fn))
+            assert.Equal(t, at.dtype, "F32") // only float32s supported
+            assert.Equal(t, len(at.tensorBase.shape), 2)
+        }
+
+        var ws io.WriteSeeker = &memWriter{}
+        err = llm.WriteGGLA(ws, a.KV(nil), a.Tensors(ts))
+        assert.Nil(t, err)
+
+        mw := ws.(*memWriter)
+        slog.Info(fmt.Sprintf("buffer len = %d", len(mw.buf)))
+        rs := bytes.NewReader(mw.buf)
+        ggml, _, err := llm.DecodeGGML(rs, len(mw.buf))
+        assert.Nil(t, err)
+        assert.NotNil(t, ggml)
+    }
+}
+
+type memWriter struct {
+    buf []byte
+    pos int
+}
+
+func (m *memWriter) Write(p []byte) (n int, err error) {
+    minCap := m.pos + len(p)
+    if minCap > cap(m.buf) {
+        buf2 := make([]byte, len(m.buf), minCap+len(p)) // add some extra
+        copy(buf2, m.buf)
+        m.buf = buf2
+    }
+    if minCap > len(m.buf) {
+        m.buf = m.buf[:minCap]
+    }
+    copy(m.buf[m.pos:], p)
+    m.pos += len(p)
+    return len(p), nil
+}
+
+func (m *memWriter) Seek(offset int64, whence int) (int64, error) {
+    newPos, offs := 0, int(offset)
+    switch whence {
+    case io.SeekStart:
+        newPos = offs
+    case io.SeekCurrent:
+        newPos = m.pos + offs
+    case io.SeekEnd:
+        newPos = len(m.buf) + offs
+    }
+    if newPos < 0 {
+        return 0, errors.New("negative result pos")
+    }
+    m.pos = newPos
+    return int64(newPos), nil
+}
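As a concrete example of the name mapping (illustrative, not part of the change set): tensorName in convert_adapter.go rewrites the MLX-style keys found inside adapters.npz into the names llama.cpp's adapter loader expects. A standalone sketch using the same replacer table:

package main

import (
    "fmt"
    "strings"
)

// Mirrors the strings.NewReplacer table from convert_adapter.go.
func main() {
    r := strings.NewReplacer(
        "model.layers", "blk",
        "self_attn.q_proj", "attn_q.weight",
        "self_attn.k_proj", "attn_k.weight",
        "self_attn.v_proj", "attn_v.weight",
        "self_attn.o_proj", "attn_output.weight",
        "lora_a", "loraA",
        "lora_b", "loraB",
        ".npy", "",
    )

    fmt.Println(r.Replace("model.layers.0.self_attn.q_proj.lora_a.npy"))
    // blk.0.attn_q.weight.loraA
}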
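TestConvertNPZ hands WriteGGLA an io.WriteSeeker rather than a plain writer because the GGLA writer seeks to compute alignment padding. A minimal standalone sketch of that pattern, using a trimmed-down copy of the memWriter helper (the simplified Write here grows via append instead of the test's explicit reallocation):

package main

import (
    "fmt"
    "io"
)

// A growable in-memory buffer that satisfies io.WriteSeeker, so a
// writer that seeks needs no temp file.
type memWriter struct {
    buf []byte
    pos int
}

func (m *memWriter) Write(p []byte) (int, error) {
    if n := m.pos + len(p); n > len(m.buf) {
        m.buf = append(m.buf, make([]byte, n-len(m.buf))...)
    }
    copy(m.buf[m.pos:], p)
    m.pos += len(p)
    return len(p), nil
}

func (m *memWriter) Seek(offset int64, whence int) (int64, error) {
    pos := int(offset)
    switch whence {
    case io.SeekCurrent:
        pos += m.pos
    case io.SeekEnd:
        pos += len(m.buf)
    }
    if pos < 0 {
        return 0, fmt.Errorf("negative position %d", pos)
    }
    m.pos = pos
    return int64(pos), nil
}

func main() {
    var ws io.WriteSeeker = &memWriter{}
    ws.Write([]byte("algg"))     // header bytes
    ws.Seek(32, io.SeekStart)    // jump to an aligned offset
    ws.Write([]byte{1, 2, 3, 4}) // aligned payload
    mw := ws.(*memWriter)
    fmt.Printf("len=%d pos=%d\n", len(mw.buf), mw.pos) // len=36 pos=36
}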
diff --git a/convert/reader_npz.go b/convert/reader_npz.go
new file mode 100644
index 000000000..f2f225ae2
--- /dev/null
+++ b/convert/reader_npz.go
@@ -0,0 +1,128 @@
+package convert
+
+import (
+    "encoding/binary"
+    "fmt"
+    "io"
+    "log/slog"
+    "strings"
+
+    "github.com/pdevine/tensor"
+    "github.com/pdevine/tensor/native"
+    "github.com/sbinet/npyio/npz"
+)
+
+type adapterTensor struct {
+    path  string
+    dtype string
+    *tensorBase
+}
+
+// parseNPZ reads the LoRA tensors from a numpy .npz archive
+func parseNPZ(fn string) ([]Tensor, error) {
+    var ts []Tensor
+
+    f, err := npz.Open(fn)
+    if err != nil {
+        return nil, err
+    }
+    defer f.Close()
+
+    for _, name := range f.Keys() {
+        slog.Info(fmt.Sprintf("reading layer '%s'", name))
+        h := f.Header(name)
+
+        shape := make([]uint64, 2)
+        for cnt, v := range h.Descr.Shape {
+            // llamacpp expects the loraB layer to be reversed
+            if strings.Contains(name, "lora_b") {
+                shape[len(shape)-cnt-1] = uint64(v)
+            } else {
+                shape[cnt] = uint64(v)
+            }
+        }
+
+        dtypeMap := map[string]string{
+            "<f4": "F32",
+        }
+
+        dtype, ok := dtypeMap[h.Descr.Type]
+        if !ok {
+            return nil, fmt.Errorf("unsupported dtype '%s' for tensor '%s'", h.Descr.Type, name)
+        }
+
+        ts = append(ts, adapterTensor{
+            path:  fn,
+            dtype: dtype,
+            tensorBase: &tensorBase{
+                name:  name,
+                shape: shape,
+            },
+        })
+    }
+
+    return ts, nil
+}
+
+func (t adapterTensor) Kind() uint32 {
+    switch t.dtype {
+    case "F32":
+        return 0
+    }
+    // parseNPZ only emits float32 tensors
+    return 0
+}
+
+func (t adapterTensor) WriteTo(w io.Writer) (int64, error) {
+    f, err := npz.Open(t.path)
+    if err != nil {
+        return 0, err
+    }
+    defer f.Close()
+
+    var f32s []float32
+    if err := f.Read(t.name, &f32s); err != nil {
+        return 0, err
+    }
+
+    // the loraB shape was reversed when it was read, so
+    // transpose the data to match before writing it out
+    if strings.Contains(t.name, "lora_b") {
+        f32s, err = transpose(f32s, t.shape)
+        if err != nil {
+            return 0, err
+        }
+    }
+
+    if err := binary.Write(w, binary.LittleEndian, f32s); err != nil {
+        return 0, err
+    }
+
+    return int64(len(f32s) * 4), nil
+}
+
+func transpose(f32s []float32, shape []uint64) ([]float32, error) {
+    // the data on disk is shape[1] rows by shape[0] columns;
+    // swap them so the result matches shape
+    n := tensor.New(tensor.WithShape(int(shape[1]), int(shape[0])), tensor.WithBacking(f32s))
+    if err := n.T(1, 0); err != nil {
+        return nil, err
+    }
+
+    d, ok := n.Materialize().(*tensor.Dense)
+    if !ok {
+        return nil, fmt.Errorf("failed to materialize tensor")
+    }
+
+    rows, err := native.SelectF32(d, 1)
+    if err != nil {
+        return nil, err
+    }
+
+    var out []float32
+    for _, row := range rows {
+        out = append(out, row...)
+    }
+
+    return out, nil
+}
diff --git a/llm/ggla_writer.go b/llm/ggla_writer.go
new file mode 100644
--- /dev/null
+++ b/llm/ggla_writer.go
@@ -0,0 +1,81 @@
+package llm
+
+import (
+    "bytes"
+    "encoding/binary"
+    "io"
+)
+
+// WriteGGLA writes a LoRA adapter to ws in the GGLA format
+// that llama.cpp's adapter loader understands.
+func WriteGGLA(ws io.WriteSeeker, kv KV, ts []*Tensor) error {
+    // the magic is the string "ggla" written byte-reversed
+    if err := binary.Write(ws, binary.LittleEndian, []byte("algg")); err != nil {
+        return err
+    }
+
+    // format version
+    if err := binary.Write(ws, binary.LittleEndian, uint32(1)); err != nil {
+        return err
+    }
+
+    if err := binary.Write(ws, binary.LittleEndian, kv["r"].(uint32)); err != nil {
+        return err
+    }
+
+    if err := binary.Write(ws, binary.LittleEndian, kv["alpha"].(uint32)); err != nil {
+        return err
+    }
+
+    for _, t := range ts {
+        var dims int
+        for _, s := range t.Shape {
+            if s > 0 {
+                dims++
+            }
+        }
+
+        if err := binary.Write(ws, binary.LittleEndian, uint32(dims)); err != nil {
+            return err
+        }
+
+        if err := binary.Write(ws, binary.LittleEndian, uint32(len(t.Name))); err != nil {
+            return err
+        }
+
+        if err := binary.Write(ws, binary.LittleEndian, t.Kind); err != nil {
+            return err
+        }
+
+        for cnt := 0; cnt < dims; cnt++ {
+            if err := binary.Write(ws, binary.LittleEndian, uint32(t.Shape[dims-1-cnt])); err != nil {
+                return err
+            }
+        }
+
+        if err := binary.Write(ws, binary.LittleEndian, []byte(t.Name)); err != nil {
+            return err
+        }
+
+        offset, err := ws.Seek(0, io.SeekCurrent)
+        if err != nil {
+            return err
+        }
+
+        var alignment int32 = 32
+        pad := gglaPadding(int32(offset), alignment)
+        if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(pad))); err != nil {
+            return err
+        }
+
+        if _, err := t.WriteTo(ws); err != nil {
+            return err
+        }
+    }
+
+    return nil
+}
+
+func gglaPadding(offset, align int32) int32 {
+    return (align - offset%align) % align
+}
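For reference, llama.cpp's LoRA loader (llama_apply_lora_from_file_internal) begins by reading a fixed header: the magic 0x67676C61 ("ggla", stored byte-reversed), a version word, then r and alpha as little-endian uint32s. A self-contained sketch (illustration only) that writes that header the way WriteGGLA does and decodes it back:

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
)

// Round-trips the fixed ggla header for r=8, alpha=16.
func main() {
    var hdr bytes.Buffer
    hdr.Write([]byte("algg"))                           // magic, byte-reversed "ggla"
    binary.Write(&hdr, binary.LittleEndian, uint32(1))  // version
    binary.Write(&hdr, binary.LittleEndian, uint32(8))  // r
    binary.Write(&hdr, binary.LittleEndian, uint32(16)) // alpha

    var magic, version, r, alpha uint32
    rd := bytes.NewReader(hdr.Bytes())
    binary.Read(rd, binary.LittleEndian, &magic)
    binary.Read(rd, binary.LittleEndian, &version)
    binary.Read(rd, binary.LittleEndian, &r)
    binary.Read(rd, binary.LittleEndian, &alpha)

    fmt.Printf("magic=%#x version=%d r=%d alpha=%d\n", magic, version, r, alpha)
    // magic=0x67676c61 version=1 r=8 alpha=16
}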
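The padding helper rounds each tensor's data offset up to a 32-byte boundary, returning zero when the offset is already aligned. A quick standalone check of the arithmetic on a few offsets:

package main

import "fmt"

// Same formula as gglaPadding above.
func gglaPadding(offset, align int32) int32 {
    return (align - offset%align) % align
}

func main() {
    for _, off := range []int32{0, 1, 31, 32, 33, 100} {
        fmt.Printf("offset %3d -> pad %2d\n", off, gglaPadding(off, 32))
    }
    // offset   0 -> pad  0
    // offset   1 -> pad 31
    // offset  31 -> pad  1
    // offset  32 -> pad  0
    // offset  33 -> pad 31
    // offset 100 -> pad 28
}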
diff --git a/llm/patches/08-lora.diff b/llm/patches/08-lora.diff
new file mode 100644
index 000000000..6a14c4336
--- /dev/null
+++ b/llm/patches/08-lora.diff
@@ -0,0 +1,79 @@
+diff --git a/llama.cpp b/llama.cpp
+index 61948751..d54fc537 100644
+--- a/llama.cpp
++++ b/llama.cpp
+@@ -15940,6 +15940,20 @@ static int llama_apply_lora_from_file_internal(
+         return 1;
+     }
+ 
++    // show tensor data
++    auto show_tensor = [](std::string name, ggml_tensor *t) {
++        LLAMA_LOG_INFO("%s\n", name.c_str());
++
++        for(int i=0; i<3; i++) {
++            for(int j=0; j<3; j++) {
++                float v = ggml_get_f32_nd(t, i, j, 0, 0);
++                LLAMA_LOG_INFO("%.8f ", v);
++            }
++            LLAMA_LOG_INFO(" ...\n");
++        }
++        LLAMA_LOG_INFO(" ...\n");
++    };
++
+     // load tensor data
+     auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
+         read_buf.resize(ggml_nbytes(tensor));
+@@ -15950,6 +15964,9 @@ static int llama_apply_lora_from_file_internal(
+         load_tensor(metaA, loraA);
+         load_tensor(metaB, loraB);
+ 
++        show_tensor(base_name + ".loraA", loraA);
++        show_tensor(base_name + ".loraB", loraB);
++
+         // load base model tensor data
+         if (ml) {
+             ml->load_data_for(base_t);
+@@ -15964,8 +15981,10 @@ static int llama_apply_lora_from_file_internal(
+         }
+ 
+         if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+-            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+-                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
++            LLAMA_LOG_ERROR("%s: incompatible tensors: base [%lld, %lld] loraA [%lld, %lld] loraB [%lld, %lld]\n", __func__,
++                            base_t->ne[0], base_t->ne[1],
++                            loraA->ne[0], loraA->ne[1],
++                            loraB->ne[0], loraB->ne[1]);
+             ggml_free(lora_ctx);
+             ggml_backend_buffer_free(lora_buf);
+             ggml_backend_free(backend_cpu);
+@@ -15973,15 +15992,19 @@ static int llama_apply_lora_from_file_internal(
+         }
+ 
+         auto build_lora_graph = [&]() {
+-            // w = w + BA*s
+-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
++            // Wlora = Worig + scaling * BA
++            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
+             ggml_set_name(BA, "BA");
+ 
+             if (scaling != 1.0f) {
+-                BA = ggml_scale(lora_ctx, BA, scaling);
++                //BA = ggml_scale(lora_ctx, BA, scaling);
++                BA = ggml_scale(lora_ctx, BA, 20.0);
+                 ggml_set_name(BA, "BA_scaled");
+             }
+ 
++            // transpose matrix before we add
++            BA = ggml_cont(lora_ctx, ggml_transpose(lora_ctx, BA));
++
+             ggml_tensor * r;
+             r = ggml_add_inplace(lora_ctx, base_t, BA);
+             ggml_set_name(r, "r_add");
+@@ -16009,6 +16032,7 @@ static int llama_apply_lora_from_file_internal(
+         }
+ 
+         ggml_backend_graph_compute(backend_cpu, gf);
++        show_tensor("Result " + base_name, r);
+ 
+         ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
+
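For intuition about what the patch computes: the adapter update is W += scaling * (B x A), where B is n_out x r, A is r x n_in, and scaling = alpha / r; the hard-coded 20.0 above is plainly a temporary debugging override of that scaling, and the added transpose puts the delta in the base weight's layout. A small self-contained Go sketch of the arithmetic, with toy dimensions chosen purely for illustration:

package main

import "fmt"

// Naive row-major matrix multiply: a is m x k, b is k x n.
func matmul(a, b [][]float32) [][]float32 {
    out := make([][]float32, len(a))
    for i := range a {
        out[i] = make([]float32, len(b[0]))
        for k := range b {
            for j := range b[0] {
                out[i][j] += a[i][k] * b[k][j]
            }
        }
    }
    return out
}

func main() {
    const r, alpha = 2, 16
    scaling := float32(alpha) / float32(r) // 8.0

    B := [][]float32{{1, 0}, {0, 1}, {1, 1}}     // n_out=3 x r=2
    A := [][]float32{{1, 2, 0, 0}, {0, 0, 3, 4}} // r=2 x n_in=4

    // deltaW = scaling * (B x A); 3x4, the base weight's shape
    delta := matmul(B, A)
    for i := range delta {
        for j := range delta[i] {
            delta[i][j] *= scaling
        }
    }
    fmt.Println(delta) // [[8 16 0 0] [0 0 24 32] [8 16 24 32]]
}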