feed the linter again + llama.cpp patches

Patrick Devine 2024-07-08 17:03:13 -07:00
parent a3058002c4
commit 8ba3f38f82
2 changed files with 57 additions and 36 deletions

View File

@@ -16,7 +16,6 @@ import (
 	"testing"
 	"github.com/ollama/ollama/llm"
-	"github.com/stretchr/testify/assert"
 	"golang.org/x/exp/maps"
 )
@@ -134,44 +133,67 @@ func TestConvertNPZ(t *testing.T) {
 	for _, fn := range cases {
 		ts, err := parseNPZ(filepath.Join("testdata", fn))
-		assert.NoError(t, err)
-		assert.Len(t, ts, 16*2*2) // 16 layers, 2 tensors, 2 loras
+		if err != nil {
+			t.Fatal(err)
+		}
+		if len(ts) != 16*2*2 {
+			t.Errorf("got: %d want: %d total layers", len(ts), 16*2*2)
+		}
 		a := adapter{}
 		for _, m := range ts {
 			at := m.(adapterTensor)
-			assert.Equal(t, filepath.Join("testdata", fn), at.path)
-			assert.Equal(t, "F32", at.dtype) // only float32s supported
-			assert.Len(t, at.tensorBase.shape, 2)
+			if at.path != filepath.Join("testdata", fn) {
+				t.Errorf("got: %s want: %s", at.path, filepath.Join("testdata", fn))
+			}
+			if at.dtype != "F32" {
+				t.Errorf("got: %s but only F32s are currently supported", at.dtype)
+			}
+			if len(at.tensorBase.shape) != 2 {
+				t.Errorf("got: %d want: %d tensor shape", at.tensorBase.shape, 2)
+			}
 		}
 		var ws io.WriteSeeker = &memWriter{}
 		err = llm.WriteGGLA(ws, a.KV(nil), a.Tensors(ts))
-		assert.NoError(t, err)
+		if err != nil {
+			t.Fatal(err)
+		}
 		mw := ws.(*memWriter)
 		slog.Info(fmt.Sprintf("buffer len = %d", len(mw.buf)))
-		assert.NotEmpty(t, mw.buf)
+		if len(mw.buf) == 0 {
+			t.Errorf("ggla layer not written correctly")
+		}
 		rs := bytes.NewReader(mw.buf)
 		ggml, _, err := llm.DecodeGGML(rs, len(mw.buf))
-		assert.NoError(t, err, "decode ggml failed")
-		assert.NotNil(t, ggml, "ggml was empty")
+		if err != nil {
+			t.Fatal(err)
+		}
+		if ggml == nil {
+			t.Errorf("ggla didn't convert to ggml correctly")
+		}
 		kv := ggml.KV()
-		assert.NotNil(t, kv, "lora KVs not found")
+		if kv == nil {
+			t.Errorf("no lora KVs were set")
+		}
 		r, ok := kv["r"]
-		assert.Equal(t, true, ok, "lora rank not set")
-		assert.Equal(t, uint32(8), r, "lora rank was incorrect")
+		if !ok || r != uint32(8) {
+			t.Errorf("lora rank was not set correctly")
+		}
 		alpha, ok := kv["alpha"]
-		assert.Equal(t, true, ok, "lora alpha not set")
-		assert.Equal(t, uint32(160), alpha, "lora alpha value was incorrect")
+		if !ok || alpha != uint32(160) {
+			t.Errorf("lora alpha was not set correctly")
+		}
 		gts := ggml.Tensors()
-		assert.NotNil(t, gts, "no tensors found")
-		assert.Equal(t, len(ts), len(gts.Items))
+		if len(ts) != len(gts.Items) {
+			t.Errorf("got: %d want: %d tensors in ggla", len(gts.Items), len(ts))
+		}
 	}
 }
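
Note: llm.WriteGGLA is driven through an io.WriteSeeker backed by memory, and the test then inspects mw.buf; Seek is presumably needed so the writer can go back and patch offsets after emitting tensor data. The memWriter type itself is not part of this diff. A minimal sketch of what such an in-memory io.WriteSeeker can look like (only the buf field is visible in this commit; the pos field and the growth logic are assumptions):

// Sketch only: an in-memory io.WriteSeeker in the spirit of the memWriter
// used by the test above.
package sketch

import (
	"errors"
	"io"
)

type memWriter struct {
	buf []byte // bytes written so far; the test reads this back via mw.buf
	pos int    // current write offset (assumed field)
}

func (m *memWriter) Write(p []byte) (int, error) {
	// grow the buffer if the write runs past the current end
	if end := m.pos + len(p); end > len(m.buf) {
		m.buf = append(m.buf, make([]byte, end-len(m.buf))...)
	}
	copy(m.buf[m.pos:], p)
	m.pos += len(p)
	return len(p), nil
}

func (m *memWriter) Seek(offset int64, whence int) (int64, error) {
	var next int64
	switch whence {
	case io.SeekStart:
		next = offset
	case io.SeekCurrent:
		next = int64(m.pos) + offset
	case io.SeekEnd:
		next = int64(len(m.buf)) + offset
	default:
		return 0, errors.New("memWriter: invalid whence")
	}
	if next < 0 {
		return 0, errors.New("memWriter: negative offset")
	}
	m.pos = int(next)
	return next, nil
}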

View File

@@ -1,8 +1,17 @@
-diff --git a/llama.cpp b/llama.cpp
-index 61948751..d54fc537 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -15940,6 +15940,20 @@ static int llama_apply_lora_from_file_internal(
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 4f6cd687..b8c6896b 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -189,3 +189,4 @@ if (LLAMA_BUILD_EXAMPLES)
+ add_subdirectory(examples)
+ add_subdirectory(pocs)
+ endif()
++add_subdirectory(../ext_server ext_server) # ollama
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 2b9ace28..b0151571 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -18609,6 +18609,20 @@ static int llama_apply_lora_from_file_internal(
  return 1;
  }
@@ -23,7 +32,7 @@ index 61948751..d54fc537 100644
  // load tensor data
  auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
  read_buf.resize(ggml_nbytes(tensor));
-@@ -15950,6 +15964,9 @@ static int llama_apply_lora_from_file_internal(
+@@ -18619,6 +18633,9 @@ static int llama_apply_lora_from_file_internal(
  load_tensor(metaA, loraA);
  load_tensor(metaB, loraB);
@@ -33,7 +42,7 @@ index 61948751..d54fc537 100644
  // load base model tensor data
  if (ml) {
  ml->load_data_for(base_t);
-@@ -15964,8 +15981,10 @@ static int llama_apply_lora_from_file_internal(
+@@ -18633,8 +18650,10 @@ static int llama_apply_lora_from_file_internal(
  }
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
@@ -46,13 +55,11 @@ index 61948751..d54fc537 100644
  ggml_free(lora_ctx);
  ggml_backend_buffer_free(lora_buf);
  ggml_backend_free(backend_cpu);
-@@ -15973,15 +15992,19 @@ static int llama_apply_lora_from_file_internal(
- }
+@@ -18643,14 +18662,18 @@ static int llama_apply_lora_from_file_internal(
  auto build_lora_graph = [&]() {
-- // w = w + BA*s
+ // w = w + BA*s
 - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
 + // Wlora = Worig + scaling * BA
 + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
  ggml_set_name(BA, "BA");
@@ -69,11 +76,3 @@ index 61948751..d54fc537 100644
  ggml_tensor * r;
  r = ggml_add_inplace(lora_ctx, base_t, BA);
  ggml_set_name(r, "r_add");
-@@ -16009,6 +16032,7 @@ static int llama_apply_lora_from_file_internal(
- }
- ggml_backend_graph_compute(backend_cpu, gf);
-+ show_tensor("Result " + base_name, r);
- ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
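
Note: the build_lora_graph hunk above rewrites the comment to "Wlora = Worig + scaling * BA" and swaps the ggml_mul_mat operand order from (loraA, loraB) to (loraB, loraA). In plain terms the patched graph is meant to compute the usual LoRA update, where scaling is conventionally alpha / r (20 for the r=8, alpha=160 adapter checked by the Go test above). A small reference sketch of that arithmetic, not ollama or llama.cpp code:

// Reference sketch of Wlora = Worig + scaling * (B x A), with scaling = alpha / r.
package main

import "fmt"

func applyLoRA(w, a, b [][]float32, alpha, r float32) {
	scaling := alpha / r
	for i := range w {
		for j := range w[i] {
			var ba float32
			for k := range b[i] {
				ba += b[i][k] * a[k][j] // (B x A)[i][j]
			}
			w[i][j] += scaling * ba
		}
	}
}

func main() {
	w := [][]float32{{1, 0}, {0, 1}} // base weights (2x2)
	b := [][]float32{{1}, {0}}       // B is d_out x r, with r = 1 for brevity
	a := [][]float32{{0.5, 0.25}}    // A is r x d_in
	applyLoRA(w, a, b, 2, 1)         // scaling = alpha/r = 2
	fmt.Println(w)                   // [[2 0.5] [0 1]]
}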