diff --git a/convert/convert_test.go b/convert/convert_test.go
index abb1c7712..619cb10cd 100644
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -16,7 +16,6 @@ import (
 	"testing"
 
 	"github.com/ollama/ollama/llm"
-	"github.com/stretchr/testify/assert"
 	"golang.org/x/exp/maps"
 )
 
@@ -134,44 +133,67 @@ func TestConvertNPZ(t *testing.T) {
 
 	for _, fn := range cases {
 		ts, err := parseNPZ(filepath.Join("testdata", fn))
-		assert.NoError(t, err)
-		assert.Len(t, ts, 16*2*2) // 16 layers, 2 tensors, 2 loras
+		if err != nil {
+			t.Fatal(err)
+		}
+		if len(ts) != 16*2*2 {
+			t.Errorf("got: %d want: %d total tensors", len(ts), 16*2*2)
+		}
 
 		a := adapter{}
 
 		for _, m := range ts {
 			at := m.(adapterTensor)
-			assert.Equal(t, filepath.Join("testdata", fn), at.path)
-			assert.Equal(t, "F32", at.dtype) // only float32s supported
-			assert.Len(t, at.tensorBase.shape, 2)
+			if at.path != filepath.Join("testdata", fn) {
+				t.Errorf("got: %s want: %s", at.path, filepath.Join("testdata", fn))
+			}
+			if at.dtype != "F32" {
+				t.Errorf("got: %s but only F32s are currently supported", at.dtype)
+			}
+			if len(at.tensorBase.shape) != 2 {
+				t.Errorf("got: %d want: %d tensor shape dimensions", len(at.tensorBase.shape), 2)
+			}
 		}
 
 		var ws io.WriteSeeker = &memWriter{}
 		err = llm.WriteGGLA(ws, a.KV(nil), a.Tensors(ts))
-		assert.NoError(t, err)
+		if err != nil {
+			t.Fatal(err)
+		}
 
 		mw := ws.(*memWriter)
 		slog.Info(fmt.Sprintf("buffer len = %d", len(mw.buf)))
-		assert.NotEmpty(t, mw.buf)
+		if len(mw.buf) == 0 {
+			t.Errorf("ggla layer not written correctly")
+		}
 
 		rs := bytes.NewReader(mw.buf)
 
 		ggml, _, err := llm.DecodeGGML(rs, len(mw.buf))
-		assert.NoError(t, err, "decode ggml failed")
-		assert.NotNil(t, ggml, "ggml was empty")
+		if err != nil {
+			t.Fatal(err)
+		}
+		if ggml == nil {
+			t.Fatal("ggla didn't convert to ggml correctly")
+		}
 
 		kv := ggml.KV()
-		assert.NotNil(t, kv, "lora KVs not found")
+		if kv == nil {
+			t.Errorf("no lora KVs were set")
+		}
 
 		r, ok := kv["r"]
-		assert.Equal(t, true, ok, "lora rank not set")
-		assert.Equal(t, uint32(8), r, "lora rank was incorrect")
+		if !ok || r != uint32(8) {
+			t.Errorf("lora rank was not set correctly")
+		}
 
 		alpha, ok := kv["alpha"]
-		assert.Equal(t, true, ok, "lora alpha not set")
-		assert.Equal(t, uint32(160), alpha, "lora alpha value was incorrect")
+		if !ok || alpha != uint32(160) {
+			t.Errorf("lora alpha was not set correctly")
+		}
 
 		gts := ggml.Tensors()
-		assert.NotNil(t, gts, "no tensors found")
-		assert.Equal(t, len(ts), len(gts.Items))
+		if len(ts) != len(gts.Items) {
+			t.Errorf("got: %d want: %d tensors in ggla", len(gts.Items), len(ts))
+		}
 	}
 }
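
Note on the test rewrite above: it replaces testify with the standard library's got/want style, using t.Fatal where nothing after the failure can meaningfully run (parse/decode errors) and t.Errorf where the test can keep checking. A minimal self-contained sketch of that convention, reusing the real parseNPZ helper from this package (the test name and file path here are illustrative only):

    package convert

    import "testing"

    func TestTensorCount(t *testing.T) {
        ts, err := parseNPZ("testdata/adapter.npz")
        if err != nil {
            t.Fatal(err) // fatal: every check below needs ts
        }
        if got, want := len(ts), 16*2*2; got != want {
            t.Errorf("got: %d want: %d total tensors", got, want) // non-fatal: keep checking
        }
    }
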
diff --git a/llm/patches/08-lora.diff b/llm/patches/10-lora.diff
similarity index 77%
rename from llm/patches/08-lora.diff
rename to llm/patches/10-lora.diff
index 6a14c4336..83695c1e3 100644
--- a/llm/patches/08-lora.diff
+++ b/llm/patches/10-lora.diff
@@ -1,8 +1,17 @@
-diff --git a/llama.cpp b/llama.cpp
-index 61948751..d54fc537 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -15940,6 +15940,20 @@ static int llama_apply_lora_from_file_internal(
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 4f6cd687..b8c6896b 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -189,3 +189,4 @@ if (LLAMA_BUILD_EXAMPLES)
+     add_subdirectory(examples)
+     add_subdirectory(pocs)
+ endif()
++add_subdirectory(../ext_server ext_server) # ollama
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 2b9ace28..b0151571 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -18609,6 +18609,20 @@ static int llama_apply_lora_from_file_internal(
      return 1;
  }
 
@@ -23,7 +32,7 @@ index 61948751..d54fc537 100644
      // load tensor data
      auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
          read_buf.resize(ggml_nbytes(tensor));
-@@ -15950,6 +15964,9 @@ static int llama_apply_lora_from_file_internal(
+@@ -18619,6 +18633,9 @@ static int llama_apply_lora_from_file_internal(
      load_tensor(metaA, loraA);
      load_tensor(metaB, loraB);
 
@@ -33,7 +42,7 @@ index 61948751..d54fc537 100644
      // load base model tensor data
      if (ml) {
          ml->load_data_for(base_t);
-@@ -15964,8 +15981,10 @@ static int llama_apply_lora_from_file_internal(
+@@ -18633,8 +18650,10 @@ static int llama_apply_lora_from_file_internal(
      }
 
      if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
@@ -46,13 +55,11 @@ index 61948751..d54fc537 100644
      ggml_free(lora_ctx);
      ggml_backend_buffer_free(lora_buf);
      ggml_backend_free(backend_cpu);
-@@ -15973,15 +15992,19 @@ static int llama_apply_lora_from_file_internal(
-    }
+@@ -18643,14 +18662,18 @@ static int llama_apply_lora_from_file_internal(
 
      auto build_lora_graph = [&]() {
--        // w = w + BA*s
+         // w = w + BA*s
-         ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+         // Wlora = Worig + scaling * BA
+         ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
          ggml_set_name(BA, "BA");
 
          ggml_tensor * r;
          r = ggml_add_inplace(lora_ctx, base_t, BA);
          ggml_set_name(r, "r_add");
-@@ -16009,6 +16032,7 @@ static int llama_apply_lora_from_file_internal(
-    }
 
      ggml_backend_graph_compute(backend_cpu, gf);
+     show_tensor("Result " + base_name, r);
 
      ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
-
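
Note on the llama.cpp patch above: the substantive change is the argument swap in ggml_mul_mat, matching the patch's own comment Wlora = Worig + scaling * BA. Here loraA is rank × d_in, loraB is d_out × rank, and in the usual LoRA formulation scaling is alpha / r (with the KVs the test expects, 160 / 8 = 20). A minimal plain-Go sketch of that update on dense matrices — illustrative only; it mirrors the math in the comment, not ggml's tensor layout or the ggml_mul_mat calling convention:

    package main

    import "fmt"

    // matmul multiplies an m×k matrix a by a k×n matrix b.
    func matmul(a, b [][]float64) [][]float64 {
        m, k, n := len(a), len(b), len(b[0])
        out := make([][]float64, m)
        for i := 0; i < m; i++ {
            out[i] = make([]float64, n)
            for p := 0; p < k; p++ {
                for j := 0; j < n; j++ {
                    out[i][j] += a[i][p] * b[p][j]
                }
            }
        }
        return out
    }

    func main() {
        const rank, alpha = 1, 2.0 // toy values; the test above expects r=8, alpha=160
        scaling := alpha / float64(rank)

        w := [][]float64{{1, 0}, {0, 1}}    // base weight Worig, d_out × d_in
        loraB := [][]float64{{0.5}, {-0.5}} // d_out × rank
        loraA := [][]float64{{1, 2}}        // rank × d_in

        // Wlora = Worig + scaling * (B·A): B multiplies on the left,
        // which is what the loraB/loraA argument swap expresses.
        ba := matmul(loraB, loraA)
        for i := range w {
            for j := range w[i] {
                w[i][j] += scaling * ba[i][j]
            }
        }
        fmt.Println(w) // [[2 2] [-1 -1]]
    }
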