feed the linter again + llama.cpp patches

Patrick Devine 2024-07-08 17:03:13 -07:00
parent a3058002c4
commit 8ba3f38f82
2 changed files with 57 additions and 36 deletions

View File

@@ -16,7 +16,6 @@ import (
 	"testing"
 	"github.com/ollama/ollama/llm"
-	"github.com/stretchr/testify/assert"
 	"golang.org/x/exp/maps"
 )
@@ -134,44 +133,67 @@ func TestConvertNPZ(t *testing.T) {
 	for _, fn := range cases {
 		ts, err := parseNPZ(filepath.Join("testdata", fn))
-		assert.NoError(t, err)
-		assert.Len(t, ts, 16*2*2) // 16 layers, 2 tensors, 2 loras
+		if err != nil {
+			t.Fatal(err)
+		}
+		if len(ts) != 16*2*2 {
+			t.Errorf("got: %d want: %d total layers", len(ts), 16*2*2)
+		}
 		a := adapter{}
 		for _, m := range ts {
 			at := m.(adapterTensor)
-			assert.Equal(t, filepath.Join("testdata", fn), at.path)
-			assert.Equal(t, "F32", at.dtype) // only float32s supported
-			assert.Len(t, at.tensorBase.shape, 2)
+			if at.path != filepath.Join("testdata", fn) {
+				t.Errorf("got: %s want: %s", at.path, filepath.Join("testdata", fn))
+			}
+			if at.dtype != "F32" {
+				t.Errorf("got: %s but only F32s are currently supported", at.dtype)
+			}
+			if len(at.tensorBase.shape) != 2 {
+				t.Errorf("got: %d want: %d tensor shape", at.tensorBase.shape, 2)
+			}
 		}
 		var ws io.WriteSeeker = &memWriter{}
 		err = llm.WriteGGLA(ws, a.KV(nil), a.Tensors(ts))
-		assert.NoError(t, err)
+		if err != nil {
+			t.Fatal(err)
+		}
 		mw := ws.(*memWriter)
 		slog.Info(fmt.Sprintf("buffer len = %d", len(mw.buf)))
-		assert.NotEmpty(t, mw.buf)
+		if len(mw.buf) == 0 {
+			t.Errorf("ggla layer not written correctly")
+		}
 		rs := bytes.NewReader(mw.buf)
 		ggml, _, err := llm.DecodeGGML(rs, len(mw.buf))
-		assert.NoError(t, err, "decode ggml failed")
-		assert.NotNil(t, ggml, "ggml was empty")
+		if err != nil {
+			t.Fatal(err)
+		}
+		if ggml == nil {
+			t.Errorf("ggla didn't convert to ggml correctly")
+		}
 		kv := ggml.KV()
-		assert.NotNil(t, kv, "lora KVs not found")
+		if kv == nil {
+			t.Errorf("no lora KVs were set")
+		}
 		r, ok := kv["r"]
-		assert.Equal(t, true, ok, "lora rank not set")
-		assert.Equal(t, uint32(8), r, "lora rank was incorrect")
+		if !ok || r != uint32(8) {
+			t.Errorf("lora rank was not set correctly")
+		}
 		alpha, ok := kv["alpha"]
-		assert.Equal(t, true, ok, "lora alpha not set")
-		assert.Equal(t, uint32(160), alpha, "lora alpha value was incorrect")
+		if !ok || alpha != uint32(160) {
+			t.Errorf("lora alpha was not set correctly")
+		}
 		gts := ggml.Tensors()
-		assert.NotNil(t, gts, "no tensors found")
-		assert.Equal(t, len(ts), len(gts.Items))
+		if len(ts) != len(gts.Items) {
+			t.Errorf("got: %d want: %d tensors in ggla", len(gts.Items), len(ts))
+		}
 	}
 }
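
Note: llm.WriteGGLA is driven through an io.WriteSeeker backed by memory, and the test then inspects mw.buf; Seek is presumably needed so the writer can go back and patch offsets after emitting tensor data. The memWriter type itself is not part of this diff. A minimal sketch of what such an in-memory io.WriteSeeker can look like (only the buf field is visible in this commit; the pos field and the growth logic are assumptions):

// Sketch only: an in-memory io.WriteSeeker in the spirit of the memWriter
// used by the test above.
package sketch

import (
	"errors"
	"io"
)

type memWriter struct {
	buf []byte // bytes written so far; the test reads this back via mw.buf
	pos int    // current write offset (assumed field)
}

func (m *memWriter) Write(p []byte) (int, error) {
	// grow the buffer if the write runs past the current end
	if end := m.pos + len(p); end > len(m.buf) {
		m.buf = append(m.buf, make([]byte, end-len(m.buf))...)
	}
	copy(m.buf[m.pos:], p)
	m.pos += len(p)
	return len(p), nil
}

func (m *memWriter) Seek(offset int64, whence int) (int64, error) {
	var next int64
	switch whence {
	case io.SeekStart:
		next = offset
	case io.SeekCurrent:
		next = int64(m.pos) + offset
	case io.SeekEnd:
		next = int64(len(m.buf)) + offset
	default:
		return 0, errors.New("memWriter: invalid whence")
	}
	if next < 0 {
		return 0, errors.New("memWriter: negative offset")
	}
	m.pos = int(next)
	return next, nil
}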

View File

@@ -1,8 +1,17 @@
-diff --git a/llama.cpp b/llama.cpp
-index 61948751..d54fc537 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -15940,6 +15940,20 @@ static int llama_apply_lora_from_file_internal(
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 4f6cd687..b8c6896b 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -189,3 +189,4 @@ if (LLAMA_BUILD_EXAMPLES)
+ add_subdirectory(examples)
+ add_subdirectory(pocs)
+ endif()
++add_subdirectory(../ext_server ext_server) # ollama
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 2b9ace28..b0151571 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -18609,6 +18609,20 @@ static int llama_apply_lora_from_file_internal(
  return 1;
  }
@@ -23,7 +32,7 @@ index 61948751..d54fc537 100644
  // load tensor data
  auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
  read_buf.resize(ggml_nbytes(tensor));
-@@ -15950,6 +15964,9 @@ static int llama_apply_lora_from_file_internal(
+@@ -18619,6 +18633,9 @@ static int llama_apply_lora_from_file_internal(
  load_tensor(metaA, loraA);
  load_tensor(metaB, loraB);
@@ -33,7 +42,7 @@ index 61948751..d54fc537 100644
  // load base model tensor data
  if (ml) {
  ml->load_data_for(base_t);
-@@ -15964,8 +15981,10 @@ static int llama_apply_lora_from_file_internal(
+@@ -18633,8 +18650,10 @@ static int llama_apply_lora_from_file_internal(
  }
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
@@ -46,13 +55,11 @@ index 61948751..d54fc537 100644
  ggml_free(lora_ctx);
  ggml_backend_buffer_free(lora_buf);
  ggml_backend_free(backend_cpu);
-@@ -15973,15 +15992,19 @@ static int llama_apply_lora_from_file_internal(
- }
+@@ -18643,14 +18662,18 @@ static int llama_apply_lora_from_file_internal(
  auto build_lora_graph = [&]() {
-- // w = w + BA*s
+ // w = w + BA*s
 - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
 + // Wlora = Worig + scaling * BA
 + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
  ggml_set_name(BA, "BA");
@@ -69,11 +76,3 @@ index 61948751..d54fc537 100644
  ggml_tensor * r;
  r = ggml_add_inplace(lora_ctx, base_t, BA);
  ggml_set_name(r, "r_add");
-@@ -16009,6 +16032,7 @@ static int llama_apply_lora_from_file_internal(
- }
- ggml_backend_graph_compute(backend_cpu, gf);
-+ show_tensor("Result " + base_name, r);
- ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
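
Note: the build_lora_graph hunk above rewrites the comment to "Wlora = Worig + scaling * BA" and swaps the ggml_mul_mat operand order from (loraA, loraB) to (loraB, loraA). In plain terms the patched graph is meant to compute the usual LoRA update, where scaling is conventionally alpha / r (20 for the r=8, alpha=160 adapter checked by the Go test above). A small reference sketch of that arithmetic, not ollama or llama.cpp code:

// Reference sketch of Wlora = Worig + scaling * (B x A), with scaling = alpha / r.
package main

import "fmt"

func applyLoRA(w, a, b [][]float32, alpha, r float32) {
	scaling := alpha / r
	for i := range w {
		for j := range w[i] {
			var ba float32
			for k := range b[i] {
				ba += b[i][k] * a[k][j] // (B x A)[i][j]
			}
			w[i][j] += scaling * ba
		}
	}
}

func main() {
	w := [][]float32{{1, 0}, {0, 1}} // base weights (2x2)
	b := [][]float32{{1}, {0}}       // B is d_out x r, with r = 1 for brevity
	a := [][]float32{{0.5, 0.25}}    // A is r x d_in
	applyLoRA(w, a, b, 2, 1)         // scaling = alpha/r = 2
	fmt.Println(w)                   // [[2 0.5] [0 1]]
}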