From 057cc54b6666c9f9c7320fe811dba2a5f3a6ff79 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Fri, 14 Feb 2025 15:28:24 -0800 Subject: [PATCH] benchmark: compare backend graph computation times Track execution time of individual tensor operations (views, copies, reshapes etc) during LLM forward passes using CGo bindings to the native graph runtime. This helps identify performance bottlenecks in the computation graph and optimize memory operations that can significantly impact inference latency. --- benchmark/ggml_backend_benchmark_test.go | 86 ++++++++++++++++++ envconfig/config.go | 2 + kvcache/causal_test.go | 4 + ml/backend.go | 27 +++++- ml/backend/ggml/ggml.go | 107 +++++++++++++++++++++++ 5 files changed, 225 insertions(+), 1 deletion(-) create mode 100644 benchmark/ggml_backend_benchmark_test.go diff --git a/benchmark/ggml_backend_benchmark_test.go b/benchmark/ggml_backend_benchmark_test.go new file mode 100644 index 000000000..be952c5c9 --- /dev/null +++ b/benchmark/ggml_backend_benchmark_test.go @@ -0,0 +1,86 @@ +package backend + +import ( + "flag" + "fmt" + "io" + "log" + "os" + "testing" + + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/model" + "github.com/ollama/ollama/server" + + _ "github.com/ollama/ollama/model/models/llama" +) + +var modelName = flag.String("m", "", "Name of the model to benchmark") + +func suppressOutput() (cleanup func()) { + oldStdout, oldStderr := os.Stdout, os.Stderr + os.Stdout, os.Stderr = nil, nil + log.SetOutput(io.Discard) + + return func() { + os.Stdout, os.Stderr = oldStdout, oldStderr + log.SetOutput(os.Stderr) + } +} + +func setupModel(b *testing.B) model.Model { + if *modelName == "" { + b.Fatal("Error: -m flag is required for benchmark tests") + } + + sm, err := server.GetModel(*modelName) + if err != nil { + b.Fatal(err) + } + + m, err := model.New(sm.ModelPath) + if err != nil { + b.Fatal(err) + } + + m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048) + return m +} + +func 
BenchmarkGGMLOperations(b *testing.B) { + // loading the GGML back-end logs to standard out and makes the bench output messy + cleanup := suppressOutput() + defer cleanup() + + b.Setenv("OLLAMA_BENCHMARK", "1") + b.Setenv("OLLAMA_BACKEND", "ggml") + + m := setupModel(b) + + // Sample input data + inputIDs := []int32{1, 2, 3, 4, 5} + options := model.Options{ + Inputs: inputIDs, + Positions: []int32{1, 2, 3, 4, 5}, + Sequences: []int{1, 1, 1, 1, 1}, + Outputs: []int32{int32(len(inputIDs) - 1)}, + } + + b.ResetTimer() + + for range b.N { + ctx := m.Backend().NewContext() + + modelOutput, err := model.Forward(ctx, m, options) + if err != nil { + b.Fatal(fmt.Errorf("forward pass failed: %v", err)) + } + + ctx.Compute(modelOutput) + + for _, op := range ctx.Timing() { + b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type)) + } + // close per iteration; defer here would leak all b.N contexts until the benchmark returns + ctx.Close() + } +} diff --git a/envconfig/config.go b/envconfig/config.go index fbd881ba7..74cb5bc24 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -167,6 +167,8 @@ var ( MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE") // Enable the new Ollama engine NewEngine = Bool("OLLAMA_NEW_ENGINE") + // Ollama is running in a benchmark context, additional timing data will be collected.
+ Benchmark = Bool("OLLAMA_BENCHMARK") ) func String(s string) func() string { diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 874e47433..9db3cac51 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int { return 10 } +func (c *testContext) Timing() []ml.OpTiming { + return []ml.OpTiming{} +} + func (c *testContext) Close() {} type testTensor struct { diff --git a/ml/backend.go b/ml/backend.go index aebf86f76..f534813dd 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -2,6 +2,7 @@ package ml import ( "bytes" + "cmp" "encoding/binary" "fmt" "os" @@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) { } func NewBackend(f *os.File) (Backend, error) { - if backend, ok := backends["ggml"]; ok { + if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok { return backend(f) } @@ -53,6 +54,30 @@ type Context interface { Compute(...Tensor) MaxTensors() int Close() + + Timing() []OpTiming +} + +// OpType is the type of operation performed during a forward pass. +type OpType string + +const ( + View OpType = "View" + Copy OpType = "Copy" + Reshape OpType = "Reshape" + Permute OpType = "Permute" + Contiguous OpType = "Contiguous" + Input OpType = "Input" + ComputeOp OpType = "Compute" + Transpose OpType = "Transpose" +) + +// OpTiming stores the timing information for a single operation. 
+type OpTiming struct { + Type OpType + Operation string + Duration float64 + Order int } type Tensor interface { diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 5ba36361e..70c46d06a 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -4,6 +4,8 @@ package ggml #cgo CPPFLAGS: -I${SRCDIR}/ggml/include #include <stdlib.h> #include <stdint.h> +#include <string.h> +#include <time.h> #include "ggml.h" #include "ggml-cpu.h" #include "ggml-backend.h" @@ -21,6 +23,54 @@ COMPILER inline get_compiler() { #endif } +// Define a fixed-size struct to store timing data +#define MAX_TENSOR_NAME 256 +#define MAX_TIMINGS 1000 + +typedef struct { + char tensor_name[MAX_TENSOR_NAME]; + double duration_ms; +} timing_entry; + +typedef struct { + timing_entry entries[MAX_TIMINGS]; + int count; +} timing_data; + +// Global timing data structure +timing_data g_timings = {0}; + +double get_time_ms() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0; +} + +bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) { + static double start_time; + static char current_tensor[MAX_TENSOR_NAME]; + + if (ask) { + start_time = get_time_ms(); + strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1); + current_tensor[MAX_TENSOR_NAME - 1] = '\0'; + } else { + double end_time = get_time_ms(); + double duration = end_time - start_time; + + if (g_timings.count < MAX_TIMINGS) { + strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1); + g_timings.entries[g_timings.count].duration_ms = duration; + g_timings.count++; + } + } + return true; +} + +void clear_timings() { + g_timings.count = 0; +} + */ import "C" @@ -29,9 +79,11 @@ import ( "io" "log/slog" "os" + "strings" "sync" "unsafe" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" fs "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/ml" @@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
C.ggml_build_forward_expand(c.graph, t.(*Tensor).t) } +// Timing retrieves the collected timing data +func (c *Context) Timing() []ml.OpTiming { + sequence := make([]ml.OpTiming, C.g_timings.count) + + for i := range int(C.g_timings.count) { + entry := C.g_timings.entries[i] + tensorName := C.GoString(&entry.tensor_name[0]) + + // Determine operation type and description based on tensor name + var opType ml.OpType + var opDesc string + + switch { + case strings.Contains(tensorName, "(view)"): + opType, opDesc = ml.View, "Memory view" + case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"): + opType, opDesc = ml.Copy, "Memory copy" + case strings.Contains(tensorName, "(reshaped)"): + opType, opDesc = ml.Reshape, "Reshape" + case strings.Contains(tensorName, "(permuted)"): + opType, opDesc = ml.Permute, "Permute dimensions" + case strings.Contains(tensorName, "(cont)"): + opType, opDesc = ml.Contiguous, "Make contiguous" + case strings.Contains(tensorName, "(transposed)"): + opType, opDesc = ml.Transpose, "Transpose" + case strings.HasPrefix(tensorName, "leaf_"): + opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName) + case strings.HasPrefix(tensorName, "node_"): + opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName) + default: + opType, opDesc = "Unknown", tensorName + } + + sequence[i] = ml.OpTiming{ + Type: opType, + Operation: opDesc, + Duration: float64(entry.duration_ms), + Order: i, + } + } + + return sequence +} + func (c *Context) Compute(tensors ...ml.Tensor) { + if envconfig.Benchmark() { + // Clear previous timings before new computation + C.clear_timings() + + C.ggml_backend_sched_set_eval_callback( + c.sched, + C.ggml_backend_eval_callback(C.debug_callback), + nil, + ) + } + C.ggml_backend_sched_graph_compute_async(c.sched, c.graph) needSync := true