benchmark: compare backend graph computation times

Track execution time of individual tensor operations (views, copies, reshapes etc)
during LLM forward passes using CGo bindings to the native graph runtime. This
helps identify performance bottlenecks in the computation graph and optimize memory
operations that can significantly impact inference latency.
Bruce MacDonald 2025-02-14 15:28:24 -08:00
parent 1e438b237c
commit 057cc54b66
5 changed files with 225 additions and 1 deletion
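
With the benchmark in place, the timings can be collected with an ordinary go test run. A typical invocation might look like the following (the package path is hypothetical; -args forwards the custom -m flag to the test binary, and -run '^$' skips regular tests):

    go test -bench BenchmarkGGMLOperations -run '^$' ./<benchmark-package> -args -m <model-name>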


@@ -0,0 +1,86 @@
package backend

import (
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"testing"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/server"

	_ "github.com/ollama/ollama/model/models/llama"
)

var modelName = flag.String("m", "", "Name of the model to benchmark")

// suppressOutput silences stdout, stderr, and the default logger; writes to a
// nil *os.File fail quietly. The returned cleanup func restores them.
func suppressOutput() (cleanup func()) {
	oldStdout, oldStderr := os.Stdout, os.Stderr
	os.Stdout, os.Stderr = nil, nil
	log.SetOutput(io.Discard)
	return func() {
		os.Stdout, os.Stderr = oldStdout, oldStderr
		log.SetOutput(os.Stderr)
	}
}

func setupModel(b *testing.B) model.Model {
	if *modelName == "" {
		b.Fatal("Error: -m flag is required for benchmark tests")
	}
	sm, err := server.GetModel(*modelName)
	if err != nil {
		b.Fatal(err)
	}
	m, err := model.New(sm.ModelPath)
	if err != nil {
		b.Fatal(err)
	}
	m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
	return m
}

func BenchmarkGGMLOperations(b *testing.B) {
	// loading the GGML back-end logs to standard out and makes the bench output messy
	cleanup := suppressOutput()
	defer cleanup()
	b.Setenv("OLLAMA_BENCHMARK", "1")
	b.Setenv("OLLAMA_BACKEND", "ggml")
	m := setupModel(b)

	// Sample input data
	inputIDs := []int32{1, 2, 3, 4, 5}
	options := model.Options{
		Inputs:    inputIDs,
		Positions: []int32{1, 2, 3, 4, 5},
		Sequences: []int{1, 1, 1, 1, 1},
		Outputs:   []int32{int32(len(inputIDs) - 1)},
	}

	b.ResetTimer()
	for range b.N {
		ctx := m.Backend().NewContext()
		modelOutput, err := model.Forward(ctx, m, options)
		if err != nil {
			ctx.Close()
			b.Fatalf("forward pass failed: %v", err)
		}
		ctx.Compute(modelOutput)
		for _, op := range ctx.Timing() {
			b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
		}
		// close per iteration rather than defer-in-loop, which would pile up
		// b.N deferred calls until the benchmark function returns
		ctx.Close()
	}
}
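
One quirk worth knowing: testing.B.ReportMetric keeps a single value per unit string, so reporting inside the loop leaves only the last iteration's durations in the final output. A minimal sketch of a variant that averages across iterations instead (reportAveraged and its collect parameter are hypothetical, not part of this commit):

// reportAveraged runs collect once per benchmark iteration and reports the
// mean duration for each "<OpType>_ms" unit rather than the last sample.
func reportAveraged(b *testing.B, collect func() []ml.OpTiming) {
	totals := make(map[string]float64)
	b.ResetTimer()
	for range b.N {
		for _, op := range collect() {
			totals[fmt.Sprintf("%s_ms", op.Type)] += op.Duration
		}
	}
	for unit, total := range totals {
		b.ReportMetric(total/float64(b.N), unit)
	}
}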


@@ -167,6 +167,8 @@ var (
	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
	// Enable the new Ollama engine
	NewEngine = Bool("OLLAMA_NEW_ENGINE")
	// Ollama is running in a benchmark context; additional timing data will be collected
	Benchmark = Bool("OLLAMA_BENCHMARK")
)
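
// Editorial note: like the other entries in this var block, Bool returns a
// getter that reads the environment at call time (hence envconfig.Benchmark()
// later in this diff), so timing instrumentation can be gated cheaply, e.g.:
//
//	if envconfig.Benchmark() {
//		collectTimings() // hypothetical bookkeeping, enabled per run
//	}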
func String(s string) func() string {


@@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
	return 10
}

func (c *testContext) Timing() []ml.OpTiming {
	return []ml.OpTiming{}
}

func (c *testContext) Close() {}

type testTensor struct {


@@ -2,6 +2,7 @@ package ml

import (
	"bytes"
	"cmp"
	"encoding/binary"
	"fmt"
	"os"
@@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
}

func NewBackend(f *os.File) (Backend, error) {
-	if backend, ok := backends["ggml"]; ok {
+	if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
		return backend(f)
	}
@@ -53,6 +54,30 @@ type Context interface {
	Compute(...Tensor)
	MaxTensors() int
	Close()
	Timing() []OpTiming
}

// OpType is the type of operation performed during a forward pass.
type OpType string

const (
	View       OpType = "View"
	Copy       OpType = "Copy"
	Reshape    OpType = "Reshape"
	Permute    OpType = "Permute"
	Contiguous OpType = "Contiguous"
	Input      OpType = "Input"
	ComputeOp  OpType = "Compute"
	Transpose  OpType = "Transpose"
)

// OpTiming stores the timing information for a single operation.
type OpTiming struct {
	Type      OpType
	Operation string
	Duration  float64 // milliseconds
	Order     int     // position within the executed graph
}
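
// Editorial note: since each OpTiming carries both a coarse Type and a
// Duration, callers can reduce a run's timings to per-class totals.
// A small sketch (not part of this commit):
//
//	func totalsByType(timings []OpTiming) map[OpType]float64 {
//		totals := make(map[OpType]float64, len(timings))
//		for _, t := range timings {
//			totals[t.Type] += t.Duration
//		}
//		return totals
//	}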
type Tensor interface {


@@ -4,6 +4,8 @@ package ggml
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
@@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
#endif
}

// Define a fixed-size struct to store timing data
#define MAX_TENSOR_NAME 256
#define MAX_TIMINGS 1000

typedef struct {
	char tensor_name[MAX_TENSOR_NAME];
	double duration_ms;
} timing_entry;

typedef struct {
	timing_entry entries[MAX_TIMINGS];
	int count;
} timing_data;

// Global timing data structure
timing_data g_timings = {0};

// Current time in milliseconds on a monotonic clock
double get_time_ms() {
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}
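
// Editorial note: ggml_backend_sched_eval_callback is invoked twice per graph
// node: first with ask == true, when the scheduler asks whether this node
// should be observed (returning true opts in), and again with ask == false
// once the node has been computed. The gap between the two calls therefore
// approximates the node's execution time, which debug_callback records below.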
bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
	static double start_time;
	static char current_tensor[MAX_TENSOR_NAME];
	if (ask) {
		// about to evaluate this node: remember its name and a start timestamp
		start_time = get_time_ms();
		strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
		current_tensor[MAX_TENSOR_NAME - 1] = '\0';
	} else {
		// node finished: record the elapsed time under the saved name
		double end_time = get_time_ms();
		double duration = end_time - start_time;
		if (g_timings.count < MAX_TIMINGS) {
			strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
			g_timings.entries[g_timings.count].duration_ms = duration;
			g_timings.count++;
		}
	}
	return true;
}

void clear_timings() {
	g_timings.count = 0;
}
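
// Editorial note: g_timings and the statics in debug_callback are
// process-global, so this instrumentation assumes one graph computation at a
// time; concurrent Compute calls would interleave their measurements.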
*/
import "C"
@@ -29,9 +79,11 @@ import (
	"io"
	"log/slog"
	"os"
	"strings"
	"sync"
	"unsafe"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
@@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
}

// Timing returns the per-operation timings collected during the most recent
// Compute call, in execution order.
func (c *Context) Timing() []ml.OpTiming {
	sequence := make([]ml.OpTiming, C.g_timings.count)
	for i := range int(C.g_timings.count) {
		entry := C.g_timings.entries[i]
		tensorName := C.GoString(&entry.tensor_name[0])

		// Classify the operation from the name ggml gives derived tensors,
		// e.g. "... (view)" or "... (reshaped)"
		var opType ml.OpType
		var opDesc string
		switch {
		case strings.Contains(tensorName, "(view)"):
			opType, opDesc = ml.View, "Memory view"
		case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
			opType, opDesc = ml.Copy, "Memory copy"
		case strings.Contains(tensorName, "(reshaped)"):
			opType, opDesc = ml.Reshape, "Reshape"
		case strings.Contains(tensorName, "(permuted)"):
			opType, opDesc = ml.Permute, "Permute dimensions"
		case strings.Contains(tensorName, "(cont)"):
			opType, opDesc = ml.Contiguous, "Make contiguous"
		case strings.Contains(tensorName, "(transposed)"):
			opType, opDesc = ml.Transpose, "Transpose"
		case strings.HasPrefix(tensorName, "leaf_"):
			opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
		case strings.HasPrefix(tensorName, "node_"):
			opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
		default:
			opType, opDesc = "Unknown", tensorName
		}

		sequence[i] = ml.OpTiming{
			Type:      opType,
			Operation: opDesc,
			Duration:  float64(entry.duration_ms),
			Order:     i,
		}
	}
	return sequence
}

func (c *Context) Compute(tensors ...ml.Tensor) {
	if envconfig.Benchmark() {
		// Clear previous timings and install the eval callback so the
		// scheduler reports per-node execution times for this run
		C.clear_timings()
		C.ggml_backend_sched_set_eval_callback(
			c.sched,
			C.ggml_backend_eval_callback(C.debug_callback),
			nil,
		)
	}
	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
	needSync := true