benchmark: compare backend graph computation times

Track execution time of individual tensor operations (views, copies, reshapes etc) during LLM forward passes using CGo bindings to the native graph runtime. This helps identify performance bottlenecks in the computation graph and optimize memory operations that can significantly impact inference latency.
2025-02-19 15:22:53 -08:00
5 changed files with 225 additions and 1 deletions
--- a/benchmark/ggml_backend_benchmark_test.go
+++ b/benchmark/ggml_backend_benchmark_test.go
@ -0,0 +1,86 @@
 package backend
 import (
 	"flag"
 	"fmt"
 	"io"
 	"log"
 	"os"
 	"testing"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/server"
 	_ "github.com/ollama/ollama/model/models/llama"
 )
 var modelName = flag.String("m", "", "Name of the model to benchmark")
 func suppressOutput() (cleanup func()) {
 	oldStdout, oldStderr := os.Stdout, os.Stderr
 	os.Stdout, os.Stderr = nil, nil
 	log.SetOutput(io.Discard)
 	return func() {
 		os.Stdout, os.Stderr = oldStdout, oldStderr
 		log.SetOutput(os.Stderr)
 	}
 }
 func setupModel(b *testing.B) model.Model {
 	if *modelName == "" {
 		b.Fatal("Error: -m flag is required for benchmark tests")
 	}
 	sm, err := server.GetModel(*modelName)
 	if err != nil {
 		b.Fatal(err)
 	}
 	m, err := model.New(sm.ModelPath)
 	if err != nil {
 		b.Fatal(err)
 	}
 	m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
 	return m
 }
 func BenchmarkGGMLOperations(b *testing.B) {
 	// loading the GGML back-end logs to standard out and makes the bench output messy
 	cleanup := suppressOutput()
 	defer cleanup()
 	b.Setenv("OLLAMA_BENCHMARK", "1")
 	b.Setenv("OLLAMA_BACKEND", "ggml")
 	m := setupModel(b)
 	// Sample input data
 	inputIDs := []int32{1, 2, 3, 4, 5}
 	options := model.Options{
 		Inputs:    inputIDs,
 		Positions: []int32{1, 2, 3, 4, 5},
 		Sequences: []int{1, 1, 1, 1, 1},
 		Outputs:   []int32{int32(len(inputIDs) - 1)},
 	}
 	b.ResetTimer()
 	for range b.N {
 		ctx := m.Backend().NewContext()
 		defer ctx.Close()
 		modelOutput, err := model.Forward(ctx, m, options)
 		if err != nil {
 			b.Fatal(fmt.Errorf("forward pass failed: %v", err))
 		}
 		ctx.Compute(modelOutput)
 		for _, op := range ctx.Timing() {
 			b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
 		}
 	}
 }
--- a/envconfig/config.go
+++ b/envconfig/config.go
@ -167,6 +167,8 @@ var (
 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// Ollama is running in a benchmark context, additional timing data will be collected.
 	Benchmark = Bool("OLLAMA_BENCHMARK")
 )
 func String(s string) func() string {
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
 	return 10
 }
 func (c *testContext) Timing() []ml.OpTiming {
 	return []ml.OpTiming{}
 }
 func (c *testContext) Close() {}
 type testTensor struct {
--- a/ml/backend.go
+++ b/ml/backend.go
@ -2,6 +2,7 @@ package ml
 import (
 	"bytes"
 	"cmp"
 	"encoding/binary"
 	"fmt"
 	"os"
@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
 }
 func NewBackend(f *os.File) (Backend, error) {
-	if backend, ok := backends["ggml"]; ok {
+	if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
 		return backend(f)
 	}
@ -53,6 +54,30 @@ type Context interface {
 	Compute(...Tensor)
 	MaxTensors() int
 	Close()
 	Timing() []OpTiming
 }
 // OpType is the type of operation performed during a forward pass.
 type OpType string
 const (
 	View       OpType = "View"
 	Copy       OpType = "Copy"
 	Reshape    OpType = "Reshape"
 	Permute    OpType = "Permute"
 	Contiguous OpType = "Contiguous"
 	Input      OpType = "Input"
 	ComputeOp  OpType = "Compute"
 	Transpose  OpType = "Transpose"
 )
 // OpTiming stores the timing information for a single operation.
 type OpTiming struct {
 	Type      OpType
 	Operation string
 	Duration  float64
 	Order     int
 }
 type Tensor interface {
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@ -4,6 +4,8 @@ package ggml
 #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
 #include <stdlib.h>
 #include <stdint.h>
 #include <time.h>
 #include <string.h>
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
 #endif
 }
 // Define a fixed-size struct to store timing data
 #define MAX_TENSOR_NAME 256
 #define MAX_TIMINGS 1000
 typedef struct {
    char tensor_name[MAX_TENSOR_NAME];
    double duration_ms;
 } timing_entry;
 typedef struct {
    timing_entry entries[MAX_TIMINGS];
    int count;
 } timing_data;
 // Global timing data structure
 timing_data g_timings = {0};
 double get_time_ms() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
 }
 bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
    static double start_time;
    static char current_tensor[MAX_TENSOR_NAME];
    if (ask) {
        start_time = get_time_ms();
        strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
        current_tensor[MAX_TENSOR_NAME - 1] = '\0';
    } else {
        double end_time = get_time_ms();
        double duration = end_time - start_time;
        if (g_timings.count < MAX_TIMINGS) {
            strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
            g_timings.entries[g_timings.count].duration_ms = duration;
            g_timings.count++;
        }
    }
    return true;
 }
 void clear_timings() {
    g_timings.count = 0;
 }
 */
 import "C"
@ -29,9 +79,11 @@ import (
 	"io"
 	"log/slog"
 	"os"
 	"strings"
 	"sync"
 	"unsafe"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	fs "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
 	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
 }
 // Timing retrieves the collected timing data
 func (c *Context) Timing() []ml.OpTiming {
 	sequence := make([]ml.OpTiming, C.g_timings.count)
 	for i := range int(C.g_timings.count) {
 		entry := C.g_timings.entries[i]
 		tensorName := C.GoString(&entry.tensor_name[0])
 		// Determine operation type and description based on tensor name
 		var opType ml.OpType
 		var opDesc string
 		switch {
 		case strings.Contains(tensorName, "(view)"):
 			opType, opDesc = ml.View, "Memory view"
 		case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
 			opType, opDesc = ml.Copy, "Memory copy"
 		case strings.Contains(tensorName, "(reshaped)"):
 			opType, opDesc = ml.Reshape, "Reshape"
 		case strings.Contains(tensorName, "(permuted)"):
 			opType, opDesc = ml.Permute, "Permute dimensions"
 		case strings.Contains(tensorName, "(cont)"):
 			opType, opDesc = ml.Contiguous, "Make contiguous"
 		case strings.Contains(tensorName, "(transposed)"):
 			opType, opDesc = ml.Transpose, "Transpose"
 		case strings.HasPrefix(tensorName, "leaf_"):
 			opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
 		case strings.HasPrefix(tensorName, "node_"):
 			opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
 		default:
 			opType, opDesc = "Unknown", tensorName
 		}
 		sequence[i] = ml.OpTiming{
 			Type:      opType,
 			Operation: opDesc,
 			Duration:  float64(entry.duration_ms),
 			Order:     i,
 		}
 	}
 	return sequence
 }
 func (c *Context) Compute(tensors ...ml.Tensor) {
 	if envconfig.Benchmark() {
 		// Clear previous timings before new computation
 		C.clear_timings()
 		C.ggml_backend_sched_set_eval_callback(
 			c.sched,
 			C.ggml_backend_eval_callback(C.debug_callback),
 			nil,
 		)
 	}
 	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
 	needSync := true