diff --git a/benchmark/ggml_backend_benchmark_test.go b/benchmark/ggml_backend_benchmark_test.go
new file mode 100644
index 000000000..be952c5c9
--- /dev/null
+++ b/benchmark/ggml_backend_benchmark_test.go
@@ -0,0 +1,93 @@
+package backend
+
+import (
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/server"
+
+	_ "github.com/ollama/ollama/model/models/llama"
+)
+
+var modelName = flag.String("m", "", "Name of the model to benchmark")
+
+// suppressOutput silences stdout, stderr, and the default logger; the
+// returned cleanup function restores them.
+func suppressOutput() (cleanup func()) {
+	oldStdout, oldStderr := os.Stdout, os.Stderr
+	os.Stdout, os.Stderr = nil, nil
+	log.SetOutput(io.Discard)
+
+	return func() {
+		os.Stdout, os.Stderr = oldStdout, oldStderr
+		log.SetOutput(os.Stderr)
+	}
+}
+
+// setupModel loads the model named by the -m flag and initializes its cache.
+func setupModel(b *testing.B) model.Model {
+	if *modelName == "" {
+		b.Fatal("-m flag is required for benchmark tests")
+	}
+
+	sm, err := server.GetModel(*modelName)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	m, err := model.New(sm.ModelPath)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
+	return m
+}
+
+func BenchmarkGGMLOperations(b *testing.B) {
+	// Loading the GGML backend logs to stdout and makes the benchmark
+	// output messy, so suppress it for the duration of the run.
+	cleanup := suppressOutput()
+	defer cleanup()
+
+	b.Setenv("OLLAMA_BENCHMARK", "1")
+	b.Setenv("OLLAMA_BACKEND", "ggml")
+
+	m := setupModel(b)
+
+	// Sample input data
+	inputIDs := []int32{1, 2, 3, 4, 5}
+	options := model.Options{
+		Inputs:    inputIDs,
+		Positions: []int32{1, 2, 3, 4, 5},
+		Sequences: []int{1, 1, 1, 1, 1},
+		Outputs:   []int32{int32(len(inputIDs) - 1)},
+	}
+
+	b.ResetTimer()
+
+	for range b.N {
+		ctx := m.Backend().NewContext()
+
+		modelOutput, err := model.Forward(ctx, m, options)
+		if err != nil {
+			b.Fatalf("forward pass failed: %v", err)
+		}
+
+		ctx.Compute(modelOutput)
+
+		for _, op := range ctx.Timing() {
+			b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
+		}
+
+		// Close each iteration's context here rather than with defer, which
+		// would keep every context alive until the benchmark returns.
+		ctx.Close()
+	}
+}
diff --git a/envconfig/config.go b/envconfig/config.go
index fbd881ba7..74cb5bc24 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -167,6 +167,8 @@ var (
 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
+	// Collect additional per-operation timing data when running benchmarks
+	Benchmark = Bool("OLLAMA_BENCHMARK")
 )
 
 func String(s string) func() string {
diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go
index 874e47433..9db3cac51 100644
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
 	return 10
 }
 
+func (c *testContext) Timing() []ml.OpTiming {
+	return []ml.OpTiming{}
+}
+
 func (c *testContext) Close() {}
 
 type testTensor struct {
diff --git a/ml/backend.go b/ml/backend.go
index aebf86f76..f534813dd 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -2,6 +2,7 @@ package ml
 
 import (
 	"bytes"
+	"cmp"
 	"encoding/binary"
 	"fmt"
 	"os"
@@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
 }
 
 func NewBackend(f *os.File) (Backend, error) {
-	if backend, ok := backends["ggml"]; ok {
+	if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
 		return backend(f)
 	}
 
@@ -53,6 +54,37 @@ type Context interface {
 	Compute(...Tensor)
 	MaxTensors() int
 	Close()
+
+	// Timing returns per-operation timings collected during the most recent
+	// call to Compute; it is only populated when OLLAMA_BENCHMARK is set.
+	Timing() []OpTiming
+}
+
+// OpType is the type of operation performed during a forward pass.
+type OpType string
+
+const (
+	View       OpType = "View"
+	Copy       OpType = "Copy"
+	Reshape    OpType = "Reshape"
+	Permute    OpType = "Permute"
+	Contiguous OpType = "Contiguous"
+	Input      OpType = "Input"
+	ComputeOp  OpType = "Compute"
+	Transpose  OpType = "Transpose"
+	Unknown    OpType = "Unknown"
+)
+
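+// A collected entry might look like this (the values are illustrative, not
+// taken from a real run):
+//
+//	OpTiming{Type: ComputeOp, Operation: "Computation node_12", Duration: 0.42, Order: 12}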
+// OpTiming stores the timing information for a single operation.
+type OpTiming struct {
+	Type      OpType
+	Operation string
+	Duration  float64
+	Order     int
 }
 
 type Tensor interface {
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 5ba36361e..70c46d06a 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -4,6 +4,8 @@ package ggml
 #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
 #include <stdlib.h>
 #include <stdint.h>
+#include <string.h>
+#include <time.h>
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
@@ -21,6 +23,58 @@ COMPILER inline get_compiler() {
 #endif
 }
 
+// Define a fixed-size struct to store timing data
+#define MAX_TENSOR_NAME 256
+#define MAX_TIMINGS 1000
+
+typedef struct {
+	char tensor_name[MAX_TENSOR_NAME];
+	double duration_ms;
+} timing_entry;
+
+typedef struct {
+	timing_entry entries[MAX_TIMINGS];
+	int count;
+} timing_data;
+
+// Global timing data structure
+timing_data g_timings = {0};
+
+double get_time_ms() {
+	struct timespec ts;
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
+}
+
+// Scheduler eval callback: ggml calls this with ask=true before an op is
+// evaluated (returning true requests the follow-up call) and with ask=false
+// once the op has completed, so the pair brackets the op's runtime. The
+// static variables assume ops are evaluated sequentially.
+bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
+	static double start_time;
+	static char current_tensor[MAX_TENSOR_NAME];
+
+	if (ask) {
+		start_time = get_time_ms();
+		strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
+		current_tensor[MAX_TENSOR_NAME - 1] = '\0';
+	} else {
+		double end_time = get_time_ms();
+		double duration = end_time - start_time;
+
+		if (g_timings.count < MAX_TIMINGS) {
+			strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
+			g_timings.entries[g_timings.count].duration_ms = duration;
+			g_timings.count++;
+		}
+	}
+	return true;
+}
+
+void clear_timings() {
+	g_timings.count = 0;
+}
+
 */
 import "C"
 
@@ -29,9 +79,11 @@ import (
 	"io"
 	"log/slog"
 	"os"
+	"strings"
 	"sync"
 	"unsafe"
 
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	fs "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
@@ -256,7 +308,71 @@ func (c *Context) Forward(t ml.Tensor) {
 	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
 }
 
+// Timing returns the operation timings collected during the most recent call
+// to Compute.
+func (c *Context) Timing() []ml.OpTiming {
+	sequence := make([]ml.OpTiming, C.g_timings.count)
+
+	for i := range int(C.g_timings.count) {
+		entry := C.g_timings.entries[i]
+		tensorName := C.GoString(&entry.tensor_name[0])
+
+		// Determine operation type and description based on the tensor name.
+		var opType ml.OpType
+		var opDesc string
+
+		switch {
+		case strings.Contains(tensorName, "(view)"):
+			opType, opDesc = ml.View, "Memory view"
+		case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
+			opType, opDesc = ml.Copy, "Memory copy"
+		case strings.Contains(tensorName, "(reshaped)"):
+			opType, opDesc = ml.Reshape, "Reshape"
+		case strings.Contains(tensorName, "(permuted)"):
+			opType, opDesc = ml.Permute, "Permute dimensions"
+		case strings.Contains(tensorName, "(cont)"):
+			opType, opDesc = ml.Contiguous, "Make contiguous"
+		case strings.Contains(tensorName, "(transposed)"):
+			opType, opDesc = ml.Transpose, "Transpose"
+		case strings.HasPrefix(tensorName, "leaf_"):
+			opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
+		case strings.HasPrefix(tensorName, "node_"):
+			opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
+		default:
+			opType, opDesc = ml.Unknown, tensorName
+		}
+
+		sequence[i] = ml.OpTiming{
+			Type:      opType,
+			Operation: opDesc,
+			Duration:  float64(entry.duration_ms),
+			Order:     i,
+		}
+	}
+
+	return sequence
+}
+
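+// Aggregating the per-op entries into totals by op type might look like this
+// (an illustrative sketch of how Timing output can be consumed):
+//
+//	totals := map[ml.OpType]float64{}
+//	for _, op := range ctx.Timing() {
+//		totals[op.Type] += op.Duration
+//	}
+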
 func (c *Context) Compute(tensors ...ml.Tensor) {
+	if envconfig.Benchmark() {
+		// Clear previous timings before the new computation.
+		C.clear_timings()
+
+		C.ggml_backend_sched_set_eval_callback(
+			c.sched,
+			C.ggml_backend_sched_eval_callback(C.debug_callback),
+			nil,
+		)
+	}
+
 	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
 
 	needSync := true
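
Usage sketch (the model name is illustrative; go test forwards the custom -m
flag to the test binary):

    go test ./benchmark -bench=BenchmarkGGMLOperations -m llama3.2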