Compare commits
1 Commits
main
...
brucemacd/
Author | SHA1 | Date | |
---|---|---|---|
![]() |
057cc54b66 |
86
benchmark/ggml_backend_benchmark_test.go
Normal file
86
benchmark/ggml_backend_benchmark_test.go
Normal file
@ -0,0 +1,86 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/server"
|
||||
|
||||
_ "github.com/ollama/ollama/model/models/llama"
|
||||
)
|
||||
|
||||
var modelName = flag.String("m", "", "Name of the model to benchmark")
|
||||
|
||||
func suppressOutput() (cleanup func()) {
|
||||
oldStdout, oldStderr := os.Stdout, os.Stderr
|
||||
os.Stdout, os.Stderr = nil, nil
|
||||
log.SetOutput(io.Discard)
|
||||
|
||||
return func() {
|
||||
os.Stdout, os.Stderr = oldStdout, oldStderr
|
||||
log.SetOutput(os.Stderr)
|
||||
}
|
||||
}
|
||||
|
||||
func setupModel(b *testing.B) model.Model {
|
||||
if *modelName == "" {
|
||||
b.Fatal("Error: -m flag is required for benchmark tests")
|
||||
}
|
||||
|
||||
sm, err := server.GetModel(*modelName)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
m, err := model.New(sm.ModelPath)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
|
||||
return m
|
||||
}
|
||||
|
||||
func BenchmarkGGMLOperations(b *testing.B) {
|
||||
// loading the GGML back-end logs to standard out and makes the bench output messy
|
||||
cleanup := suppressOutput()
|
||||
defer cleanup()
|
||||
|
||||
b.Setenv("OLLAMA_BENCHMARK", "1")
|
||||
b.Setenv("OLLAMA_BACKEND", "ggml")
|
||||
|
||||
m := setupModel(b)
|
||||
|
||||
// Sample input data
|
||||
inputIDs := []int32{1, 2, 3, 4, 5}
|
||||
options := model.Options{
|
||||
Inputs: inputIDs,
|
||||
Positions: []int32{1, 2, 3, 4, 5},
|
||||
Sequences: []int{1, 1, 1, 1, 1},
|
||||
Outputs: []int32{int32(len(inputIDs) - 1)},
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
|
||||
for range b.N {
|
||||
ctx := m.Backend().NewContext()
|
||||
defer ctx.Close()
|
||||
|
||||
modelOutput, err := model.Forward(ctx, m, options)
|
||||
if err != nil {
|
||||
b.Fatal(fmt.Errorf("forward pass failed: %v", err))
|
||||
}
|
||||
|
||||
ctx.Compute(modelOutput)
|
||||
|
||||
for _, op := range ctx.Timing() {
|
||||
b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
|
||||
}
|
||||
}
|
||||
}
|
@ -167,6 +167,8 @@ var (
|
||||
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
|
||||
// Enable the new Ollama engine
|
||||
NewEngine = Bool("OLLAMA_NEW_ENGINE")
|
||||
// Ollama is running in a benchmark context, additional timing data will be collected.
|
||||
Benchmark = Bool("OLLAMA_BENCHMARK")
|
||||
)
|
||||
|
||||
func String(s string) func() string {
|
||||
|
@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
|
||||
return 10
|
||||
}
|
||||
|
||||
func (c *testContext) Timing() []ml.OpTiming {
|
||||
return []ml.OpTiming{}
|
||||
}
|
||||
|
||||
func (c *testContext) Close() {}
|
||||
|
||||
type testTensor struct {
|
||||
|
@ -2,6 +2,7 @@ package ml
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"cmp"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"os"
|
||||
@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
|
||||
}
|
||||
|
||||
func NewBackend(f *os.File) (Backend, error) {
|
||||
if backend, ok := backends["ggml"]; ok {
|
||||
if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
|
||||
return backend(f)
|
||||
}
|
||||
|
||||
@ -53,6 +54,30 @@ type Context interface {
|
||||
Compute(...Tensor)
|
||||
MaxTensors() int
|
||||
Close()
|
||||
|
||||
Timing() []OpTiming
|
||||
}
|
||||
|
||||
// OpType is the type of operation performed during a forward pass.
|
||||
type OpType string
|
||||
|
||||
const (
|
||||
View OpType = "View"
|
||||
Copy OpType = "Copy"
|
||||
Reshape OpType = "Reshape"
|
||||
Permute OpType = "Permute"
|
||||
Contiguous OpType = "Contiguous"
|
||||
Input OpType = "Input"
|
||||
ComputeOp OpType = "Compute"
|
||||
Transpose OpType = "Transpose"
|
||||
)
|
||||
|
||||
// OpTiming stores the timing information for a single operation.
|
||||
type OpTiming struct {
|
||||
Type OpType
|
||||
Operation string
|
||||
Duration float64
|
||||
Order int
|
||||
}
|
||||
|
||||
type Tensor interface {
|
||||
|
@ -4,6 +4,8 @@ package ggml
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
#include <string.h>
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
|
||||
#endif
|
||||
}
|
||||
|
||||
// Define a fixed-size struct to store timing data
|
||||
#define MAX_TENSOR_NAME 256
|
||||
#define MAX_TIMINGS 1000
|
||||
|
||||
typedef struct {
|
||||
char tensor_name[MAX_TENSOR_NAME];
|
||||
double duration_ms;
|
||||
} timing_entry;
|
||||
|
||||
typedef struct {
|
||||
timing_entry entries[MAX_TIMINGS];
|
||||
int count;
|
||||
} timing_data;
|
||||
|
||||
// Global timing data structure
|
||||
timing_data g_timings = {0};
|
||||
|
||||
double get_time_ms() {
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
|
||||
}
|
||||
|
||||
bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
static double start_time;
|
||||
static char current_tensor[MAX_TENSOR_NAME];
|
||||
|
||||
if (ask) {
|
||||
start_time = get_time_ms();
|
||||
strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
|
||||
current_tensor[MAX_TENSOR_NAME - 1] = '\0';
|
||||
} else {
|
||||
double end_time = get_time_ms();
|
||||
double duration = end_time - start_time;
|
||||
|
||||
if (g_timings.count < MAX_TIMINGS) {
|
||||
strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
|
||||
g_timings.entries[g_timings.count].duration_ms = duration;
|
||||
g_timings.count++;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void clear_timings() {
|
||||
g_timings.count = 0;
|
||||
}
|
||||
|
||||
*/
|
||||
import "C"
|
||||
|
||||
@ -29,9 +79,11 @@ import (
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
fs "github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/ml"
|
||||
@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
|
||||
C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
|
||||
}
|
||||
|
||||
// Timing retrieves the collected timing data
|
||||
func (c *Context) Timing() []ml.OpTiming {
|
||||
sequence := make([]ml.OpTiming, C.g_timings.count)
|
||||
|
||||
for i := range int(C.g_timings.count) {
|
||||
entry := C.g_timings.entries[i]
|
||||
tensorName := C.GoString(&entry.tensor_name[0])
|
||||
|
||||
// Determine operation type and description based on tensor name
|
||||
var opType ml.OpType
|
||||
var opDesc string
|
||||
|
||||
switch {
|
||||
case strings.Contains(tensorName, "(view)"):
|
||||
opType, opDesc = ml.View, "Memory view"
|
||||
case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
|
||||
opType, opDesc = ml.Copy, "Memory copy"
|
||||
case strings.Contains(tensorName, "(reshaped)"):
|
||||
opType, opDesc = ml.Reshape, "Reshape"
|
||||
case strings.Contains(tensorName, "(permuted)"):
|
||||
opType, opDesc = ml.Permute, "Permute dimensions"
|
||||
case strings.Contains(tensorName, "(cont)"):
|
||||
opType, opDesc = ml.Contiguous, "Make contiguous"
|
||||
case strings.Contains(tensorName, "(transposed)"):
|
||||
opType, opDesc = ml.Transpose, "Transpose"
|
||||
case strings.HasPrefix(tensorName, "leaf_"):
|
||||
opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
|
||||
case strings.HasPrefix(tensorName, "node_"):
|
||||
opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
|
||||
default:
|
||||
opType, opDesc = "Unknown", tensorName
|
||||
}
|
||||
|
||||
sequence[i] = ml.OpTiming{
|
||||
Type: opType,
|
||||
Operation: opDesc,
|
||||
Duration: float64(entry.duration_ms),
|
||||
Order: i,
|
||||
}
|
||||
}
|
||||
|
||||
return sequence
|
||||
}
|
||||
|
||||
func (c *Context) Compute(tensors ...ml.Tensor) {
|
||||
if envconfig.Benchmark() {
|
||||
// Clear previous timings before new computation
|
||||
C.clear_timings()
|
||||
|
||||
C.ggml_backend_sched_set_eval_callback(
|
||||
c.sched,
|
||||
C.ggml_backend_eval_callback(C.debug_callback),
|
||||
nil,
|
||||
)
|
||||
}
|
||||
|
||||
C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
|
||||
|
||||
needSync := true
|
||||
|
Loading…
x
Reference in New Issue
Block a user