Compare commits
1 Commits
main
...
brucemacd/
Author | SHA1 | Date | |
---|---|---|---|
![]() |
057cc54b66 |
86
benchmark/ggml_backend_benchmark_test.go
Normal file
86
benchmark/ggml_backend_benchmark_test.go
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
package backend
|
||||||
|
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/ml"
|
||||||
|
"github.com/ollama/ollama/model"
|
||||||
|
"github.com/ollama/ollama/server"
|
||||||
|
|
||||||
|
_ "github.com/ollama/ollama/model/models/llama"
|
||||||
|
)
|
||||||
|
|
||||||
|
var modelName = flag.String("m", "", "Name of the model to benchmark")
|
||||||
|
|
||||||
|
// suppressOutput silences the process-wide output sinks: os.Stdout and
// os.Stderr are set to nil and the standard logger is pointed at io.Discard.
//
// It returns a cleanup function that puts os.Stdout, os.Stderr and the
// standard logger's writer back to the values they had when suppressOutput
// was called.
func suppressOutput() (cleanup func()) {
	oldStdout, oldStderr := os.Stdout, os.Stderr
	// Remember where the logger was writing so cleanup restores exactly that
	// destination instead of assuming it was os.Stderr.
	oldLogOutput := log.Writer()

	os.Stdout, os.Stderr = nil, nil
	log.SetOutput(io.Discard)

	return func() {
		os.Stdout, os.Stderr = oldStdout, oldStderr
		log.SetOutput(oldLogOutput)
	}
}
|
||||||
|
|
||||||
|
func setupModel(b *testing.B) model.Model {
|
||||||
|
if *modelName == "" {
|
||||||
|
b.Fatal("Error: -m flag is required for benchmark tests")
|
||||||
|
}
|
||||||
|
|
||||||
|
sm, err := server.GetModel(*modelName)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
m, err := model.New(sm.ModelPath)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkGGMLOperations(b *testing.B) {
|
||||||
|
// loading the GGML back-end logs to standard out and makes the bench output messy
|
||||||
|
cleanup := suppressOutput()
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
b.Setenv("OLLAMA_BENCHMARK", "1")
|
||||||
|
b.Setenv("OLLAMA_BACKEND", "ggml")
|
||||||
|
|
||||||
|
m := setupModel(b)
|
||||||
|
|
||||||
|
// Sample input data
|
||||||
|
inputIDs := []int32{1, 2, 3, 4, 5}
|
||||||
|
options := model.Options{
|
||||||
|
Inputs: inputIDs,
|
||||||
|
Positions: []int32{1, 2, 3, 4, 5},
|
||||||
|
Sequences: []int{1, 1, 1, 1, 1},
|
||||||
|
Outputs: []int32{int32(len(inputIDs) - 1)},
|
||||||
|
}
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
for range b.N {
|
||||||
|
ctx := m.Backend().NewContext()
|
||||||
|
defer ctx.Close()
|
||||||
|
|
||||||
|
modelOutput, err := model.Forward(ctx, m, options)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(fmt.Errorf("forward pass failed: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx.Compute(modelOutput)
|
||||||
|
|
||||||
|
for _, op := range ctx.Timing() {
|
||||||
|
b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -167,6 +167,8 @@ var (
|
|||||||
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
|
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
|
||||||
// Enable the new Ollama engine
|
// Enable the new Ollama engine
|
||||||
NewEngine = Bool("OLLAMA_NEW_ENGINE")
|
NewEngine = Bool("OLLAMA_NEW_ENGINE")
|
||||||
|
// Ollama is running in a benchmark context, additional timing data will be collected.
|
||||||
|
Benchmark = Bool("OLLAMA_BENCHMARK")
|
||||||
)
|
)
|
||||||
|
|
||||||
func String(s string) func() string {
|
func String(s string) func() string {
|
||||||
|
@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
|
|||||||
return 10
|
return 10
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *testContext) Timing() []ml.OpTiming {
|
||||||
|
return []ml.OpTiming{}
|
||||||
|
}
|
||||||
|
|
||||||
func (c *testContext) Close() {}
|
func (c *testContext) Close() {}
|
||||||
|
|
||||||
type testTensor struct {
|
type testTensor struct {
|
||||||
|
@ -2,6 +2,7 @@ package ml
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"cmp"
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func NewBackend(f *os.File) (Backend, error) {
|
func NewBackend(f *os.File) (Backend, error) {
|
||||||
if backend, ok := backends["ggml"]; ok {
|
if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
|
||||||
return backend(f)
|
return backend(f)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -53,6 +54,30 @@ type Context interface {
|
|||||||
Compute(...Tensor)
|
Compute(...Tensor)
|
||||||
MaxTensors() int
|
MaxTensors() int
|
||||||
Close()
|
Close()
|
||||||
|
|
||||||
|
Timing() []OpTiming
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpType is the type of operation performed during a forward pass.
type OpType string

// Operation categories used for OpTiming.Type.
const (
	// View is a memory view.
	View OpType = "View"
	// Copy is a memory copy.
	Copy OpType = "Copy"
	// Reshape is a reshape.
	Reshape OpType = "Reshape"
	// Permute is a permutation of dimensions.
	Permute OpType = "Permute"
	// Contiguous makes a tensor contiguous.
	Contiguous OpType = "Contiguous"
	// Input is an input tensor.
	Input OpType = "Input"
	// ComputeOp is a computation.
	ComputeOp OpType = "Compute"
	// Transpose is a transpose.
	Transpose OpType = "Transpose"
)
|
||||||
|
|
||||||
|
// OpTiming stores the timing information for a single operation.
|
||||||
|
type OpTiming struct {
|
||||||
|
Type OpType
|
||||||
|
Operation string
|
||||||
|
Duration float64
|
||||||
|
Order int
|
||||||
}
|
}
|
||||||
|
|
||||||
type Tensor interface {
|
type Tensor interface {
|
||||||
|
@ -4,6 +4,8 @@ package ggml
|
|||||||
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
|
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <string.h>
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-cpu.h"
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fixed-size storage for timing data.
// MAX_TENSOR_NAME is the size, in bytes, of the buffer holding a tensor name.
#define MAX_TENSOR_NAME 256
// MAX_TIMINGS is the maximum number of samples that can be stored.
#define MAX_TIMINGS 1000

// One timing sample: the tensor's name and the measured duration in
// milliseconds.
typedef struct {
    char tensor_name[MAX_TENSOR_NAME];
    double duration_ms;
} timing_entry;

// All samples recorded so far; only the first `count` entries are valid.
typedef struct {
    timing_entry entries[MAX_TIMINGS];
    int count;
} timing_data;

// Global timing data structure
timing_data g_timings = {0};
|
||||||
|
|
||||||
|
double get_time_ms() {
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
static double start_time;
|
||||||
|
static char current_tensor[MAX_TENSOR_NAME];
|
||||||
|
|
||||||
|
if (ask) {
|
||||||
|
start_time = get_time_ms();
|
||||||
|
strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
|
||||||
|
current_tensor[MAX_TENSOR_NAME - 1] = '\0';
|
||||||
|
} else {
|
||||||
|
double end_time = get_time_ms();
|
||||||
|
double duration = end_time - start_time;
|
||||||
|
|
||||||
|
if (g_timings.count < MAX_TIMINGS) {
|
||||||
|
strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
|
||||||
|
g_timings.entries[g_timings.count].duration_ms = duration;
|
||||||
|
g_timings.count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void clear_timings() {
|
||||||
|
g_timings.count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
*/
|
*/
|
||||||
import "C"
|
import "C"
|
||||||
|
|
||||||
@ -29,9 +79,11 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/envconfig"
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
fs "github.com/ollama/ollama/fs/ggml"
|
fs "github.com/ollama/ollama/fs/ggml"
|
||||||
"github.com/ollama/ollama/ml"
|
"github.com/ollama/ollama/ml"
|
||||||
@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
|
|||||||
C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
|
C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Timing retrieves the collected timing data
|
||||||
|
func (c *Context) Timing() []ml.OpTiming {
|
||||||
|
sequence := make([]ml.OpTiming, C.g_timings.count)
|
||||||
|
|
||||||
|
for i := range int(C.g_timings.count) {
|
||||||
|
entry := C.g_timings.entries[i]
|
||||||
|
tensorName := C.GoString(&entry.tensor_name[0])
|
||||||
|
|
||||||
|
// Determine operation type and description based on tensor name
|
||||||
|
var opType ml.OpType
|
||||||
|
var opDesc string
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case strings.Contains(tensorName, "(view)"):
|
||||||
|
opType, opDesc = ml.View, "Memory view"
|
||||||
|
case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
|
||||||
|
opType, opDesc = ml.Copy, "Memory copy"
|
||||||
|
case strings.Contains(tensorName, "(reshaped)"):
|
||||||
|
opType, opDesc = ml.Reshape, "Reshape"
|
||||||
|
case strings.Contains(tensorName, "(permuted)"):
|
||||||
|
opType, opDesc = ml.Permute, "Permute dimensions"
|
||||||
|
case strings.Contains(tensorName, "(cont)"):
|
||||||
|
opType, opDesc = ml.Contiguous, "Make contiguous"
|
||||||
|
case strings.Contains(tensorName, "(transposed)"):
|
||||||
|
opType, opDesc = ml.Transpose, "Transpose"
|
||||||
|
case strings.HasPrefix(tensorName, "leaf_"):
|
||||||
|
opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
|
||||||
|
case strings.HasPrefix(tensorName, "node_"):
|
||||||
|
opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
|
||||||
|
default:
|
||||||
|
opType, opDesc = "Unknown", tensorName
|
||||||
|
}
|
||||||
|
|
||||||
|
sequence[i] = ml.OpTiming{
|
||||||
|
Type: opType,
|
||||||
|
Operation: opDesc,
|
||||||
|
Duration: float64(entry.duration_ms),
|
||||||
|
Order: i,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequence
|
||||||
|
}
|
||||||
|
|
||||||
func (c *Context) Compute(tensors ...ml.Tensor) {
|
func (c *Context) Compute(tensors ...ml.Tensor) {
|
||||||
|
if envconfig.Benchmark() {
|
||||||
|
// Clear previous timings before new computation
|
||||||
|
C.clear_timings()
|
||||||
|
|
||||||
|
C.ggml_backend_sched_set_eval_callback(
|
||||||
|
c.sched,
|
||||||
|
C.ggml_backend_eval_callback(C.debug_callback),
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
|
C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
|
||||||
|
|
||||||
needSync := true
|
needSync := true
|
||||||
|
Loading…
x
Reference in New Issue
Block a user