Track the execution time of individual tensor operations (views, copies, reshapes, etc.) during LLM forward passes using CGo bindings to the native graph runtime. This helps identify performance bottlenecks in the computation graph and optimize memory operations that can significantly impact inference latency.
232 lines
5.2 KiB
Go
232 lines
5.2 KiB
Go
package ml
|
|
|
|
import (
|
|
"bytes"
|
|
"cmp"
|
|
"encoding/binary"
|
|
"fmt"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// Config exposes typed, read-only access to model configuration values.
//
// Every lookup method takes a string followed by variadic values of the
// result type. NOTE(review): the string is presumably the configuration key
// and the variadic values optional defaults — confirm against an
// implementation, none is visible in this file.
type Config interface {
	// Architecture returns a string identifying the model architecture.
	Architecture() string

	// Scalar lookups.
	String(string, ...string) string
	Uint(string, ...uint32) uint32
	Float(string, ...float32) float32

	// Slice lookups.
	Strings(string, ...[]string) []string
	Uints(string, ...[]uint32) []uint32
}
|
|
|
|
type Backend interface {
|
|
Config() Config
|
|
Get(name string) Tensor
|
|
NewContext() Context
|
|
SystemInfo() string
|
|
}
|
|
|
|
// backends maps a backend name to the constructor registered for it.
// RegisterBackend adds entries and NewBackend looks them up.
// NOTE(review): the map is accessed without synchronization in this file;
// presumably registration only happens during package initialization —
// confirm.
var backends = make(map[string]func(*os.File) (Backend, error))
|
|
|
|
func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
|
|
if _, ok := backends[name]; ok {
|
|
panic("backend: backend already registered")
|
|
}
|
|
|
|
backends[name] = f
|
|
}
|
|
|
|
func NewBackend(f *os.File) (Backend, error) {
|
|
if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
|
|
return backend(f)
|
|
}
|
|
|
|
return nil, fmt.Errorf("unsupported backend")
|
|
}
|
|
|
|
type Context interface {
|
|
Zeros(dtype DType, shape ...int) Tensor
|
|
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
|
|
FromIntSlice(s []int32, shape ...int) (Tensor, error)
|
|
|
|
Forward(Tensor)
|
|
Compute(...Tensor)
|
|
MaxTensors() int
|
|
Close()
|
|
|
|
Timing() []OpTiming
|
|
}
|
|
|
|
// OpType names the category of operation performed during a forward pass.
// It is the value stored in OpTiming.Type.
type OpType string

// Operation categories. Each constant's value is its own name spelled as a
// string, except ComputeOp, whose value is "Compute".
const (
	View       OpType = "View"
	Copy       OpType = "Copy"
	Reshape    OpType = "Reshape"
	Permute    OpType = "Permute"
	Contiguous OpType = "Contiguous"
	Input      OpType = "Input"
	ComputeOp  OpType = "Compute"
	Transpose  OpType = "Transpose"
)
|
|
|
|
// OpTiming stores the timing information for a single operation.
|
|
type OpTiming struct {
|
|
Type OpType
|
|
Operation string
|
|
Duration float64
|
|
Order int
|
|
}
|
|
|
|
type Tensor interface {
|
|
Dim(n int) int
|
|
Stride(n int) int
|
|
|
|
Shape() []int
|
|
DType() DType
|
|
|
|
Bytes() []byte
|
|
Floats() []float32
|
|
|
|
Add(ctx Context, t2 Tensor) Tensor
|
|
Mul(ctx Context, t2 Tensor) Tensor
|
|
Mulmat(ctx Context, t2 Tensor) Tensor
|
|
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
|
|
|
|
Softmax(ctx Context) Tensor
|
|
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
|
|
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
|
|
Scale(ctx Context, s float64) Tensor
|
|
|
|
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
|
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
|
|
|
|
Tanh(ctx Context) Tensor
|
|
GELU(ctx Context) Tensor
|
|
SILU(ctx Context) Tensor
|
|
|
|
Reshape(ctx Context, shape ...int) Tensor
|
|
View(ctx Context, offset int, shape ...int) Tensor
|
|
Permute(ctx Context, shape ...int) Tensor
|
|
Contiguous(ctx Context) Tensor
|
|
|
|
Pad(ctx Context, shape ...int) Tensor
|
|
Unpad(ctx Context, shape ...int) Tensor
|
|
|
|
Stack(ctx Context, dim int, s ...Tensor) Tensor
|
|
Concat(ctx Context, t2 Tensor, dim int) Tensor
|
|
Rows(ctx Context, t2 Tensor) Tensor
|
|
Copy(ctx Context, t2 Tensor) Tensor
|
|
}
|
|
|
|
// number is a type constraint satisfied by every type whose underlying type
// is a built-in signed or unsigned integer (of the sizes listed), float, or
// complex type.
type number interface {
	~int | ~int8 | ~int16 | ~int32 | ~int64 |
		~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
		~float32 | ~float64 |
		~complex64 | ~complex128
}

// mul returns the product of all values in s. With no arguments it returns
// the multiplicative identity, 1.
func mul[T number](s ...T) T {
	product := T(1)
	for idx := 0; idx < len(s); idx++ {
		product *= s[idx]
	}

	return product
}
|
|
|
|
// DumpOptions controls how Dump renders a tensor. When Dump is called
// without options it uses Items: 3 and Precision: 4.
type DumpOptions struct {
	// Items is how many elements are printed at the start and at the end of
	// every dimension; elements in between are elided.
	Items int

	// Precision is the number of digits printed after the decimal point.
	// It only affects floating-point tensors.
	Precision int
}
|
|
|
|
func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
|
|
if len(opts) < 1 {
|
|
opts = append(opts, DumpOptions{
|
|
Items: 3,
|
|
Precision: 4,
|
|
})
|
|
}
|
|
|
|
switch t.DType() {
|
|
case DTypeF32:
|
|
return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
|
|
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
|
|
})
|
|
case DTypeF16:
|
|
f32 := ctx.Zeros(DTypeF32, t.Shape()...)
|
|
f32 = t.Copy(ctx, f32)
|
|
return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
|
|
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
|
|
})
|
|
case DTypeI32:
|
|
return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string {
|
|
return strconv.FormatInt(int64(i), 10)
|
|
})
|
|
default:
|
|
return "<unsupported>"
|
|
}
|
|
}
|
|
|
|
func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string) string {
|
|
if t.Bytes() == nil {
|
|
ctx.Forward(t)
|
|
ctx.Compute(t)
|
|
}
|
|
|
|
s := make(S, mul(t.Shape()...))
|
|
if err := binary.Read(bytes.NewBuffer(t.Bytes()), binary.LittleEndian, &s); err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
shape := t.Shape()
|
|
|
|
var sb strings.Builder
|
|
var f func([]int, int)
|
|
f = func(dims []int, stride int) {
|
|
prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
|
|
fmt.Fprint(&sb, "[")
|
|
defer func() { fmt.Fprint(&sb, "]") }()
|
|
for i := 0; i < dims[0]; i++ {
|
|
if i >= items && i < dims[0]-items {
|
|
fmt.Fprint(&sb, "..., ")
|
|
// skip to next printable element
|
|
skip := dims[0] - 2*items
|
|
if len(dims) > 1 {
|
|
stride += mul(append(dims[1:], skip)...)
|
|
fmt.Fprint(&sb, strings.Repeat("\n", len(dims)-1), prefix)
|
|
}
|
|
i += skip - 1
|
|
} else if len(dims) > 1 {
|
|
f(dims[1:], stride)
|
|
stride += mul(dims[1:]...)
|
|
if i < dims[0]-1 {
|
|
fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
|
|
}
|
|
} else {
|
|
fmt.Fprint(&sb, fn(s[stride+i]))
|
|
if i < dims[0]-1 {
|
|
fmt.Fprint(&sb, ", ")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
f(shape, 0)
|
|
|
|
return sb.String()
|
|
}
|
|
|
|
// DType identifies the element type of a Tensor.
type DType int

// Element types. DTypeOther is the zero value; it is the only value here
// that does not name a specific element type. Dump supports DTypeF32,
// DTypeF16 and DTypeI32.
const (
	DTypeOther DType = iota
	DTypeF32
	DTypeF16
	DTypeI32
)
|