Compare commits
4 Commits
drifkin/ar...parth/pyth

| Author | SHA1 | Date |
|---|---|---|
|  | b4cd1118ab |  |
|  | 128c90d3ac |  |
|  | f5872a097c |  |
|  | 3ac5e0f102 |  |
278 fs/ggml/ggml.go
@@ -6,7 +6,6 @@ import (
     "fmt"
     "io"
     "log/slog"
-    "reflect"
     "slices"
     "strings"
 
@@ -53,80 +52,32 @@ func (kv KV) EmbeddingLength() uint64 {
     return uint64(kv.Uint("embedding_length"))
 }
 
-func (kv KV) HeadCounts() []uint64 {
-    return kv.UintOrArrayAsArray("attention.head_count", kv.BlockCount(), 1)
+func (kv KV) HeadCount() uint64 {
+    return uint64(kv.Uint("attention.head_count"))
 }
 
-func (kv KV) HeadCountKVs() []uint64 {
-    return kv.UintOrArrayAsArray("attention.head_count_kv", kv.BlockCount(), 1)
+func (kv KV) HeadCountKV() uint64 {
+    return uint64(kv.Uint("attention.head_count_kv", 1))
 }
 
-func (kv KV) EmbeddingHeadCount() []uint64 {
-    headCount := kv.HeadCounts()
-    embeddingHeadCount := make([]uint64, len(headCount))
-    for i, heads := range headCount {
-        if heads == 0 {
-            embeddingHeadCount[i] = 0
-        } else {
-            embeddingHeadCount[i] = kv.EmbeddingLength() / heads
-        }
+func (kv KV) EmbeddingHeadCount() uint64 {
+    if heads := kv.HeadCount(); heads > 0 {
+        return kv.EmbeddingLength() / heads
     }
 
-    return embeddingHeadCount
+    return 0
 }
 
-func (kv KV) FillArrayOrDefault(key string, defaultValue []uint64) []uint64 {
-    length := len(defaultValue)
-    if v, ok := keyValueUntyped(kv, key); ok {
-        switch v := v.(type) {
-        case uint32:
-            return FillArray(uint64(v), length)
-        case uint64:
-            return FillArray(v, length)
-        case int32:
-            return FillArray(uint64(v), length)
-        default:
-            slog.Warn("unsupported type", "key", key, "type", reflect.TypeOf(v))
-        }
-    }
-
-    return defaultValue
+func (kv KV) EmbeddingHeadCountK() uint64 {
+    return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
 }
 
-func (kv KV) EmbeddingHeadCountK() []uint64 {
-    return kv.FillArrayOrDefault("attention.key_length", kv.EmbeddingHeadCount())
+func (kv KV) EmbeddingHeadCountV() uint64 {
+    return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
 }
 
-func (kv KV) EmbeddingHeadCountV() []uint64 {
-    return kv.FillArrayOrDefault("attention.value_length", kv.EmbeddingHeadCount())
-}
-
-func (kv KV) GQAMax() uint64 {
-    heads := kv.HeadCounts()
-    headsKV := kv.HeadCountKVs()
-    if len(heads) != len(headsKV) {
-        slog.Warn("head count and head count kv are not the same length")
-        return 0
-    }
-    if len(heads) == 0 {
-        slog.Warn("head count is empty")
-        return 0
-    }
-
-    maxGQA := uint64(0)
-    for i := range heads {
-        head := heads[i]
-        headKV := headsKV[i]
-        if head == 0 || headKV == 0 {
-            return 0
-        }
-        gqa := head / headKV
-        if gqa > maxGQA {
-            maxGQA = gqa
-        }
-    }
-
-    return maxGQA
+func (kv KV) GQA() uint64 {
+    return kv.HeadCount() / kv.HeadCountKV()
 }
 
 func (kv KV) ContextLength() uint64 {
@@ -153,41 +104,6 @@ func (kv KV) Bool(key string, defaultValue ...bool) bool {
     return keyValue(kv, key, append(defaultValue, false)...)
 }
 
-func (kv KV) UintOrArrayAsArray(key string, n uint64, defaultSingleValue ...uint64) []uint64 {
-    var singleValue *uint64
-    if v, ok := keyValueUntyped(kv, key); ok {
-        switch v := v.(type) {
-        case *array:
-            switch v.values[0].(type) {
-            case int32, uint32, uint64:
-                values, ok := AsUint64Array(v.values)
-                if ok {
-                    return values
-                }
-            default:
-                slog.Warn("unexpected array value type", "key", key, "type", reflect.TypeOf(v))
-            }
-        case uint32:
-            val := uint64(v)
-            singleValue = &val
-        case int32:
-            val := uint64(v)
-            singleValue = &val
-        }
-    }
-    if singleValue == nil {
-        slog.Warn("falling back to default")
-        singleValue = &defaultSingleValue[0]
-    }
-
-    values := make([]uint64, n)
-    for i := range values {
-        values[i] = *singleValue
-    }
-
-    return values
-}
-
 func (kv KV) Strings(key string, defaultValue ...[]string) []string {
     r := keyValue(kv, key, &array{})
     s := make([]string, r.size)
@@ -225,24 +141,16 @@ func (kv KV) OllamaEngineRequired() bool {
 }
 
 func keyValue[T string | uint32 | uint64 | float32 | *array | bool](kv KV, key string, defaultValue ...T) T {
-    if val, ok := keyValueUntyped(kv, key); ok {
-        return val.(T)
-    }
-
-    slog.Warn("key not found", "key", key, "default", defaultValue[0])
-    return defaultValue[0]
-}
-
-func keyValueUntyped(kv KV, key string) (any, bool) {
     if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
         key = kv.Architecture() + "." + key
     }
 
     if val, ok := kv[key]; ok {
-        return val, true
+        return val.(T)
     }
 
-    return nil, false
+    slog.Warn("key not found", "key", key, "default", defaultValue[0])
+    return defaultValue[0]
 }
 
 type Tensors struct {
@@ -510,22 +418,12 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 
 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
     embedding := f.KV().EmbeddingLength()
-    heads := f.KV().HeadCounts()
-    headsKV := f.KV().HeadCountKVs()
+    heads := f.KV().HeadCount()
+    headsKV := f.KV().HeadCountKV()
     vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
 
     embeddingHeads := f.KV().EmbeddingHeadCount()
-    maxEmbeddingHeads, ok := MaxValue(embeddingHeads)
-    if !ok {
-        maxEmbeddingHeads = 1
-        slog.Warn("failed to get max embedding heads")
-    }
     embeddingHeadsK := f.KV().EmbeddingHeadCountK()
-    maxEmbeddingHeadsK, ok := MaxValue(embeddingHeadsK)
-    if !ok {
-        maxEmbeddingHeadsK = 1
-        slog.Warn("failed to get max embedding headsK")
-    }
     embeddingHeadsV := f.KV().EmbeddingHeadCountV()
 
     layers := f.Tensors().GroupLayers()
@@ -533,30 +431,19 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
     bytesPerElement := kvCacheBytesPerElement(kvCacheType)
     kv = make([]uint64, f.KV().BlockCount())
     for i := range kv {
-        kv[i] = uint64(float64(context*(embeddingHeadsK[i]+embeddingHeadsV[i])*headsKV[i]) * bytesPerElement)
-    }
-
-    maxHeads, ok := MaxValue(heads)
-    if !ok {
-        maxHeads = 1
-        slog.Warn("failed to get max heads")
-    }
-    maxHeadsKV, ok := MaxValue(headsKV)
-    if !ok {
-        maxHeadsKV = 1
-        slog.Warn("failed to get max headsKV")
+        kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
     }
 
     switch f.KV().Architecture() {
     case "llama":
         fullOffload = max(
-            4*batch*(1+4*embedding+context*(1+maxHeads)),
+            4*batch*(1+4*embedding+context*(1+heads)),
             4*batch*(embedding+vocab),
         )
 
         partialOffload = 4 * batch * embedding
         partialOffload += max(
-            4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*maxHeads+maxEmbeddingHeads*maxHeadsKV),
+            4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
             4*batch*(embedding+vocab)+embedding*vocab*105/128,
         )
 
@@ -564,16 +451,16 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
             // mixtral 8x22b
             ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
             partialOffload = max(
-                3*ffnGateExpsWeight.Size()+4*batch*(2*ff+maxHeadsKV+embedding+context+maxEmbeddingHeads*maxHeadsKV),
-                4*(context*batch*maxHeads+context*maxEmbeddingHeads*maxHeadsKV+batch*1024+maxEmbeddingHeads*maxHeadsKV*batch),
+                3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
+                4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
             )
         } else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
             // mixtral 8x7b
             ffnGateWeight1 := ffnGateWeight.Shape[1]
-            fullOffload = 4 * batch * (2 + 3*embedding + context*(1+maxHeads) + 2*maxHeadsKV + ffnGateWeight1)
+            fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
             partialOffload = max(
-                4*batch*(3+maxEmbeddingHeads*maxHeadsKV+embedding+context*(1+maxHeads)+ffnGateWeight1)+(embedding*embedding+3*embedding*maxHeadsKV*ffnGateWeight1)*9/16,
-                4*batch*(1+2*embedding+context*(1+maxHeads))+embedding*(6*context*maxHeadsKV/maxHeads+embedding*9/16),
+                4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
+                4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
             )
         }
     case "mllama":
@@ -582,7 +469,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
         crossAttentionLayers := f.KV().Uints("attention.cross_attention_layers")
         for i := range kv {
             if slices.Contains(crossAttentionLayers, uint32(i)) {
-                kv[i] = headsKV[i] * (embeddingHeadsK[i] + embeddingHeadsV[i]) *
+                kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
                     4 * // sizeof(float32)
                     visionTokens *
                     tiles
@@ -590,7 +477,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
         }
 
         fullOffload = max(
-            4*batch*(2+3*embedding+maxEmbeddingHeadsK*maxHeads+context*(1+maxHeads)),
+            4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
             // vocab graph
             4*batch*(embedding+vocab),
         )
@@ -604,23 +491,23 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 
         partialOffload = max(
             4*(batch*
-                (2*embedding+1+context*(1+maxHeads)+maxEmbeddingHeadsK*maxHeads)+
+                (2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
                 ropeFreqsCount+
-                maxEmbeddingHeadsK*context*maxHeadsKV),
+                embeddingHeadsK*context*headsKV),
             // vocab graph
             4*batch*(embedding+vocab)+embedding*vocab*105/128,
         )
     case "gemma", "gemma2", "gemma3":
         fullOffload = max(
             4*batch*(embedding+vocab),
-            4*batch*(2+context+context*maxHeads+2*embedding+2*maxEmbeddingHeadsK*maxHeads),
+            4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
         )
 
         partialOffload = max(
             4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
-            4*batch*(2*embedding+1+2*maxEmbeddingHeadsK*maxHeads+context+context*maxHeads)+
-                4*maxEmbeddingHeadsK*context*8+
-                embedding*embedding*maxEmbeddingHeadsK*maxHeads*9/16,
+            4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
+                4*embeddingHeadsK*context*8+
+                embedding*embeddingHeadsK*heads*9/16,
         )
 
         // Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
@@ -632,42 +519,42 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
                 // Every 6th layer is a global layer, which is the full context size that has already been set. The other
                 // layers are the smaller local (sliding) layers.
                 if (i+1)%gemma3GlobalCacheCount != 0 {
-                    kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK[i]+embeddingHeadsV[i])*headsKV[i]) * bytesPerElement)
+                    kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
                 }
             }
         }
     case "command-r":
         fullOffload = max(
             4*batch*(embedding+vocab),
-            4*batch*(2+4*embedding+context*(1+maxHeads)),
+            4*batch*(2+4*embedding+context*(1+heads)),
         )
 
         partialOffload = max(
             4*batch*(embedding+vocab)+embedding*vocab*105/128,
-            4*batch*(1+2*embedding+context*(1+maxHeads))+4*embedding*context+embedding*embedding*9/16,
+            4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
         )
     case "qwen2":
         fullOffload = max(
             4*batch*(embedding+vocab),
-            4*batch*(1+2*embedding+context+context*maxHeads),
+            4*batch*(1+2*embedding+context+context*heads),
         )
 
         partialOffload = max(
             4*batch*(embedding+vocab)+embedding*vocab*105/128,
-            4*(batch*(1+2*embedding+context*(1+maxHeads))+embedding*(1+context)),
+            4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
         )
     case "phi2":
         fullOffload = max(
             4*batch*(embedding+vocab),
-            4*batch*(1+4*embedding+context+context*maxHeads),
+            4*batch*(1+4*embedding+context+context*heads),
         )
 
         partialOffload = max(
             4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
-            4*batch*(2+3*embedding+context+context*maxHeads),
+            4*batch*(2+3*embedding+context+context*heads),
         )
     case "stablelm":
-        fullOffload = 4 * batch * (context*(1+maxHeads) + 3*embedding + 2)
+        fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
         partialOffload = max(
             4*batch*(vocab+2*embedding),
             fullOffload,
@@ -675,12 +562,12 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
     case "deepseek2":
         fullOffload = max(
             4*batch*(3*embedding+vocab),
-            4*batch*(3*embedding+2+context*(1+maxHeadsKV)+2*maxEmbeddingHeadsK*maxHeadsKV),
+            4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
         )
 
         partialOffload = max(
             4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
-            4*batch*(2*embedding+1+2*maxEmbeddingHeadsK*maxHeadsKV+context+context*maxHeadsKV)+4*maxEmbeddingHeadsK*context*maxHeadsKV+embedding*embedding*maxEmbeddingHeadsK*maxHeadsKV*9/16,
+            4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
         )
     case "chatglm":
         fullOffload = 4 * batch * (embedding + vocab)
@@ -691,8 +578,8 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
             4*batch*(2+
                 2*embedding+
                 context+
-                context*maxHeads+
-                maxEmbeddingHeadsK*maxHeads+
+                context*heads+
+                embeddingHeadsK*heads+
                 qkvBias.Shape[0]),
         )
 
@@ -700,11 +587,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
             partialOffload,
             4*batch*(1+
                 2*embedding+
-                maxEmbeddingHeadsK*maxHeads+
+                embeddingHeadsK*heads+
                 context+
-                context*maxHeads)+
-                4*maxEmbeddingHeadsK*context+
-                4*context*maxEmbeddingHeadsK+
+                context*heads)+
+                4*embeddingHeadsK*context+
+                4*context*embeddingHeadsK+
                 4*qkvBias.Shape[0],
         )
     }
@@ -776,15 +663,9 @@ func (f GGML) SupportsFlashAttention() bool {
     }
 
     // Check head counts match and are non-zero
-    headCount := f.KV().HeadCounts()
-    embeddingHeadCountK := f.KV().EmbeddingHeadCountK()
-    embeddingHeadCountV := f.KV().EmbeddingHeadCountV()
-    for i := range headCount {
-        if embeddingHeadCountK[i] != embeddingHeadCountV[i] {
-            return false
-        }
-    }
-    return true
+    headCountK := f.KV().EmbeddingHeadCountK()
+    headCountV := f.KV().EmbeddingHeadCountV()
+    return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }
 
 // kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
@@ -798,54 +679,3 @@ func kvCacheBytesPerElement(cacheType string) float64 {
         return 2 // f16 (default)
     }
 }
-
-func AsUint64Array(v []any) ([]uint64, bool) {
-    switch v[0].(type) {
-    case uint32:
-        values := make([]uint64, len(v))
-        for i, v := range v {
-            values[i] = uint64(v.(uint32))
-        }
-        return values, true
-    case uint64:
-        values := make([]uint64, len(v))
-        for i, v := range v {
-            values[i] = v.(uint64)
-        }
-        return values, true
-    case int32:
-        values := make([]uint64, len(v))
-        for i, val := range v {
-            val := val.(int32)
-            if val < 0 {
-                slog.Warn("negative value in int32 array", "value", val)
-                return nil, false
-            }
-            values[i] = uint64(val)
-        }
-        return values, true
-    }
-    return nil, false
-}
-
-func MaxValue(values []uint64) (uint64, bool) {
-    if len(values) == 0 {
-        return 0, false
-    }
-
-    max := values[0]
-    for _, v := range values {
-        if v > max {
-            max = v
-        }
-    }
-    return max, true
-}
-
-func FillArray[T any](value T, n int) []T {
-    values := make([]T, n)
-    for i := range values {
-        values[i] = value
-    }
-    return values
-}
@@ -149,7 +149,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     }
 
     if graphPartialOffload == 0 {
-        graphPartialOffload = f.KV().GQAMax() * kvTotal / 6
+        graphPartialOffload = f.KV().GQA() * kvTotal / 6
     }
     if graphFullOffload == 0 {
         graphFullOffload = graphPartialOffload
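For reference, after this change the per-layer KV-cache sizing in GraphSize collapses to a single scalar expression. Below is a standalone sketch of that formula; the function name and the example configuration are illustrative, not part of the diff.

```go
package main

import "fmt"

// Sketch only: the per-layer KV-cache size that GraphSize computes above,
// using scalar head counts instead of per-layer arrays.
func kvBytesPerLayer(context, embeddingHeadsK, embeddingHeadsV, headsKV uint64, bytesPerElement float64) uint64 {
	// context tokens * (K head dim + V head dim) * KV heads * bytes per element
	return uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
}

func main() {
	// Assumed example: 8192-token context, 128-dim K/V heads, 8 KV heads, f16 cache (2 bytes).
	fmt.Println(kvBytesPerLayer(8192, 128, 128, 8, 2)) // 33554432 bytes (32 MiB) per layer
}
```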
386 server/model.go
@@ -10,6 +10,7 @@ import (
     "log/slog"
     "net/http"
     "os"
+    "regexp"
     "slices"
     "strings"
     "text/template/parse"
@@ -153,99 +154,342 @@ func parseObjects(s string) []map[string]any {
     return objs
 }
 
-// parseToolCalls attempts to parse a JSON string into a slice of ToolCalls.
-// mxyng: this only really works if the input contains tool calls in some JSON format
-func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
-    // create a subtree from the node that ranges over .ToolCalls
+// Get tool call token from model template
+func (m *Model) TemplateToolToken() (string, string, bool) {
+    // Try to detect the tool call format from the model's template
     tmpl := m.Template.Subtree(func(n parse.Node) bool {
         if t, ok := n.(*parse.RangeNode); ok {
             return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls")
         }
 
         return false
     })
 
-    if tmpl == nil {
-        return nil, false
-    }
-
-    var b bytes.Buffer
-    if err := tmpl.Execute(&b, map[string][]api.ToolCall{
-        "ToolCalls": {
-            {
-                Function: api.ToolCallFunction{
-                    Name: "@@name@@",
-                    Arguments: api.ToolCallFunctionArguments{
-                        "@@argument@@": 1,
-                    },
-                },
-            },
-        },
-    }); err != nil {
-        return nil, false
-    }
-
-    templateObjects := parseObjects(b.String())
-    if len(templateObjects) == 0 {
-        return nil, false
-    }
-
-    // find the keys that correspond to the name and arguments fields
-    var name, arguments string
-    for k, v := range templateObjects[0] {
-        switch v.(type) {
-        case string:
-            name = k
-        case map[string]any:
-            arguments = k
-        }
-    }
-
-    if name == "" || arguments == "" {
-        return nil, false
-    }
-
-    responseObjects := parseObjects(s)
-    if len(responseObjects) == 0 {
-        return nil, false
-    }
-
-    // collect all nested objects
-    var collect func(any) []map[string]any
-    collect = func(obj any) (all []map[string]any) {
-        switch o := obj.(type) {
-        case map[string]any:
-            all = append(all, o)
-            for _, v := range o {
-                all = append(all, collect(v)...)
-            }
-        case []any:
-            for _, v := range o {
-                all = append(all, collect(v)...)
-            }
-        }
-
-        return all
-    }
-
-    var objs []map[string]any
-    for _, p := range responseObjects {
-        objs = append(objs, collect(p)...)
-    }
-
-    var toolCalls []api.ToolCall
-    for _, kv := range objs {
-        n, nok := kv[name].(string)
-        a, aok := kv[arguments].(map[string]any)
-        if nok && aok {
-            toolCalls = append(toolCalls, api.ToolCall{
-                Function: api.ToolCallFunction{
-                    Name:      n,
-                    Arguments: a,
-                },
-            })
-        }
-    }
-
-    return toolCalls, len(toolCalls) > 0
+    // fmt.Println("tool call template", tmpl)
+    if tmpl != nil {
+        // Execute template with test data to see the format
+        var b bytes.Buffer
+        if err := tmpl.Execute(&b, map[string][]api.ToolCall{
+            "ToolCalls": {
+                {
+                    Function: api.ToolCallFunction{
+                        Name: "function_name",
+                        Arguments: api.ToolCallFunctionArguments{
+                            "argument1": "value1",
+                            // "argument2": "value2",
+                        },
+                    },
+                },
+            },
+        }); err == nil {
+            // Look for special tokens in the template output
+            output := strings.TrimSpace(b.String())
+            slog.Debug("tool call template output", "output", output)
+            if strings.Contains(output, "<") {
+                // Extract the special token between < and >
+                start := strings.Index(output, "<")
+                end := strings.Index(output, ">")
+                if start >= 0 && end > start {
+                    token := output[start : end+1]
+                    return output, token, true
+                }
+            } else if strings.Contains(output, "[") {
+                // Check if it's a tool call token rather than JSON array
+                start := strings.Index(output, "[")
+                end := strings.Index(output, "]")
+                if start >= 0 && end > start {
+                    token := output[start : end+1]
+                    // Only consider it a token if it's not valid JSON
+                    var jsonTest any
+                    if err := json.Unmarshal([]byte(token), &jsonTest); err != nil {
+                        return output, token, true
+                    }
+                }
+            }
+        }
+    }
+    return "", "", false
+}
+
+func parsePythonFunctionCall(s string) ([]api.ToolCall, bool) {
+    re := regexp.MustCompile(`(\w+)\((.*?)\)`)
+    matches := re.FindAllStringSubmatchIndex(s, -1)
+    if len(matches) == 0 {
+        return nil, false
+    }
+
+    var toolCalls []api.ToolCall
+    for _, match := range matches {
+        name := s[match[2]:match[3]]
+        args := s[match[4]:match[5]]
+
+        arguments := make(api.ToolCallFunctionArguments)
+        if strings.Contains(args, "=") { // Keyword args
+            pairs := strings.SplitSeq(args, ",")
+            for pair := range pairs {
+                pair = strings.TrimSpace(pair)
+                kv := strings.Split(pair, "=")
+                if len(kv) == 2 {
+                    key := strings.TrimSpace(kv[0])
+                    value := strings.TrimSpace(kv[1])
+                    arguments[key] = value
+                }
+            }
+            toolCalls = append(toolCalls, api.ToolCall{
+                Function: api.ToolCallFunction{
+                    Name:      name,
+                    Arguments: arguments,
+                },
+            })
+        }
+    }
+
+    if len(toolCalls) > 0 {
+        return toolCalls, true
+    }
+    return nil, false
+}
+
+// ToolCallFormat represents different possible formats for tool calls
+type toolCallFormat struct {
+    // Direct format
+    Name      string         `json:"name,omitempty"`
+    Arguments map[string]any `json:"arguments,omitempty"`
+
+    // Command-r-plus format
+    ToolName   string         `json:"tool_name,omitempty"`
+    Parameters map[string]any `json:"parameters,omitempty"`
+
+    // Function format
+    Function *struct {
+        Name       string         `json:"name"`
+        Arguments  map[string]any `json:"arguments,omitempty"`
+        Parameters map[string]any `json:"parameters,omitempty"`
+    } `json:"function,omitempty"`
+
+    // Xlam format
+    ToolCalls []toolCallFormat `json:"tool_calls,omitempty"`
+}
+
+func parseJSONToolCalls(obj map[string]any) ([]api.ToolCall, bool) {
+    // Helper to convert any to []any safely
+    toArray := func(v any) []any {
+        if arr, ok := v.([]any); ok {
+            return arr
+        }
+        return nil
+    }
+
+    // Convert a single format to a tool call
+    makeToolCall := func(f toolCallFormat) (api.ToolCall, bool) {
+        switch {
+        case f.Name != "" && f.Arguments != nil:
+            return api.ToolCall{
+                Function: api.ToolCallFunction{
+                    Name:      f.Name,
+                    Arguments: f.Arguments,
+                },
+            }, true
+        case f.Name != "" && f.Parameters != nil: // Handle parameters field
+            return api.ToolCall{
+                Function: api.ToolCallFunction{
+                    Name:      f.Name,
+                    Arguments: f.Parameters,
+                },
+            }, true
+        case f.ToolName != "" && f.Parameters != nil:
+            return api.ToolCall{
+                Function: api.ToolCallFunction{
+                    Name:      f.ToolName,
+                    Arguments: f.Parameters,
+                },
+            }, true
+        case f.Function != nil && f.Function.Name != "":
+            args := f.Function.Arguments
+            if args == nil {
+                args = f.Function.Parameters
+            }
+            if args != nil {
+                return api.ToolCall{
+                    Function: api.ToolCallFunction{
+                        Name:      f.Function.Name,
+                        Arguments: args,
+                    },
+                }, true
+            }
+        }
+        return api.ToolCall{}, false
+    }
+
+    // Try parsing as array first
+    if arr := toArray(obj); arr != nil {
+        var calls []api.ToolCall
+        for _, item := range arr {
+            if itemMap, ok := item.(map[string]any); ok {
+                var format toolCallFormat
+                data, _ := json.Marshal(itemMap)
+                if err := json.Unmarshal(data, &format); err == nil {
+                    if call, ok := makeToolCall(format); ok {
+                        calls = append(calls, call)
+                    }
+                }
+            }
+        }
+        if len(calls) > 0 {
+            return calls, true
+        }
+    }
+
+    // Try parsing as single object
+    var format toolCallFormat
+    data, _ := json.Marshal(obj)
+    if err := json.Unmarshal(data, &format); err != nil {
+        return nil, false
+    }
+
+    // Handle xlam format (tool_calls array)
+    if len(format.ToolCalls) > 0 {
+        var calls []api.ToolCall
+        for _, f := range format.ToolCalls {
+            if call, ok := makeToolCall(f); ok {
+                calls = append(calls, call)
+            }
+        }
+        if len(calls) > 0 {
+            return calls, true
+        }
+    }
+
+    // Try as single tool call
+    if call, ok := makeToolCall(format); ok {
+        return []api.ToolCall{call}, true
+    }
+
+    return nil, false
+}
+
+// token, partial, success
+func deriveToolToken(s string, prefix string) (string, bool, bool) {
+    // There shouldn't be spaces in a tool token
+    if len(strings.Fields(s)) > 1 {
+        return "", false, false
+    }
+
+    if prefix == "[" && len(s) > 1 && s[len(s)-1] == ']' {
+        return s, false, true
+    } else if prefix == "<" && len(s) > 1 && s[len(s)-1] == '>' {
+        return s, false, true
+    }
+    return "", true, true
+}
+
+func parseJSON(s string) ([]api.ToolCall, bool) {
+    objs := parseObjects(s)
+    tcs := []api.ToolCall{}
+    for _, obj := range objs {
+        toolCalls, ok := parseJSONToolCalls(obj)
+        if ok {
+            tcs = append(tcs, toolCalls...)
+        }
+    }
+    if len(tcs) > 0 {
+        return tcs, true
+    }
+    return nil, false
+}
+
+// returns tool calls, partial, success
+func (m *Model) ParseToolCalls(s string, toolToken *string) ([]api.ToolCall, bool, bool) {
+    // [ case can either be JSON, Python or a Tool Token
+    s = strings.TrimSpace(s)
+    fmt.Printf("ParseToolCallsNew input: %q\n", s)
+    if len(s) == 0 {
+        return nil, false, false
+    }
+
+    if strings.HasPrefix(s, "[") {
+        fmt.Println("Found [ prefix")
+        // JSON case
+        // we do not consider array JSONs as tool calls
+        if strings.HasPrefix(s, "[{") {
+            fmt.Println("Found [{ prefix - attempting JSON parse")
+            // TODO: mark as JSON partial
+            if calls, ok := parseJSON(s); ok {
+                fmt.Printf("Successfully parsed JSON, found %d calls\n", len(calls))
+                return calls, false, true
+            }
+            return nil, true, true
+        }
+        // Python Case
+        // We just do a full python check here
+        fmt.Println("Attempting Python function parse")
+        tc, ok := parsePythonFunctionCall(s)
+        if ok {
+            fmt.Printf("Successfully parsed Python function: %+v\n", tc)
+            return tc, false, true
+        }
+        // Tool Token Case - this is okay if it's a real tool token and we couldn't get from template
+        fmt.Println("Attempting to derive tool token")
+        if toolToken == nil || *toolToken == "" {
+            toolTok, partial, ok := deriveToolToken(s, "[")
+            if !ok {
+                return nil, false, false
+            }
+            if partial {
+                return nil, true, true
+            }
+            *toolToken = toolTok
+        }
+        fmt.Printf("Found tool token: %q\n", *toolToken)
+        s = strings.TrimSpace(s[len(*toolToken):])
+        fmt.Printf("Recursing with remaining string: %q\n", s)
+        if toolCalls, partial, ok := m.ParseToolCalls(s, toolToken); ok {
+            return toolCalls, partial, true
+        }
+        return nil, true, true
+    } else if strings.HasPrefix(s, "{") || strings.HasPrefix(s, "```") {
+        // // TODO: temp fix
+        // if strings.HasPrefix(s, "```") && len(s) == 3 {
+        // 	return nil, false, false
+        // }
+        fmt.Println("Found { prefix - attempting JSON parse with ", s)
+        if calls, ok := parseJSON(s); ok {
+            fmt.Printf("Successfully parsed JSON object, found %d calls\n", len(calls))
+            return calls, false, true
+        }
+        fmt.Println("Failed to parse JSON in JSON case")
+        // TODO: possible case where it never finishes parsing - then what?
+        return nil, true, true
+    } else if strings.HasPrefix(s, "<") {
+        fmt.Println("Found < prefix - attempting to derive tool token")
+        if toolToken == nil || *toolToken == "" {
+            toolTok, partial, ok := deriveToolToken(s, "<")
+            if !ok {
+                return nil, false, false
+            }
+            if partial {
+                return nil, true, true
+            }
+            *toolToken = toolTok
+            fmt.Printf("Found tool token: %q\n", *toolToken)
+        }
+        fmt.Printf("Found tool token: %q\n", *toolToken)
+        s = strings.TrimSpace(s[len(*toolToken):])
+        fmt.Printf("Recursing with remaining string: %q\n", s)
+        if toolCalls, partial, ok := m.ParseToolCalls(s, toolToken); ok {
+            return toolCalls, partial, true
+        }
+        return nil, true, true
+    } else if strings.Contains(s, "(") || len(strings.Fields(s)) == 1 {
+        fmt.Println("Attempting Python function parse")
+        tc, ok := parsePythonFunctionCall(s)
+        if ok {
+            fmt.Printf("Successfully parsed Python function: %+v\n", tc)
+            return tc, false, true
+        }
+        fmt.Printf("Failed to parse Python function: %q, returning partial", s)
+        return nil, true, true
+    }
+    fmt.Println("No successful parse paths found")
+    fmt.Printf("failed string: %q\n", s)
+    return nil, false, false
 }
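For a concrete sense of what the new parsePythonFunctionCall accepts, here is a standalone sketch of the same regex-plus-keyword-argument approach. It uses strings.Split rather than the iterator-based strings.SplitSeq from the diff, and the input string is made up for illustration.

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// Standalone illustration of the keyword-argument parsing used by
// parsePythonFunctionCall above; it does not depend on the server package.
func main() {
	s := `get_weather(city=Paris, unit=celsius)` // hypothetical model output
	re := regexp.MustCompile(`(\w+)\((.*?)\)`)
	for _, m := range re.FindAllStringSubmatchIndex(s, -1) {
		name := s[m[2]:m[3]]
		args := map[string]string{}
		for _, pair := range strings.Split(s[m[4]:m[5]], ",") {
			if kv := strings.SplitN(strings.TrimSpace(pair), "=", 2); len(kv) == 2 {
				args[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
			}
		}
		fmt.Println(name, args) // get_weather map[city:Paris unit:celsius]
	}
}
```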
112 server/routes.go
@@ -1526,6 +1526,17 @@ func (s *Server) ChatHandler(c *gin.Context) {
         defer close(ch)
         var sb strings.Builder
         var toolCallIndex int = 0
+        var sentWithTools int = 0
+        // var prefix string
+        // var templateToolToken string
+        _, templateToolToken, _ := m.TemplateToolToken()
+        // fmt.Println("special token", templateToolToken)
+
+        var minDuration time.Duration = math.MaxInt64
+        var maxDuration time.Duration
+        var totalDuration time.Duration
+        var checkCount int
+        const maxToolTokens = 1
         if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
             Prompt: prompt,
             Images: images,
@@ -1546,6 +1557,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
             }
 
             if r.Done {
+                slog.Debug("min duration", "duration", minDuration)
+                slog.Debug("max duration", "duration", maxDuration)
+                slog.Debug("total duration", "duration", totalDuration)
+                slog.Debug("check count", "count", checkCount)
+                // slog.Debug("average duration", "duration", totalDuration/time.Duration(checkCount))
+                // if sb.Len() > 0 {
+                // 	res.Message.Content = sb.String()
+                // }
                 res.DoneReason = r.DoneReason.String()
                 res.TotalDuration = time.Since(checkpointStart)
                 res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
@@ -1563,25 +1582,48 @@ func (s *Server) ChatHandler(c *gin.Context) {
             // If tools are recognized, use a flag to track the sending of a tool downstream
             // This ensures that content is cleared from the message on the last chunk sent
             sb.WriteString(r.Content)
-            if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
-                res.Message.ToolCalls = toolCalls
-                for i := range toolCalls {
-                    toolCalls[i].Function.Index = toolCallIndex
-                    toolCallIndex++
+            startTime := time.Now()
+            // TODO: work max tool tok logic
+            if len(req.Tools) > 0 && sentWithTools < maxToolTokens {
+                toolCalls, partial, ok := m.ParseToolCalls(sb.String(), &templateToolToken)
+                duration := time.Since(startTime)
+                checkCount++
+                minDuration = min(minDuration, duration)
+                maxDuration = max(maxDuration, duration)
+                totalDuration += duration
+                slog.Debug("tool call duration", "duration", duration)
+                if ok {
+                    // fmt.Println("toolCalls", toolCalls, partial, ok, duration)
+                    if partial {
+                        // If the tool call is partial, we need to wait for the next chunk
+                        return
+                    }
+                    slog.Debug("toolCalls", "toolCalls", toolCalls, "partial", partial, "ok", ok)
+                    res.Message.ToolCalls = toolCalls
+                    for i := range toolCalls {
+                        toolCalls[i].Function.Index = toolCallIndex
+                        toolCallIndex++
+                    }
+                    sentWithTools = 0
+                    // prefix = ""
+                    templateToolToken = ""
+                    res.Message.Content = ""
+                    sb.Reset()
+                    ch <- res
+                    // TODO: revisit this
+                    sentWithTools++
+                    slog.Debug("fired on tool call", "toolCalls", toolCalls, "toolCallIndex", toolCallIndex)
+                    return
                 }
-                res.Message.Content = ""
-                sb.Reset()
-                ch <- res
-                return
             }
 
-            if r.Done {
-                // Send any remaining content if no tool calls were detected
-                if toolCallIndex == 0 {
-                    res.Message.Content = sb.String()
-                }
-                ch <- res
-            }
+            // Send any remaining content if no tool calls were detected
+            // if toolCallIndex == 0 {
+            // fmt.Println("toolCallIndex", toolCallIndex)
+            sentWithTools++
+            res.Message.Content = sb.String()
+            sb.Reset()
+            ch <- res
         }); err != nil {
             ch <- gin.H{"error": err.Error()}
         }
@@ -1590,11 +1632,33 @@ func (s *Server) ChatHandler(c *gin.Context) {
     if req.Stream != nil && !*req.Stream {
         var resp api.ChatResponse
         var sb strings.Builder
+        var toolCalls []api.ToolCall
+        const MAX_TOOL_TOKENS = 1
+        sentWithTools := 0
+        var tb strings.Builder
+        _, templateToolToken, _ := m.TemplateToolToken()
         for rr := range ch {
             switch t := rr.(type) {
             case api.ChatResponse:
                 sb.WriteString(t.Message.Content)
                 resp = t
+                // TODO: work max tool tok logic
+                if len(req.Tools) > 0 && sentWithTools < MAX_TOOL_TOKENS {
+                    tb.WriteString(t.Message.Content)
+                    if tcs, partial, ok := m.ParseToolCalls(tb.String(), &templateToolToken); ok {
+                        if !partial {
+                            // resp.Message.ToolCalls = toolCalls
+                            toolCalls = append(toolCalls, tcs...)
+                            resp.Message.Content = ""
+                            tb.Reset()
+                        }
+                    } else {
+                        // equivalent to no partial - send the content downstream
+                        tb.Reset()
+                        sentWithTools++
+
+                    }
+                }
             case gin.H:
                 msg, ok := t["error"].(string)
                 if !ok {
@@ -1610,14 +1674,18 @@ func (s *Server) ChatHandler(c *gin.Context) {
         }
 
         resp.Message.Content = sb.String()
-        if len(req.Tools) > 0 {
-            if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
-                resp.Message.ToolCalls = toolCalls
-                resp.Message.Content = ""
-            }
+        if len(toolCalls) > 0 {
+            resp.Message.ToolCalls = toolCalls
+            // resp.Message.Content = ""
         }
 
+        // if len(req.Tools) > 0 {
+        // 	if toolCalls, ok := m.ParseToolCalls(sb.String()); ok {
+        // 		resp.Message.ToolCalls = toolCalls
+        // 		resp.Message.Content = ""
+        // 	}
+        // }
+
         c.JSON(http.StatusOK, resp)
         return
     }
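Both handlers above follow the same accumulate-and-reparse pattern: buffer streamed chunks in a strings.Builder, re-run the tool-call parser, and hold output back while the parser reports a partial result. Below is a standalone sketch of that control flow; the parse function is a simplified stand-in, not the server's ParseToolCalls.

```go
package main

import (
	"fmt"
	"strings"
)

// parse is a stand-in for m.ParseToolCalls: it reports partial until the
// accumulated text contains a closing bracket, then succeeds.
func parse(s string) (result string, partial, ok bool) {
	if !strings.HasPrefix(s, "[") {
		return "", false, false
	}
	if !strings.Contains(s, "]") {
		return "", true, true // looks like a tool call, but incomplete
	}
	return s, false, true
}

func main() {
	var sb strings.Builder
	for _, chunk := range []string{"[TOOL", "_CALL] get_weather", "(city=Paris)"} {
		sb.WriteString(chunk)
		if result, partial, ok := parse(sb.String()); ok {
			if partial {
				continue // wait for the next chunk before emitting anything
			}
			fmt.Println("parsed:", result)
			sb.Reset()
		}
	}
}
```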