memory
This commit is contained in:
parent
7ba9fa9c7d
commit
f0ad49ea17
@ -430,7 +430,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
}
|
}
|
||||||
|
|
||||||
switch f.KV().Architecture() {
|
switch f.KV().Architecture() {
|
||||||
case "llama":
|
case "llama", "llama4":
|
||||||
fullOffload = max(
|
fullOffload = max(
|
||||||
4*batch*(1+4*embedding+context*(1+heads)),
|
4*batch*(1+4*embedding+context*(1+heads)),
|
||||||
4*batch*(embedding+vocab),
|
4*batch*(embedding+vocab),
|
||||||
@ -444,7 +444,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
|
|
||||||
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
|
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
|
||||||
// mixtral 8x22b
|
// mixtral 8x22b
|
||||||
ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
|
ff := uint64(f.KV().Uint("feed_forward_length"))
|
||||||
partialOffload = max(
|
partialOffload = max(
|
||||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
|
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
|
||||||
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
|
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
|
||||||
@ -640,6 +640,9 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
|||||||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||||
embeddingLength*patchSize +
|
embeddingLength*patchSize +
|
||||||
numPatches*numPatches*headCount)
|
numPatches*numPatches*headCount)
|
||||||
|
case "llama4":
|
||||||
|
// vision graph is computed independently in the same schedule
|
||||||
|
// and is negligible compared to the worst case text graph
|
||||||
}
|
}
|
||||||
|
|
||||||
return weights, graphSize
|
return weights, graphSize
|
||||||
|
Loading…
x
Reference in New Issue
Block a user