Fixed over-allocation of VRAM due to small initial layer sizes.
Co-authored-by: Tej Kiran <kiran.tej@amd.com> Co-authored-by: Michael Yang <mxyng@pm.me> Co-authored-by: Tej Kiran <itej89@gmail.com>
This commit is contained in:
parent
8cc33f4c2b
commit
0478d440f0
@ -1,9 +1,12 @@
|
|||||||
package llm
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"cmp"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"maps"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -120,12 +123,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
layers := f.Tensors().GroupLayers()
|
layers := f.Tensors().GroupLayers()
|
||||||
// add one layer worth of memory as a buffer
|
// add one layer (choosing the max layer) worth of memory as a buffer
|
||||||
if blk0, ok := layers["blk.0"]; ok {
|
layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
|
||||||
layerSize = blk0.Size()
|
return cmp.Compare(a.Size(), b.Size())
|
||||||
} else {
|
}).Size()
|
||||||
slog.Warn("model missing blk.0 layer size")
|
|
||||||
}
|
|
||||||
|
|
||||||
var kvct string
|
var kvct string
|
||||||
if envconfig.FlashAttention() &&
|
if envconfig.FlashAttention() &&
|
||||||
@ -219,7 +220,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
// For all the layers, find where they can fit on the GPU(s)
|
// For all the layers, find where they can fit on the GPU(s)
|
||||||
for i := range int(f.KV().BlockCount()) {
|
for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
|
||||||
// Some models have inconsistent layer sizes
|
// Some models have inconsistent layer sizes
|
||||||
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
|
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
|
||||||
layerSize = blk.Size()
|
layerSize = blk.Size()
|
||||||
@ -229,6 +230,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
|
|
||||||
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
|
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
|
||||||
// Stop allocating on GPU(s) once we hit the users target NumGPU
|
// Stop allocating on GPU(s) once we hit the users target NumGPU
|
||||||
|
overflow += layerSize
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -245,13 +247,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
|
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(gpusWithSpace) == 0 {
|
||||||
|
overflow += layerSize
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if layerCount >= int(f.KV().BlockCount()) {
|
if layerCount >= int(f.KV().BlockCount()) {
|
||||||
fullyLoaded = true
|
fullyLoaded = true
|
||||||
} else {
|
|
||||||
for i := layerCount; i < int(f.KV().BlockCount()); i++ {
|
|
||||||
overflow += layerSize
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine if we need to consider output then find where it fits
|
// Determine if we need to consider output then find where it fits
|
||||||
|
Loading…
x
Reference in New Issue
Block a user