Move quantization to new backend (#10363)

* Move quantization logic to GGML via new backend

  This moves the model-aware logic to Go code and calls GGML's
  quantization code for model creation.

* Remove "add model quantizations" patch

  This is no longer needed now that quantization is implemented
  directly in Go + GGML code.
parent 95e744beeb
commit 424810450f
@@ -162,7 +162,11 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		if resp.Digest != "" {
 			bar, ok := bars[resp.Digest]
 			if !ok {
-				bar = progress.NewBar(fmt.Sprintf("pulling %s...", resp.Digest[7:19]), resp.Total, resp.Completed)
+				msg := resp.Status
+				if msg == "" {
+					msg = fmt.Sprintf("pulling %s...", resp.Digest[7:19])
+				}
+				bar = progress.NewBar(msg, resp.Total, resp.Completed)
 				bars[resp.Digest] = bar
 				p.Add(resp.Digest, bar)
 			}
@@ -4,9 +4,9 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
 	"io/fs"
 	"log/slog"
+	"os"
 	"slices"
 	"strings"
 
@@ -89,7 +89,7 @@ type ModelConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(*Tokenizer) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []ggml.Tensor
+	Tensors([]Tensor) []*ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
@@ -106,13 +106,13 @@ type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(ggml.KV) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []ggml.Tensor
+	Tensors([]Tensor) []*ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
 }
 
-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
+func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err
@@ -147,14 +147,14 @@ func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
 		return err
 	}
 
-	return writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
+	return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
 }
 
 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
+func ConvertModel(fsys fs.FS, f *os.File) error {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
 		return err
@@ -239,13 +239,13 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		return err
 	}
 
-	return writeFile(ws, conv.KV(t), conv.Tensors(ts))
+	return writeFile(f, conv.KV(t), conv.Tensors(ts))
 }
 
-func writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
 	for i := range ts {
 		ts[i].Shape = slices.Clone(ts[i].Shape)
 		slices.Reverse(ts[i].Shape)
 	}
-	return ggml.WriteGGUF(ws, kv, ts)
+	return ggml.WriteGGUF(f, kv, ts)
 }
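Note: ConvertModel, ConvertAdapter, and writeFile now take an *os.File rather than an io.WriteSeeker. The reason lands in fs/ggml/gguf.go below: WriteGGUF now writes tensor payloads concurrently through io.NewOffsetWriter, which requires an io.WriterAt, and *os.File satisfies both interfaces while a plain io.WriteSeeker does not. A minimal sketch of that constraint (the package and function names here are illustrative only):

    package sketch

    import (
        "io"
        "os"
    )

    // interfaces shows why *os.File is the narrowest convenient type:
    // the header is written sequentially (WriteSeeker) while tensor
    // payloads are written at precomputed offsets (WriterAt).
    func interfaces(f *os.File) {
        var _ io.WriteSeeker = f // sequential header/KV writes
        var _ io.WriterAt = f    // concurrent tensor writes
        w := io.NewOffsetWriter(f, 4096) // writes land at file offset 4096+
        _ = w
    }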
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
 			continue
 		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *commandrModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 	for _, t := range ts {
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 	for _, t := range ts {
 		if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -21,8 +21,8 @@ func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
 	return kv
 }
 
-func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -126,11 +126,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 
 	if p.RopeScaling.factors != nil {
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:     "rope_freqs.weight",
 			Kind:     0,
 			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -145,7 +145,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
 		}
 	}
 
-	out = append(out, ggml.Tensor{
+	out = append(out, &ggml.Tensor{
 		Name:  t.Name(),
 		Kind:  t.Kind(),
 		Shape: t.Shape(),
@@ -88,13 +88,13 @@ func (p *llama4Model) Replacements() []string {
 }
 
 // Tensors implements ModelConverter.
-func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 
 	var textTensors []Tensor
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
-			out = append(out, ggml.Tensor{
+			out = append(out, &ggml.Tensor{
 				Name:  t.Name(),
 				Kind:  t.Kind(),
 				Shape: t.Shape(),
@@ -112,7 +112,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
 			// clone tensor since we need separate repackers
 			tt := t.Clone()
 			tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim)))
-			out = append(out, ggml.Tensor{
+			out = append(out, &ggml.Tensor{
 				Name:  strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name),
 				Kind:  tt.Kind(),
 				Shape: newShape,
@@ -125,7 +125,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
 		t.SetRepacker(p.repack())
 		newShape := slices.Clone(t.Shape())
 		newShape[1], newShape[2] = newShape[2], newShape[1]
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: newShape,
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
 	return kv
 }
 
-func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: shape,
@@ -89,8 +89,8 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 
 	for _, t := range ts {
 		if !strings.HasPrefix(t.Name(), "v.") {
@@ -100,7 +100,7 @@ func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
 			}
 		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
 		return true
 	})
 
-	var out []ggml.Tensor
+	var out []*ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  n,
 			Kind:  e[0].Kind(),
 			Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
 	var addRopeFactors sync.Once
 
-	out := make([]ggml.Tensor, 0, len(ts)+2)
+	out := make([]*ggml.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, ggml.Tensor{
+				out = append(out, &ggml.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, ggml.Tensor{
+				}, &ggml.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 			})
 		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -45,10 +45,10 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 	for _, t := range ts {
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
@@ -36,12 +36,12 @@ func (kv KV) ParameterCount() uint64 {
 	return keyValue(kv, "general.parameter_count", uint64(0))
 }
 
-func (kv KV) FileType() fileType {
+func (kv KV) FileType() FileType {
 	if t := kv.Uint("general.file_type"); t > 0 {
-		return fileType(t)
+		return FileType(t)
 	}
 
-	return fileTypeUnknown
+	return FileTypeUnknown
 }
 
 func (kv KV) BlockCount() uint64 {
@@ -226,7 +226,11 @@ func (t Tensor) block() (n int) {
 }
 
 func (t Tensor) blockSize() uint64 {
-	switch t.Kind {
+	return (TensorType)(t.Kind).BlockSize()
+}
+
+func (t TensorType) BlockSize() uint64 {
+	switch t {
 	case
 		0, // F32
 		1, // F16
@@ -252,73 +256,77 @@ func (t Tensor) blockSize() uint64 {
 }
 
 func (t Tensor) typeSize() uint64 {
-	blockSize := t.blockSize()
+	return TensorType(t.Kind).TypeSize()
+}
 
-	switch t.Kind {
-	case 0: // FP32
+func (t TensorType) TypeSize() uint64 {
+	blockSize := t.BlockSize()
+
+	switch t {
+	case TensorTypeF32:
 		return 4
-	case 1: // FP16
+	case TensorTypeF16:
 		return 2
-	case 2: // Q4_0
+	case TensorTypeQ4_0:
 		return 2 + blockSize/2
-	case 3: // Q4_1
+	case TensorTypeQ4_1:
 		return 2 + 2 + blockSize/2
-	case 6: // Q5_0
+	case TensorTypeQ5_0:
 		return 2 + 4 + blockSize/2
-	case 7: // Q5_1
+	case TensorTypeQ5_1:
 		return 2 + 2 + 4 + blockSize/2
-	case 8: // Q8_0
+	case TensorTypeQ8_0:
 		return 2 + blockSize
-	case 9: // Q8_1
+	case TensorTypeQ8_1:
 		return 2 + 2 + blockSize
-	case 10: // Q2_K
+	case TensorTypeQ2_K:
 		return blockSize/16 + blockSize/4 + 2 + 2
-	case 11: // Q3_K
+	case TensorTypeQ3_K:
 		return blockSize/8 + blockSize/4 + 12 + 2
-	case 12: // Q4_K
+	case TensorTypeQ4_K:
 		return 2 + 2 + 12 + blockSize/2
-	case 13: // Q5_K
+	case TensorTypeQ5_K:
 		return 2 + 2 + 12 + blockSize/8 + blockSize/2
-	case 14: // Q6_K
+	case TensorTypeQ6_K:
 		return blockSize/2 + blockSize/4 + blockSize/16 + 2
-	case 15: // Q8_K
+	case TensorTypeQ8_K:
 		return 4 + blockSize + 2*blockSize/16
-	case 16: // IQ2_XXS
+	case tensorTypeIQ2_XXS:
 		return 2 + 2*blockSize/8
-	case 17: // IQ2_XS
+	case tensorTypeIQ2_XS:
 		return 2 + 2*blockSize/8 + blockSize/32
-	case 18: // IQ3_XXS
+	case tensorTypeIQ3_XXS:
 		return 2 + blockSize/4 + blockSize/8
-	case 19: // IQ1_S
+	case tensorTypeIQ1_S:
 		return 2 + blockSize/8 + blockSize/16
-	case 20: // IQ4_NL
+	case tensorTypeIQ4_NL:
 		return 2 + blockSize/2
-	case 21: // IQ3_S
+	case tensorTypeIQ3_S:
 		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
-	case 22: // IQ2_S
+	case tensorTypeIQ2_S:
 		return 2 + blockSize/4 + blockSize/16
-	case 23: // IQ4_XS
+	case tensorTypeIQ4_XS:
 		return 2 + 2 + blockSize/2 + blockSize/64
-	case 24: // I8
+	case TensorTypeI8:
 		return 1
-	case 25: // I16
+	case TensorTypeI16:
 		return 2
-	case 26: // I32
+	case TensorTypeI32:
 		return 4
-	case 27: // I64
+	case TensorTypeI64:
 		return 8
-	case 28: // F64
+	case TensorTypeF64:
 		return 8
-	case 29: // IQ1_M
+	case tensorTypeIQ1_M:
 		return blockSize/8 + blockSize/16 + blockSize/32
-	case 30: // BF16
+	case TensorTypeBF16:
 		return 2
 	default:
 		return 0
 	}
 }
 
-func (t Tensor) parameters() uint64 {
+func (t Tensor) Elements() uint64 {
 	var count uint64 = 1
 	for _, n := range t.Shape {
 		count *= n
@@ -327,11 +335,11 @@ func (t Tensor) parameters() uint64 {
 }
 
 func (t Tensor) Size() uint64 {
-	return t.parameters() * t.typeSize() / t.blockSize()
+	return t.Elements() * t.typeSize() / t.blockSize()
 }
 
 func (t Tensor) Type() string {
-	return fileType(t.Kind).String()
+	return TensorType(t.Kind).String()
 }
 
 type container interface {
@@ -480,7 +488,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) {
 	var ropeFreqsCount uint64
 	if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
 		if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
-			ropeFreqsCount = ropeFreqsWeights.parameters()
+			ropeFreqsCount = ropeFreqsWeights.Elements()
 		}
 	}
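Note: with BlockSize and TypeSize now exported on TensorType, the size arithmetic is easy to check by hand. For Q4_0, BlockSize() is 32 and TypeSize() is 2 + 32/2 = 18 bytes per block (a float16 scale plus 32 packed 4-bit values), so a 4096x4096 tensor occupies 4096*4096*18/32 = 9,437,184 bytes. A small sketch against the exported API (assuming the module path github.com/ollama/ollama):

    package sketch

    import "github.com/ollama/ollama/fs/ggml"

    // q4_0Size mirrors Tensor.Size: elements * TypeSize / BlockSize.
    func q4_0Size(shape []uint64) uint64 {
        t := ggml.TensorTypeQ4_0
        elements := uint64(1)
        for _, n := range shape {
            elements *= n
        }
        return elements * t.TypeSize() / t.BlockSize() // {4096, 4096} -> 9437184
    }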
fs/ggml/gguf.go (109 changed lines)

@@ -9,8 +9,12 @@ import (
 	"io"
 	"log/slog"
 	"maps"
+	"os"
+	"runtime"
 	"slices"
 	"strings"
 
+	"golang.org/x/sync/errgroup"
 )
 
 type containerGGUF struct {
@@ -225,7 +229,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	}
 
 	llm.tensors = append(llm.tensors, &tensor)
-	llm.parameters += tensor.parameters()
+	llm.parameters += tensor.Elements()
 }
 
 // patch KV with parameter count
@@ -488,25 +492,38 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
 		return err
 	}
 
+	if t == ggufTypeString {
+		for _, e := range any(s).([]string) {
+			if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil {
+				return err
+			}
+
+			if err := binary.Write(w, binary.LittleEndian, []byte(e)); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+
 	return binary.Write(w, binary.LittleEndian, s)
 }
 
-func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
+func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 	alignment := kv.Uint("general.alignment", 32)
 
-	if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil {
 		return err
 	}
 
-	if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil {
 		return err
 	}
 
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil {
 		return err
 	}
 
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
 		return err
 	}
 
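Note: the new string branch in writeGGUFArray encodes each element the way GGUF represents strings on the wire: a little-endian uint64 byte length followed by the raw bytes, with no terminator. A standalone sketch of just that framing (writer-agnostic; only the encoding shown in the hunk above is assumed):

    package sketch

    import (
        "encoding/binary"
        "io"
    )

    // writeStrings frames each string as uint64(len) + bytes, matching
    // the fast path added to writeGGUFArray.
    func writeStrings(w io.Writer, s []string) error {
        for _, e := range s {
            if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil {
                return err
            }
            if _, err := w.Write([]byte(e)); err != nil {
                return err
            }
        }
        return nil
    }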
@@ -514,12 +531,12 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 	slices.Sort(keys)
 
 	for _, key := range keys {
-		if err := ggufWriteKV(ws, key, kv[key]); err != nil {
+		if err := ggufWriteKV(f, key, kv[key]); err != nil {
 			return err
 		}
 	}
 
-	slices.SortStableFunc(ts, func(a, b Tensor) int {
+	slices.SortStableFunc(ts, func(a, b *Tensor) int {
 		if i, j := a.block(), b.block(); i < 0 && j > 0 {
 			return 1
 		} else if i > 0 && j < 0 {
@@ -530,22 +547,34 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 	})
 
 	var s uint64
-	for _, t := range ts {
-		t.Offset = s
-		if err := ggufWriteTensorInfo(ws, t); err != nil {
+	for i := range ts {
+		ts[i].Offset = s
+		if err := ggufWriteTensorInfo(f, ts[i]); err != nil {
 			return err
 		}
-		s += t.Size()
+		s += ts[i].Size()
 		s += uint64(ggufPadding(int64(s), int64(alignment)))
 	}
 
+	offset, err := f.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return err
+	}
+	offset += ggufPadding(offset, int64(alignment))
+
+	var g errgroup.Group
+	g.SetLimit(runtime.GOMAXPROCS(0))
+	// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
 	for _, t := range ts {
-		if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil {
+		t := t
+		w := io.NewOffsetWriter(f, offset+int64(t.Offset))
+		g.Go(func() error {
+			_, err = t.WriteTo(w)
 			return err
-		}
+		})
 	}
 
-	return nil
+	return g.Wait()
 }
 
 func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
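Note: this hunk is the payoff of the *os.File change. Tensor offsets are computed up front in the info pass, and the payload pass then fans out over errgroup, each goroutine writing through an io.NewOffsetWriter anchored at the tensor's precomputed absolute offset. A reduced sketch of the pattern with io.WriterTo standing in for *Tensor (names here are illustrative):

    package sketch

    import (
        "io"
        "os"
        "runtime"

        "golang.org/x/sync/errgroup"
    )

    // writeParallel writes each payload at base+offsets[i]. Offsets must
    // be disjoint; every goroutine owns its own OffsetWriter, so no seek
    // position is shared between writers.
    func writeParallel(f *os.File, base int64, offsets []int64, payloads []io.WriterTo) error {
        var g errgroup.Group
        g.SetLimit(runtime.GOMAXPROCS(0)) // bound concurrency, as WriteGGUF does
        for i, p := range payloads {
            p := p // shadow for pre-1.22 loop semantics, like t := t above
            w := io.NewOffsetWriter(f, base+offsets[i])
            g.Go(func() error {
                _, err := p.WriteTo(w)
                return err
            })
        }
        return g.Wait()
    }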
@@ -560,8 +589,10 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 
 	var err error
 	switch v := v.(type) {
-	case uint32:
+	case uint32, FileType:
 		err = writeGGUF(ws, ggufTypeUint32, v)
+	case uint64:
+		err = writeGGUF(ws, ggufTypeUint64, v)
 	case float32:
 		err = writeGGUF(ws, ggufTypeFloat32, v)
 	case bool:
@@ -570,32 +601,20 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 		err = writeGGUFString(ws, v)
 	case []int32:
 		err = writeGGUFArray(ws, ggufTypeInt32, v)
+	case *array[int32]:
+		err = writeGGUFArray(ws, ggufTypeInt32, v.values)
 	case []uint32:
 		err = writeGGUFArray(ws, ggufTypeUint32, v)
+	case *array[uint32]:
+		err = writeGGUFArray(ws, ggufTypeUint32, v.values)
 	case []float32:
 		err = writeGGUFArray(ws, ggufTypeFloat32, v)
+	case *array[float32]:
+		err = writeGGUFArray(ws, ggufTypeFloat32, v.values)
 	case []string:
-		if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
-			return err
-		}
-
-		if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
-			return err
-		}
-
-		if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-
-		for _, e := range v {
-			if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
-				return err
-			}
-
-			if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
-				return err
-			}
-		}
+		err = writeGGUFArray(ws, ggufTypeString, v)
+	case *array[string]:
+		err = writeGGUFArray(ws, ggufTypeString, v.values)
 	default:
 		return fmt.Errorf("improper type for '%s'", k)
 	}
@@ -603,7 +622,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 	return err
 }
 
-func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
+func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error {
 	slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
 	if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
 		return err
@@ -630,20 +649,6 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
 	return binary.Write(ws, binary.LittleEndian, t.Offset)
 }
 
-func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
-	offset, err := ws.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
-		return err
-	}
-
-	_, err = t.WriteTo(ws)
-	return err
-}
-
 func ggufPadding(offset, align int64) int64 {
 	return (align - offset%align) % align
 }
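Note: ggufPadding returns the number of padding bytes to the next aligned boundary, not the aligned offset itself, and it is zero when the offset is already aligned. With align = 32: offset 0 -> 0, offset 1 -> 31, offset 32 -> 0, offset 40 -> 24. A table-driven spot check of that arithmetic:

    package sketch

    import "testing"

    func ggufPadding(offset, align int64) int64 {
        return (align - offset%align) % align
    }

    func TestGGUFPadding(t *testing.T) {
        for _, tc := range []struct{ offset, align, want int64 }{
            {0, 32, 0},   // already aligned
            {1, 32, 31},  // pad up to 32
            {32, 32, 0},  // exactly on a boundary
            {40, 32, 24}, // pad up to 64
        } {
            if got := ggufPadding(tc.offset, tc.align); got != tc.want {
                t.Errorf("ggufPadding(%d, %d) = %d, want %d", tc.offset, tc.align, got, tc.want)
            }
        }
    }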
@@ -18,7 +18,7 @@ func TestWriteGGUF(t *testing.T) {
 
 	if err := WriteGGUF(w, KV{
 		"general.alignment": uint32(16),
-	}, []Tensor{
+	}, []*Tensor{
 		{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
 		{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
 		{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
fs/ggml/type.go (424 changed lines)

@@ -1,185 +1,341 @@
 package ggml
 
-import "fmt"
+import (
+	"fmt"
+	"log/slog"
+	"strings"
 
-type fileType uint32
-
-const (
-	fileTypeF32 fileType = iota
-	fileTypeF16
-	fileTypeQ4_0
-	fileTypeQ4_1
-	fileTypeQ4_1_F16
-	fileTypeQ4_2 // unused
-	fileTypeQ4_3 // unused
-	fileTypeQ8_0
-	fileTypeQ5_0
-	fileTypeQ5_1
-	fileTypeQ2_K
-	fileTypeQ3_K_S
-	fileTypeQ3_K_M
-	fileTypeQ3_K_L
-	fileTypeQ4_K_S
-	fileTypeQ4_K_M
-	fileTypeQ5_K_S
-	fileTypeQ5_K_M
-	fileTypeQ6_K
-	fileTypeIQ2_XXS
-	fileTypeIQ2_XS
-	fileTypeQ2_K_S
-	fileTypeIQ3_XS
-	fileTypeIQ3_XXS
-	fileTypeIQ1_S
-	fileTypeIQ4_NL
-	fileTypeIQ3_S
-	fileTypeIQ3_M
-	fileTypeIQ2_S
-	fileTypeIQ2_M
-	fileTypeIQ4_XS
-	fileTypeIQ1_M
-	fileTypeBF16
-
-	fileTypeUnknown
 )
 
-func ParseFileType(s string) (fileType, error) {
+// FileType is the Go equivalent to llama_ftype used for gguf file typing
+type FileType uint32
+
+const (
+	FileTypeF32 FileType = iota
+	FileTypeF16
+	FileTypeQ4_0
+	FileTypeQ4_1
+	fileTypeQ4_1_F16 // unused by GGML
+	fileTypeQ4_2     // unused by GGML
+	fileTypeQ4_3     // unused by GGML
+	FileTypeQ8_0
+	FileTypeQ5_0
+	FileTypeQ5_1
+	FileTypeQ2_K
+	FileTypeQ3_K_S
+	FileTypeQ3_K_M
+	FileTypeQ3_K_L
+	FileTypeQ4_K_S
+	FileTypeQ4_K_M
+	FileTypeQ5_K_S
+	FileTypeQ5_K_M
+	FileTypeQ6_K
+	fileTypeIQ2_XXS // not supported by ollama
+	fileTypeIQ2_XS  // not supported by ollama
+	FileTypeQ2_K_S
+	fileTypeIQ3_XS  // not supported by ollama
+	fileTypeIQ3_XXS // not supported by ollama
+	fileTypeIQ1_S   // not supported by ollama
+	fileTypeIQ4_NL  // not supported by ollama
+	fileTypeIQ3_S   // not supported by ollama
+	fileTypeIQ3_M   // not supported by ollama
+	fileTypeIQ2_S   // not supported by ollama
+	fileTypeIQ2_M   // not supported by ollama
+	fileTypeIQ4_XS  // not supported by ollama
+	fileTypeIQ1_M   // not supported by ollama
+	FileTypeBF16
+	fileTypeQ4_0_4_4 // unused by GGML
+	fileTypeQ4_0_4_8 // unused by GGML
+	fileTypeQ4_0_8_8 // unused by GGML
+	fileTypeTQ1_0    // not supported by ollama
+	fileTypeTQ2_0    // not supported by ollama
+
+	FileTypeUnknown = 1024
+)
+
+// ParseFileType parses the provided GGUF file type
+// Only Ollama supported types are considered valid
+func ParseFileType(s string) (FileType, error) {
 	switch s {
 	case "F32":
-		return fileTypeF32, nil
+		return FileTypeF32, nil
 	case "F16":
-		return fileTypeF16, nil
+		return FileTypeF16, nil
 	case "Q4_0":
-		return fileTypeQ4_0, nil
+		return FileTypeQ4_0, nil
 	case "Q4_1":
-		return fileTypeQ4_1, nil
-	case "Q4_1_F16":
-		return fileTypeQ4_1_F16, nil
+		return FileTypeQ4_1, nil
 	case "Q8_0":
-		return fileTypeQ8_0, nil
+		return FileTypeQ8_0, nil
 	case "Q5_0":
-		return fileTypeQ5_0, nil
+		return FileTypeQ5_0, nil
 	case "Q5_1":
-		return fileTypeQ5_1, nil
+		return FileTypeQ5_1, nil
 	case "Q2_K":
-		return fileTypeQ2_K, nil
+		return FileTypeQ2_K, nil
 	case "Q3_K_S":
-		return fileTypeQ3_K_S, nil
+		return FileTypeQ3_K_S, nil
 	case "Q3_K_M":
-		return fileTypeQ3_K_M, nil
+		return FileTypeQ3_K_M, nil
 	case "Q3_K_L":
-		return fileTypeQ3_K_L, nil
+		return FileTypeQ3_K_L, nil
 	case "Q4_K_S":
-		return fileTypeQ4_K_S, nil
-	case "Q4_K_M":
-		return fileTypeQ4_K_M, nil
+		return FileTypeQ4_K_S, nil
+	case "Q4_K_M", "Q4_K":
+		return FileTypeQ4_K_M, nil
 	case "Q5_K_S":
-		return fileTypeQ5_K_S, nil
-	case "Q5_K_M":
-		return fileTypeQ5_K_M, nil
+		return FileTypeQ5_K_S, nil
+	case "Q5_K_M", "Q5_K":
+		return FileTypeQ5_K_M, nil
 	case "Q6_K":
-		return fileTypeQ6_K, nil
-	case "IQ2_XXS":
-		return fileTypeIQ2_XXS, nil
-	case "IQ2_XS":
-		return fileTypeIQ2_XS, nil
+		return FileTypeQ6_K, nil
 	case "Q2_K_S":
-		return fileTypeQ2_K_S, nil
-	case "IQ3_XS":
-		return fileTypeIQ3_XS, nil
-	case "IQ3_XXS":
-		return fileTypeIQ3_XXS, nil
-	case "IQ1_S":
-		return fileTypeIQ1_S, nil
-	case "IQ4_NL":
-		return fileTypeIQ4_NL, nil
-	case "IQ3_S":
-		return fileTypeIQ3_S, nil
-	case "IQ3_M":
-		return fileTypeIQ3_M, nil
-	case "IQ2_S":
-		return fileTypeIQ2_S, nil
-	case "IQ2_M":
-		return fileTypeIQ2_M, nil
-	case "IQ4_XS":
-		return fileTypeIQ4_XS, nil
-	case "IQ1_M":
-		return fileTypeIQ1_M, nil
+		return FileTypeQ2_K_S, nil
 	case "BF16":
-		return fileTypeBF16, nil
+		return FileTypeBF16, nil
 	default:
-		return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
+		supportedFileTypes := []FileType{
+			FileTypeF32,
+			FileTypeF16,
+			FileTypeQ4_K_S,
+			FileTypeQ4_K_M,
+			FileTypeQ8_0,
+			// fsggml.FileTypeBF16, // TODO
+		}
+		strs := make([]string, len(supportedFileTypes))
+		for i := range supportedFileTypes {
+			strs[i] = supportedFileTypes[i].String()
+		}
+
+		return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", "))
 	}
 }
 
-func (t fileType) String() string {
+func (t FileType) String() string {
 	switch t {
-	case fileTypeF32:
+	case FileTypeF32:
 		return "F32"
-	case fileTypeF16:
+	case FileTypeF16:
 		return "F16"
-	case fileTypeQ4_0:
+	case FileTypeQ4_0:
 		return "Q4_0"
-	case fileTypeQ4_1:
+	case FileTypeQ4_1:
 		return "Q4_1"
-	case fileTypeQ4_1_F16:
-		return "Q4_1_F16"
-	case fileTypeQ8_0:
+	case FileTypeQ8_0:
 		return "Q8_0"
-	case fileTypeQ5_0:
+	case FileTypeQ5_0:
 		return "Q5_0"
-	case fileTypeQ5_1:
+	case FileTypeQ5_1:
 		return "Q5_1"
-	case fileTypeQ2_K:
+	case FileTypeQ2_K:
 		return "Q2_K"
-	case fileTypeQ3_K_S:
+	case FileTypeQ3_K_S:
 		return "Q3_K_S"
-	case fileTypeQ3_K_M:
+	case FileTypeQ3_K_M:
 		return "Q3_K_M"
-	case fileTypeQ3_K_L:
+	case FileTypeQ3_K_L:
 		return "Q3_K_L"
-	case fileTypeQ4_K_S:
+	case FileTypeQ4_K_S:
 		return "Q4_K_S"
-	case fileTypeQ4_K_M:
+	case FileTypeQ4_K_M:
 		return "Q4_K_M"
-	case fileTypeQ5_K_S:
+	case FileTypeQ5_K_S:
 		return "Q5_K_S"
-	case fileTypeQ5_K_M:
+	case FileTypeQ5_K_M:
 		return "Q5_K_M"
-	case fileTypeQ6_K:
+	case FileTypeQ6_K:
 		return "Q6_K"
-	case fileTypeIQ2_XXS:
-		return "IQ2_XXS"
-	case fileTypeIQ2_XS:
-		return "IQ2_XS"
-	case fileTypeQ2_K_S:
+	case FileTypeQ2_K_S:
 		return "Q2_K_S"
-	case fileTypeIQ3_XS:
-		return "IQ3_XS"
-	case fileTypeIQ3_XXS:
-		return "IQ3_XXS"
-	case fileTypeIQ1_S:
-		return "IQ1_S"
-	case fileTypeIQ4_NL:
-		return "IQ4_NL"
-	case fileTypeIQ3_S:
-		return "IQ3_S"
-	case fileTypeIQ3_M:
-		return "IQ3_M"
-	case fileTypeIQ2_S:
-		return "IQ2_S"
-	case fileTypeIQ4_XS:
-		return "IQ4_XS"
-	case fileTypeIQ2_M:
-		return "IQ2_M"
-	case fileTypeIQ1_M:
-		return "IQ1_M"
-	case fileTypeBF16:
+	case FileTypeBF16:
 		return "BF16"
 	default:
 		return "unknown"
 	}
 }
 
-func (t fileType) Value() uint32 {
+func (t FileType) Value() uint32 {
 	return uint32(t)
 }
+
+func (ftype FileType) ToTensorType() TensorType {
+	switch ftype {
+	case FileTypeF32:
+		return TensorTypeF32
+	case FileTypeF16:
+		return TensorTypeF16
+	case FileTypeQ4_0:
+		return TensorTypeQ4_0
+	case FileTypeQ4_1:
+		return TensorTypeQ4_1
+	case FileTypeQ8_0:
+		return TensorTypeQ8_0
+	case FileTypeQ5_0:
+		return TensorTypeQ5_0
+	case FileTypeQ5_1:
+		return TensorTypeQ5_1
+	case FileTypeQ2_K:
+		return TensorTypeQ2_K
+	case FileTypeQ3_K_S:
+		return TensorTypeQ3_K
+	case FileTypeQ3_K_M:
+		return TensorTypeQ3_K
+	case FileTypeQ3_K_L:
+		return TensorTypeQ3_K
+	case FileTypeQ4_K_S:
+		return TensorTypeQ4_K
+	case FileTypeQ4_K_M:
+		return TensorTypeQ4_K
+	case FileTypeQ5_K_S:
+		return TensorTypeQ5_K
+	case FileTypeQ5_K_M:
+		return TensorTypeQ5_K
+	case FileTypeQ6_K:
+		return TensorTypeQ6_K
+	case FileTypeQ2_K_S:
+		return TensorTypeQ2_K
+	case FileTypeBF16:
+		return TensorTypeBF16
+	default:
+		slog.Warn("unsupported file type", "type", ftype)
+		return 0 // F32
+	}
+}
+
+// TensorType is equivalent to ggml_type for individual tensor types
+// Note: these are not the same as FileType
+type TensorType uint32
+
+const (
+	TensorTypeF32 TensorType = iota
+	TensorTypeF16
+	TensorTypeQ4_0
+	TensorTypeQ4_1
+	tensorTypeQ4_2 // unused by GGML
+	tensorTypeQ4_3 // unused by GGML
+	TensorTypeQ5_0
+	TensorTypeQ5_1
+	TensorTypeQ8_0
+	TensorTypeQ8_1
+	TensorTypeQ2_K
+	TensorTypeQ3_K
+	TensorTypeQ4_K
+	TensorTypeQ5_K
+	TensorTypeQ6_K
+	TensorTypeQ8_K
+	tensorTypeIQ2_XXS // not supported by ollama
+	tensorTypeIQ2_XS  // not supported by ollama
+	tensorTypeIQ3_XXS // not supported by ollama
+	tensorTypeIQ1_S   // not supported by ollama
+	tensorTypeIQ4_NL  // not supported by ollama
+	tensorTypeIQ3_S   // not supported by ollama
+	tensorTypeIQ2_S   // not supported by ollama
+	tensorTypeIQ4_XS  // not supported by ollama
+	TensorTypeI8
+	TensorTypeI16
+	TensorTypeI32
+	TensorTypeI64
+	TensorTypeF64
+	tensorTypeIQ1_M // not supported by ollama
+	TensorTypeBF16
+	tensorTypeQ4_0_4_4 // unused by GGML
+	tensorTypeQ4_0_4_8 // unused by GGML
+	tensorTypeQ4_0_8_8 // unused by GGML
+	tensorTypeTQ1_0    // not supported by ollama
+	tensorTypeTQ2_0    // not supported by ollama
+	tensorTypeIQ4_NL_4_4 // unused by GGML
+	tensorTypeIQ4_NL_4_8 // unused by GGML
+	tensorTypeIQ4_NL_8_8 // unused by GGML
+)
+
+// ParseTensorType parses the provided GGUF tensor type
+// Only Ollama supported types are considered valid
+func ParseTensorType(s string) (TensorType, error) {
+	switch s {
+	case "F32":
+		return TensorTypeF32, nil
+	case "F16":
+		return TensorTypeF16, nil
+	case "Q4_0":
+		return TensorTypeQ4_0, nil
+	case "Q4_1":
+		return TensorTypeQ4_1, nil
+	case "Q5_0":
+		return TensorTypeQ5_0, nil
+	case "Q5_1":
+		return TensorTypeQ5_1, nil
+	case "Q8_0":
+		return TensorTypeQ8_0, nil
+	case "Q8_1":
+		return TensorTypeQ8_1, nil
+	case "Q2_K":
+		return TensorTypeQ2_K, nil
+	case "Q3_K":
+		return TensorTypeQ3_K, nil
+	case "Q4_K":
+		return TensorTypeQ4_K, nil
+	case "Q5_K":
+		return TensorTypeQ5_K, nil
+	case "Q6_K":
+		return TensorTypeQ6_K, nil
+	case "Q8_K":
+		return TensorTypeQ8_K, nil
+	case "F64":
+		return TensorTypeF64, nil
+	case "BF16":
+		return TensorTypeBF16, nil
+	default:
+		return 0, fmt.Errorf("unsupported quantization type %s", s)
+	}
+}
+
+func (t TensorType) IsQuantized() bool {
+	switch t {
+	case TensorTypeF32, TensorTypeF16, TensorTypeBF16:
+		return false
+	default:
+		return true
+	}
+}
+
+func (t TensorType) RowSize(ne uint64) uint64 {
+	return t.TypeSize() * ne / t.BlockSize()
+}
+
+func (t TensorType) String() string {
+	switch t {
+	case TensorTypeF32:
+		return "F32"
+	case TensorTypeF16:
+		return "F16"
+	case TensorTypeQ4_0:
+		return "Q4_0"
+	case TensorTypeQ4_1:
+		return "Q4_1"
+	case TensorTypeQ5_0:
+		return "Q5_0"
+	case TensorTypeQ5_1:
+		return "Q5_1"
+	case TensorTypeQ8_0:
+		return "Q8_0"
+	case TensorTypeQ8_1:
+		return "Q8_1"
+	case TensorTypeQ2_K:
+		return "Q2_K"
+	case TensorTypeQ3_K:
+		return "Q3_K"
+	case TensorTypeQ4_K:
+		return "Q4_K"
+	case TensorTypeQ5_K:
+		return "Q5_K"
+	case TensorTypeQ6_K:
+		return "Q6_K"
+	case TensorTypeQ8_K:
+		return "Q8_K"
+	case TensorTypeF64:
+		return "F64"
+	case TensorTypeBF16:
+		return "BF16"
+	default:
+		return "unknown"
+	}
+}
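Note: the split between FileType (llama_ftype, a whole-file label) and TensorType (ggml_type, a per-tensor storage format) is what lets the Go quantizer map a user-requested file type onto per-tensor targets; the K-quant file types Q4_K_S and Q4_K_M, for example, both store Q4_K tensors. A small usage sketch of the API added above (assuming the module path github.com/ollama/ollama):

    package main

    import (
        "fmt"

        "github.com/ollama/ollama/fs/ggml"
    )

    func main() {
        // ParseFileType rejects quantization types ollama does not support.
        ft, err := ggml.ParseFileType("Q4_K_M")
        if err != nil {
            panic(err)
        }
        // ToTensorType picks the storage type quantized weights will use.
        fmt.Println(ft.ToTensorType()) // Q4_K
    }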
@@ -48,17 +48,6 @@ var (
 	}
 )
 
-func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
-	deadline, hasDeadline := t.Deadline()
-	if !hasDeadline {
-		return 8 * time.Minute, 10 * time.Minute
-	} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
-		t.Skip("too little time")
-		return time.Duration(0), time.Duration(0)
-	}
-	return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
-}
-
 func TestModelsGenerate(t *testing.T) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
integration/quantization_test.go (new file, 130 lines)

@@ -0,0 +1,130 @@
+//go:build integration && models
+
+package integration
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"log/slog"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+)
+
+func TestQuantization(t *testing.T) {
+	sourceModels := []string{
+		"qwen2.5:0.5b-instruct-fp16",
+	}
+	quantizations := []string{
+		"Q8_0",
+		"Q4_K_S",
+		"Q4_K_M",
+		"Q4_K",
+	}
+	softTimeout, hardTimeout := getTimeouts(t)
+	started := time.Now()
+	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, base := range sourceModels {
+		if err := PullIfMissing(ctx, client, base); err != nil {
+			t.Fatalf("pull failed %s", err)
+		}
+		for _, quant := range quantizations {
+			newName := fmt.Sprintf("%s__%s", base, quant)
+			t.Run(newName, func(t *testing.T) {
+				if time.Now().Sub(started) > softTimeout {
+					t.Skip("skipping remaining tests to avoid excessive runtime")
+				}
+				req := &api.CreateRequest{
+					Model:        newName,
+					Quantization: quant,
+					From:         base,
+				}
+				fn := func(resp api.ProgressResponse) error {
+					// fmt.Print(".")
+					return nil
+				}
+				t.Logf("quantizing: %s -> %s", base, quant)
+				if err := client.Create(ctx, req, fn); err != nil {
+					t.Fatalf("create failed %s", err)
+				}
+				defer func() {
+					req := &api.DeleteRequest{
+						Model: newName,
+					}
+					t.Logf("deleting: %s -> %s", base, quant)
+					if err := client.Delete(ctx, req); err != nil {
+						t.Logf("failed to clean up %s: %s", req.Model, err)
+					}
+				}()
+				// Check metadata on the model
+				resp, err := client.Show(ctx, &api.ShowRequest{Name: newName})
+				if err != nil {
+					t.Fatalf("unable to show model: %s", err)
+				}
+				if !strings.Contains(resp.Details.QuantizationLevel, quant) {
+					t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel)
+				}
+
+				stream := true
+				genReq := api.GenerateRequest{
+					Model:     newName,
+					Prompt:    "why is the sky blue?",
+					KeepAlive: &api.Duration{Duration: 3 * time.Second},
+					Options: map[string]any{
+						"seed":        42,
+						"temperature": 0.0,
+					},
+					Stream: &stream,
+				}
+				t.Logf("verifying: %s -> %s", base, quant)
+
+				// Some smaller quantizations can cause models to have poor quality
+				// or get stuck in repetition loops, so we stop as soon as we have any matches
+				anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
+				reqCtx, reqCancel := context.WithCancel(ctx)
+				atLeastOne := false
+				var buf bytes.Buffer
+				genfn := func(response api.GenerateResponse) error {
+					buf.Write([]byte(response.Response))
+					fullResp := strings.ToLower(buf.String())
+					for _, resp := range anyResp {
+						if strings.Contains(fullResp, resp) {
+							atLeastOne = true
+							t.Log(fullResp)
+							reqCancel()
+							break
+						}
+					}
+					return nil
+				}
+
+				done := make(chan int)
+				var genErr error
+				go func() {
+					genErr = client.Generate(reqCtx, &genReq, genfn)
+					done <- 0
+				}()
+
+				select {
+				case <-done:
+					if genErr != nil && !atLeastOne {
+						t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
+					}
+				case <-ctx.Done():
+					t.Error("outer test context done while waiting for generate")
+				}
+
+				t.Logf("passed")
+			})
+		}
+	}
+}
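Note: the new test is gated behind both tags in its //go:build line, so it is excluded from ordinary test runs; under the usual Go conventions it would be invoked with something like go test -tags "integration models" ./integration -run TestQuantization, though the exact invocation depends on the repository's integration-test setup.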
@@ -359,3 +359,14 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
 		}
 	}
 }
+
+func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
+	deadline, hasDeadline := t.Deadline()
+	if !hasDeadline {
+		return 8 * time.Minute, 10 * time.Minute
+	} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
+		t.Skip("too little time")
+		return time.Duration(0), time.Duration(0)
+	}
+	return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
+}
17
llama/llama.cpp/src/llama-arch.cpp
vendored
17
llama/llama.cpp/src/llama-arch.cpp
vendored
@ -74,7 +74,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||||
{ LLM_ARCH_PLM, "plm" },
|
{ LLM_ARCH_PLM, "plm" },
|
||||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
|
||||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1607,22 +1606,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
|
||||||
LLM_ARCH_MISTRAL3,
|
|
||||||
{
|
|
||||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
||||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
||||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
||||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
||||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
||||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
||||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
||||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
||||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
||||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
{
|
{
|
||||||
|
1
llama/llama.cpp/src/llama-arch.h
vendored
1
llama/llama.cpp/src/llama-arch.h
vendored
@ -76,7 +76,6 @@ enum llm_arch {
|
|||||||
LLM_ARCH_CHAMELEON,
|
LLM_ARCH_CHAMELEON,
|
||||||
LLM_ARCH_SOLAR,
|
LLM_ARCH_SOLAR,
|
||||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||||
LLM_ARCH_MISTRAL3,
|
|
||||||
LLM_ARCH_PLM,
|
LLM_ARCH_PLM,
|
||||||
LLM_ARCH_BAILINGMOE,
|
LLM_ARCH_BAILINGMOE,
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
|
2
llama/llama.cpp/src/llama-model.cpp
vendored
2
llama/llama.cpp/src/llama-model.cpp
vendored
@ -1437,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture");
}
@ -13752,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
4 llama/llama.cpp/src/llama-quant.cpp vendored
@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

// don't quantize vision stuff
quantize &= name.find("v.") == std::string::npos;
quantize &= name.find("mm.") == std::string::npos;

// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
@ -460,24 +460,6 @@ func (m *Model) NEmbd() int {
return int(C.llama_model_n_embd(m.c))
}

func Quantize(infile, outfile string, ftype uint32) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))

coutfile := C.CString(outfile)
defer C.free(unsafe.Pointer(coutfile))

params := C.llama_model_quantize_default_params()
params.nthread = -1
params.ftype = ftype

if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return fmt.Errorf("llama_model_quantize: %d", rc)
}

return nil
}

// vision processing
type ClipContext struct {
c *C.struct_clip_ctx
@ -1,96 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:39:32 -0700
Subject: [PATCH] add model quantizations

a temporary patch to add model quantization for
models not supported in llama.cpp
---
src/llama-arch.cpp | 17 +++++++++++++++++
src/llama-arch.h | 1 +
src/llama-model.cpp | 2 ++
src/llama-quant.cpp | 4 ++++
4 files changed, 24 insertions(+)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index eb7b5325..df42d1a5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
+ { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -1606,6 +1607,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
+ {
+ LLM_ARCH_MISTRAL3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ }
+ },
{
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index bc8a4f0b..bda9d071 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -76,6 +76,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9d099f11..ef70486d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture");
}

@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
+ case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 223e1f3f..8ae6dde8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

+ // don't quantize vision stuff
+ quantize &= name.find("v.") == std::string::npos;
+ quantize &= name.find("mm.") == std::string::npos;
+
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close()
inputLayerCount := 5

tensors := []ggml.Tensor{
tensors := []*ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
@ -312,6 +312,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range meta.Tensors().Items() {
t := t
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
for i := range tts {
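The added t := t line guards the classic pre-Go-1.22 loop-variable capture: every closure passed to g.Go would otherwise observe the same iteration variable. A minimal illustration of the pitfall, independent of this codebase:

package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	for _, v := range []string{"a", "b", "c"} {
		v := v // shadow the loop variable; without this, pre-1.22 Go may print one value three times
		wg.Add(1)
		go func() {
			defer wg.Done()
			fmt.Println(v)
		}()
	}
	wg.Wait()
}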
83 ml/backend/ggml/quantization.go Normal file
@ -0,0 +1,83 @@
package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/src
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
// #include "ggml-quants.h"
import "C"

import (
"unsafe"

fsggml "github.com/ollama/ollama/fs/ggml"
)

// convertToF32 converts (dequantizes) the raw data to F32 so we can then quantize it
func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 {
f32s := make([]float32, nelements)
elems := C.int64_t(nelements)
switch dtype {
case C.GGML_TYPE_F16:
C.ggml_fp16_to_fp32_row((*C.uint16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q4_0:
C.dequantize_row_q4_0((*C.block_q4_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q4_1:
C.dequantize_row_q4_1((*C.block_q4_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q5_0:
C.dequantize_row_q5_0((*C.block_q5_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q5_1:
C.dequantize_row_q5_1((*C.block_q5_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q8_0:
C.dequantize_row_q8_0((*C.block_q8_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q2_K:
C.dequantize_row_q2_K((*C.block_q2_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q3_K:
C.dequantize_row_q3_K((*C.block_q3_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q4_K:
C.dequantize_row_q4_K((*C.block_q4_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q5_K:
C.dequantize_row_q5_K((*C.block_q5_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_Q6_K:
C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_BF16:
C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
default:
panic("unsupported quantization format")
}
return f32s
}

func Quantize(newType fsggml.TensorType, f32s []float32, shape []uint64) []byte {
buf := make([]byte, len(f32s)*4) // upper bound on size
nPerRow := C.int64_t(shape[0])
nrows := C.int64_t(1)
if len(shape) > 1 {
nrows = C.int64_t(shape[1])
}
shape2 := C.int64_t(1)
if len(shape) > 2 {
shape2 = C.int64_t(shape[2])
}
nelements_matrix := nPerRow * nrows
newSize := C.size_t(0)
for i03 := C.int64_t(0); i03 < shape2; i03++ {
f32s_03 := i03 * nelements_matrix
buf_03 := C.int64_t(C.ggml_row_size(uint32(newType), nPerRow)) * i03 * nrows
newSize += C.ggml_quantize_chunk(
uint32(newType),
(*C.float)(&f32s[f32s_03]),
unsafe.Pointer((uintptr)(unsafe.Pointer(&buf[0]))+uintptr(buf_03)),
0,
nrows,
nPerRow,
nil)
}
return buf[:newSize]
}

func QuantizationVersion() uint32 {
return uint32(C.GGML_QNT_VERSION)
}
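Together these two helpers give Go code a full dequantize/requantize round trip over GGML's C kernels. A minimal usage sketch, assumed to be compiled inside this module so the cgo package builds; the values are arbitrary:

package main

import (
	"fmt"

	fsggml "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml/backend/ggml"
)

func main() {
	// One row of 256 values; Q8_0 blocks are 32 elements wide, so 256 divides evenly.
	f32s := make([]float32, 256)
	for i := range f32s {
		f32s[i] = float32(i)
	}
	// Quantize to Q8_0, then dequantize back to inspect the rounding loss.
	q := ggml.Quantize(fsggml.TensorTypeQ8_0, f32s, []uint64{256, 1})
	back := ggml.ConvertToF32(q, uint32(fsggml.TensorTypeQ8_0), 256)
	fmt.Println(len(q), back[0], back[255])
}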
@ -765,7 +765,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) {
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
}

func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) {
t.Helper()

f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf")
@ -15,6 +15,7 @@ import (
"path/filepath"
"slices"
"strings"
"sync/atomic"

"github.com/gin-gonic/gin"
@ -23,7 +24,6 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
@ -425,9 +425,14 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,

func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.ProgressResponse)) (*layerGGML, error) {
ft := layer.GGML.KV().FileType()
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)})
var doneBytes atomic.Uint64
totalBytes := uint64(layer.Size) - layer.GGML.Tensors().Offset
want, err := ggml.ParseFileType(quantizeType)
fnWrap := func(n uint64) {
done := doneBytes.Add(n)
progress := float32(done) / float32(totalBytes)
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
}
ftype, err := ggml.ParseFileType(quantizeType)
if err != nil {
return nil, err
}
@ -436,6 +441,11 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
if err != nil {
return nil, err
}
fp, err := os.Open(blob)
if err != nil {
return nil, err
}
defer fp.Close()

temp, err := os.CreateTemp(filepath.Dir(blob), quantizeType)
if err != nil {
@ -444,15 +454,15 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
defer temp.Close()
defer os.Remove(temp.Name())

if err := llama.Quantize(blob, temp.Name(), uint32(want)); err != nil {
if err := quantize(fp, temp, layer.GGML, ftype, fnWrap); err != nil {
return nil, err
}
temp.Seek(0, io.SeekStart)
fn(api.ProgressResponse{Status: "verifying conversion"})
newLayer, err := NewLayer(temp, layer.MediaType)
if err != nil {
return nil, err
}

if _, err := temp.Seek(0, io.SeekStart); err != nil {
return nil, err
}
@ -462,7 +472,6 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
return nil, err
}

return &layerGGML{newLayer, f}, nil
}
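The fnWrap callback above turns per-tensor byte counts into a monotonic completion fraction. The same accumulation pattern in isolation, with hypothetical sizes and no server types:

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var done atomic.Uint64
	total := uint64(1 << 30) // hypothetical total tensor payload
	progressFn := func(n uint64) {
		d := done.Add(n) // atomic, so concurrent tensor writers can report safely
		fmt.Printf("quantizing: %.0f%%\n", 100*float64(d)/float64(total))
	}
	progressFn(256 << 20) // prints 25%
	progressFn(256 << 20) // prints 50%
}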
@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
}
defer blob.Close()

f, _, err := ggml.Decode(blob, 1024)
f, _, err := ggml.Decode(blob, -1)
if err != nil {
return nil, err
}
274 server/quantization.go Normal file
@ -0,0 +1,274 @@
package server

import (
"fmt"
"io"
"log/slog"
"maps"
"os"
"strings"
"unsafe"

fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml/backend/ggml"
)

type quantizer struct {
*os.File
offset uint64
from, to *fsggml.Tensor
progressFn func(n uint64)
}

func (q quantizer) WriteTo(w io.Writer) (int64, error) {
quantize := q.from.Kind != q.to.Kind
sr := io.NewSectionReader(q, int64(q.offset), int64(q.from.Size()))
if !quantize {
n, err := io.Copy(w, sr)
q.progressFn(q.from.Size())
return n, err
}
data, err := io.ReadAll(sr)
if err != nil {
slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err)
return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err)
}
var f32s []float32
newType := fsggml.TensorType(q.to.Kind)
if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 {
f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), q.from.Elements())
} else {
f32s = ggml.ConvertToF32(data, q.from.Kind, q.from.Elements())
}
data = ggml.Quantize(newType, f32s, q.from.Shape)
n, err := w.Write(data)
q.progressFn(q.from.Size())
return int64(n), err
}

type quantizeState struct {
nAttnV int // Number of attn_*v* weight tensors
nFfnDown int // Number of ffn_down tensors
iAttnV int // Running counter of number of attn_v tensors that have been processed
iFfnDown int // Running counter of number of ffn_down tensors that have been processed
hasOutput bool // used to figure out if a model shares tok_embd with the output weight
}

func useMoreBits(iLayer, nLayers int) bool {
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}

func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
// Ported from llama_tensor_get_type, removed unsupported quantization types
nExperts := max(1, kv.Uint("expert_count", 0))
if name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight") {
nx := shape[0]
qk_k := newType.BlockSize()
if nx%qk_k != 0 {
newType = fsggml.TensorTypeQ8_0
} else if newType != fsggml.TensorTypeQ8_0 {
newType = fsggml.TensorTypeQ6_K
}
} else if strings.Contains(name, "attn_v.weight") {
if ftype == fsggml.FileTypeQ2_K {
if kv.GQA() >= 4 {
newType = fsggml.TensorTypeQ4_K
} else {
newType = fsggml.TensorTypeQ3_K
}
} else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 {
newType = fsggml.TensorTypeQ4_K
} else if ftype == fsggml.FileTypeQ3_K_M {
if qs.iAttnV < 2 {
newType = fsggml.TensorTypeQ5_K
} else {
newType = fsggml.TensorTypeQ4_K
}
} else if ftype == fsggml.FileTypeQ3_K_L {
newType = fsggml.TensorTypeQ5_K
} else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) &&
useMoreBits(qs.iAttnV, qs.nAttnV) {
newType = fsggml.TensorTypeQ6_K
} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
newType = fsggml.TensorTypeQ5_K
}

// TODO
// if (qs.model.type == LLM_TYPE_70B) {
// // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// // nearly negligible increase in model size by quantizing this tensor with more bits:
// if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K;
// }

if nExperts == 8 {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
newType = fsggml.TensorTypeQ8_0
}
qs.iAttnV++
} else if strings.Contains(name, "attn_k.weight") {
if nExperts == 8 {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
newType = fsggml.TensorTypeQ8_0
}
} else if strings.Contains(name, "ffn_down") {
iLayer := qs.iFfnDown
n_layer := qs.nFfnDown
if ftype == fsggml.FileTypeQ2_K {
newType = fsggml.TensorTypeQ3_K
} else if ftype == fsggml.FileTypeQ2_K_S {
if iLayer < n_layer/8 {
newType = fsggml.TensorTypeQ4_K
}
} else if ftype == fsggml.FileTypeQ3_K_M {
if iLayer < n_layer/16 {
newType = fsggml.TensorTypeQ5_K
} else if useMoreBits(iLayer, n_layer) {
newType = fsggml.TensorTypeQ4_K
} else {
newType = fsggml.TensorTypeQ3_K
}
} else if ftype == fsggml.FileTypeQ3_K_L {
newType = fsggml.TensorTypeQ5_K
} else if ftype == fsggml.FileTypeQ4_K_M {
if useMoreBits(iLayer, n_layer) {
newType = fsggml.TensorTypeQ6_K
}
} else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) {
newType = fsggml.TensorTypeQ6_K
} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
newType = fsggml.TensorTypeQ5_K
}
qs.iFfnDown++
} else if strings.Contains(name, "attn_output.weight") {
if nExperts == 8 {
if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M ||
ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
newType = fsggml.TensorTypeQ5_K
}
} else {
if ftype == fsggml.FileTypeQ2_K {
newType = fsggml.TensorTypeQ3_K
} else if ftype == fsggml.FileTypeQ3_K_M {
newType = fsggml.TensorTypeQ4_K
} else if ftype == fsggml.FileTypeQ3_K_L {
newType = fsggml.TensorTypeQ5_K
}
}
} else if strings.Contains(name, "attn_qkv.weight") {
if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L {
newType = fsggml.TensorTypeQ4_K
} else if ftype == fsggml.FileTypeQ4_K_M {
newType = fsggml.TensorTypeQ5_K
} else if ftype == fsggml.FileTypeQ5_K_M {
newType = fsggml.TensorTypeQ6_K
}
}

if newType.IsQuantized() {
nx := shape[0]
ny := uint64(1)
if len(shape) > 1 {
ny = shape[1]
}
qk_k := newType.BlockSize()
if nx%qk_k != 0 {
slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String()))
newType = fsggml.TensorTypeF16
}
}
return newType
}

func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
kv := maps.Clone(orig.KV())
kv["general.file_type"] = newFileType
// kv["general.quantization_version"] = ggml.QuantizationVersion()
qs := &quantizeState{}
// Build up the quantize state so newType can adjust types
layerCount := 0
for k, l := range orig.Tensors().GroupLayers() {
if strings.HasPrefix(k, "blk.") {
layerCount++
}
for _, tensor := range l {
if strings.Contains(tensor.Name, "attn_v.weight") ||
strings.Contains(tensor.Name, "attn_qkv.weight") ||
strings.Contains(tensor.Name, "attn_kv_b.weight") {
qs.nAttnV++
} else if tensor.Name == "output.weight" {
qs.hasOutput = true
}
}
}
qs.nFfnDown = layerCount

origTensors := orig.Tensors().Items()
outputTensors := make([]*fsggml.Tensor, len(origTensors))
for i, tensor := range origTensors {
tensor := tensor
newType := newType(tensor, kv, qs, newFileType)
newTensor := &fsggml.Tensor{
Name: tensor.Name,
Shape: tensor.Shape,
Kind: uint32(newType),
}
outputTensors[i] = newTensor
outputTensors[i].WriterTo = quantizer{
File: in,
offset: orig.Tensors().Offset + tensor.Offset,
from: tensor,
to: newTensor,
progressFn: progressFn,
}
}
return fsggml.WriteGGUF(out, kv, outputTensors)
}

func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType {
defaultType := ftype.ToTensorType()
name := t.Name
quantize := strings.HasSuffix(name, "weight")

// don't quantize vision stuff
quantize = quantize && (!strings.Contains(name, "v.") || strings.Contains(name, "_v."))
quantize = quantize && !strings.Contains(name, "mm.")

// quantize only 2D and 3D tensors (experts)
quantize = quantize && (len(t.Shape) >= 2)

// do not quantize norm tensors
quantize = quantize && !strings.Contains(name, "_norm.weight")

// do not quantize expert gating tensors
quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")

// do not quantize positional embeddings and token types (BERT)
quantize = quantize && (name != "position_embd.weight")
quantize = quantize && (name != "token_types.weight")

// do not quantize Mamba's small yet 2D weights
// NOTE: can't use LLM_TN here because the layer number is not known
quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight")

// do not quantize RWKV's time_mix_first tensors
quantize = quantize && !strings.Contains(name, "time_mix_first.weight")
quantize = quantize && !strings.Contains(name, "time_mix_w1.weight")
quantize = quantize && !strings.Contains(name, "time_mix_w2.weight")
quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight")
quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight")
quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight")

// do not quantize relative position bias (T5)
quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")

newType := fsggml.TensorType(t.Kind)
if quantize {
// get more optimal quantization type based on the tensor shape, layer, etc.
newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
if newType != defaultType {
slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType)
}
}
return newType
}
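An end-to-end sketch of driving this file's quantize entry point, mirroring what quantizeLayer does with blobs. quantizeFile is a hypothetical wrapper that would live alongside quantize in package server, since quantize is unexported:

func quantizeFile(inPath, outPath, fileType string) error {
	in, err := os.Open(inPath) // hypothetical source GGUF path
	if err != nil {
		return err
	}
	defer in.Close()
	// -1 is assumed to mean "do not cap metadata array decoding",
	// matching the Decode(blob, -1) change above.
	meta, _, err := fsggml.Decode(in, -1)
	if err != nil {
		return err
	}
	ftype, err := fsggml.ParseFileType(fileType) // e.g. "Q4_K_M"
	if err != nil {
		return err
	}
	out, err := os.Create(outPath) // hypothetical destination path
	if err != nil {
		return err
	}
	defer out.Close()
	// No progress reporting in this sketch; pass a no-op callback.
	return quantize(in, out, meta, ftype, func(n uint64) {})
}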
882 server/quantization_test.go Normal file
@ -0,0 +1,882 @@
|
|||||||
|
package server
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
fsggml "github.com/ollama/ollama/fs/ggml"
|
||||||
|
"github.com/ollama/ollama/ml/backend/ggml"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGetTensorNewType(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
kv map[string]any
|
||||||
|
qs quantizeState
|
||||||
|
newType fsggml.TensorType
|
||||||
|
tensor_name string
|
||||||
|
shape []uint64
|
||||||
|
ftype fsggml.FileType
|
||||||
|
expected fsggml.TensorType
|
||||||
|
expectedPanic string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "output_unsupported",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "output.weight",
|
||||||
|
shape: []uint64{100, 100},
|
||||||
|
ftype: fsggml.FileTypeF32,
|
||||||
|
expected: fsggml.TensorTypeF16,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "output_Q8",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "output.weight",
|
||||||
|
shape: []uint64{1024, 1024},
|
||||||
|
ftype: fsggml.FileTypeF32,
|
||||||
|
expected: fsggml.TensorTypeQ6_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q4_k",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.attention.head_count": uint32(4),
|
||||||
|
"foo.attention.head_count_kv": uint32(1),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ3_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q2_k_s_q4_k",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.attention.head_count": uint32(4),
|
||||||
|
"foo.attention.head_count_kv": uint32(1),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K_S,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_m",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_m_i",
|
||||||
|
qs: quantizeState{
|
||||||
|
iAttnV: 2,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q3_k_l",
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_L,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q4_k_m",
|
||||||
|
qs: quantizeState{
|
||||||
|
iAttnV: 2,
|
||||||
|
nAttnV: 3 * 8,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ4_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ6_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_q4_k_s",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ4_K_S,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_v.weight_8_expert",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.expert_count": uint32(8),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_v.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeF32,
|
||||||
|
expected: fsggml.TensorTypeQ8_0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_k.weight_8_expert",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.expert_count": uint32(8),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_k.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeF32,
|
||||||
|
expected: fsggml.TensorTypeQ8_0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q2_k",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ3_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q2_k_s",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K_S,
|
||||||
|
expected: fsggml.TensorTypeQ4_0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q2_k_s_layers",
|
||||||
|
qs: quantizeState{
|
||||||
|
iFfnDown: 2,
|
||||||
|
nFfnDown: 3 * 8,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K_S,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q3_k_m_base",
|
||||||
|
qs: quantizeState{
|
||||||
|
iFfnDown: 1,
|
||||||
|
nFfnDown: 8,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ3_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q3_k_m_16",
|
||||||
|
qs: quantizeState{
|
||||||
|
iFfnDown: 2,
|
||||||
|
nFfnDown: 3 * 16,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q3_k_m_8",
|
||||||
|
qs: quantizeState{
|
||||||
|
iFfnDown: 2,
|
||||||
|
nFfnDown: 3 * 8,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q3_k_l",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_L,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q4_k_m",
|
||||||
|
qs: quantizeState{
|
||||||
|
iFfnDown: 1,
|
||||||
|
nFfnDown: 8,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ4_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ4_0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q4_k_m_6",
|
||||||
|
qs: quantizeState{
|
||||||
|
iFfnDown: 2,
|
||||||
|
nFfnDown: 3 * 8,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ4_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ6_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q5_k_m",
|
||||||
|
qs: quantizeState{
|
||||||
|
iFfnDown: 2,
|
||||||
|
nFfnDown: 3 * 8,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ5_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ6_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ffn_down_q4_k_s",
|
||||||
|
qs: quantizeState{
|
||||||
|
iFfnDown: 2,
|
||||||
|
nFfnDown: 3 * 8,
|
||||||
|
},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "ffn_down",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ4_K_S,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_output.weight_8_expert",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
"foo.expert_count": uint32(8),
|
||||||
|
},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_output.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_output.weight_q2",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_output.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ2_K,
|
||||||
|
expected: fsggml.TensorTypeQ3_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_output.weight_q3_k_m",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_output.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_output.weight_q3_k_l",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_output.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_L,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_qkv.weight_q3_k_m",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_qkv.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ3_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ4_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_qkv.weight_q4_k_m",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_qkv.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ4_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ5_K,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "attn_qkv.weight_q5_k_m",
|
||||||
|
qs: quantizeState{},
|
||||||
|
kv: map[string]any{},
|
||||||
|
newType: fsggml.TensorTypeQ4_0,
|
||||||
|
tensor_name: "blk.0.attn_qkv.weight",
|
||||||
|
shape: []uint64{256},
|
||||||
|
ftype: fsggml.FileTypeQ5_K_M,
|
||||||
|
expected: fsggml.TensorTypeQ6_K,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for _, tt := range cases {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if tt.expectedPanic != "" {
|
||||||
|
defer func() {
|
||||||
|
e := recover()
|
||||||
|
if !strings.Contains(fmt.Sprintf("%v", e), tt.expectedPanic) {
|
||||||
|
t.Fatalf("incorrect panic\ngot: %v\nexpected: %s", e, tt.expectedPanic)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
} else {
|
||||||
|
defer func() {
|
||||||
|
e := recover()
|
||||||
|
if e != nil {
|
||||||
|
t.Fatalf("hit unexpected panic %v", e)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
ret := getTensorNewType(tt.kv, &tt.qs, tt.newType, tt.tensor_name, tt.shape, tt.ftype)
|
||||||
|
if ret != tt.expected {
|
||||||
|
t.Fatalf("incorrect type returned\ngot: %d\nexpected: %d", ret, tt.expected)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQuantizeModel(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
kv map[string]any
|
||||||
|
tensors []*fsggml.Tensor
|
||||||
|
newType string
|
||||||
|
expectedTensorTypes map[string]fsggml.TensorType
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "f16_q4_k",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
},
|
||||||
|
tensors: []*fsggml.Tensor{
|
||||||
|
{
|
||||||
|
Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16),
|
||||||
|
Offset: uint64(0), Shape: []uint64{512, 2},
|
||||||
|
WriterTo: bytes.NewReader(
|
||||||
|
append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...),
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16),
|
||||||
|
Offset: uint64(0), Shape: []uint64{256, 4},
|
||||||
|
WriterTo: bytes.NewReader(
|
||||||
|
append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...),
|
||||||
|
),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
newType: "Q4_K",
|
||||||
|
expectedTensorTypes: map[string]fsggml.TensorType{
|
||||||
|
"blk.0.attn.weight": fsggml.TensorTypeQ4_K,
|
||||||
|
"output.weight": fsggml.TensorTypeQ6_K,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "f32_q4_k",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
},
|
||||||
|
tensors: []*fsggml.Tensor{
|
||||||
|
{
|
||||||
|
Name: "blk.0.attn_v.weight", Kind: uint32(fsggml.TensorTypeF32),
|
||||||
|
Offset: uint64(0), Shape: []uint64{512, 2},
|
||||||
|
WriterTo: bytes.NewReader(
|
||||||
|
append(append(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...),
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "output.weight", Kind: uint32(fsggml.TensorTypeF32),
|
||||||
|
Offset: uint64(0), Shape: []uint64{512},
|
||||||
|
WriterTo: bytes.NewReader(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...)),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
newType: "Q4_K",
|
||||||
|
expectedTensorTypes: map[string]fsggml.TensorType{
|
||||||
|
"blk.0.attn_v.weight": fsggml.TensorTypeQ6_K,
|
||||||
|
"output.weight": fsggml.TensorTypeF32,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "f16_q8_0",
|
||||||
|
kv: map[string]any{
|
||||||
|
"general.architecture": "foo",
|
||||||
|
},
|
||||||
|
tensors: []*fsggml.Tensor{
|
||||||
|
{
|
||||||
|
Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16),
|
||||||
|
Offset: uint64(0), Shape: []uint64{32, 16, 2},
|
||||||
|
WriterTo: bytes.NewReader(
|
||||||
|
append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...),
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16),
|
||||||
|
Offset: uint64(0), Shape: []uint64{256, 4},
|
||||||
|
WriterTo: bytes.NewReader(
|
||||||
|
append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...),
|
||||||
|
),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
newType: "Q8_0",
|
||||||
|
expectedTensorTypes: map[string]fsggml.TensorType{
|
||||||
|
"blk.0.attn.weight": fsggml.TensorTypeQ8_0,
|
||||||
|
"output.weight": fsggml.TensorTypeQ8_0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range cases {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
f, err := os.CreateTemp(t.TempDir(), tt.name)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err.Error())
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
err = fsggml.WriteGGUF(f, tt.kv, tt.tensors)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to create initial model: %s", err)
|
||||||
|
}
|
||||||
|
fp, err := os.Open(f.Name())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err.Error())
|
||||||
|
}
|
||||||
|
defer fp.Close()
|
||||||
|
meta, _, err := fsggml.Decode(fp, -1)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err.Error())
|
||||||
|
}
|
||||||
|
progressCalled := false
|
||||||
|
progress := func(n uint64) {
|
||||||
|
// fmt.Fprintf(os.Stderr, "progress: %f\n", p)
|
||||||
|
progressCalled = true
|
||||||
|
}
|
||||||
|
tmp, err := os.CreateTemp(t.TempDir(), tt.name+".out")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err.Error())
|
||||||
|
}
|
||||||
|
defer tmp.Close()
|
||||||
|
ftype, err := fsggml.ParseFileType(tt.newType)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
err = quantize(fp, tmp, meta, ftype, progress)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("error during quantize: %s", err)
|
||||||
|
}
|
||||||
|
if !progressCalled {
|
||||||
|
t.Fatalf("progress was not reported")
|
||||||
|
}
|
||||||
|
// Now attempt to load it back and make sure types match expected
|
||||||
|
fpNew, err := os.Open(tmp.Name())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
||||||
|
}
|
||||||
|
defer fpNew.Close()
|
||||||
|
newMeta, _, err := fsggml.Decode(fpNew, -1)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
|
||||||
|
}
|
||||||
|
tensors := newMeta.Tensors()
|
||||||
|
for _, l := range tensors.GroupLayers() {
|
||||||
|
for _, tensor := range l {
|
||||||
|
if fsggml.TensorType(tensor.Kind) != tt.expectedTensorTypes[tensor.Name] {
|
||||||
|
t.Fatalf("incorrect output type for %s\ngot:%s\nexpected:%s", tensor.Name, fsggml.TensorType(tensor.Kind), tt.expectedTensorTypes[tensor.Name])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConvertToF32(t *testing.T) {
|
||||||
|
expected := make([]float32, 256)
|
||||||
|
for i := range expected {
|
||||||
|
expected[i] = float32(i)
|
||||||
|
}
|
||||||
|
for dtype, data := range quantBytes {
|
||||||
|
// Skip the no-op
|
||||||
|
if dtype == fsggml.TensorTypeF32 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
t.Run(dtype.String(), func(t *testing.T) {
|
||||||
|
fp32 := ggml.ConvertToF32(data, uint32(dtype), 256)
|
||||||
|
similarity := cosineSimilarity(expected, fp32)
|
||||||
|
if similarity < 0.999 {
|
||||||
|
t.Fatalf("Results not similar enough: %s %f", dtype.String(), similarity)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func dotProduct[V float32 | float64](v1, v2 []V) V {
|
||||||
|
var result V = 0
|
||||||
|
for i := range v1 {
|
||||||
|
result += v1[i] * v2[i]
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func magnitude[V float32 | float64](v []V) V {
|
||||||
|
var result V = 0
|
||||||
|
for _, val := range v {
|
||||||
|
result += val * val
|
||||||
|
}
|
||||||
|
return V(math.Sqrt(float64(result)))
|
||||||
|
}
|
||||||
|
|
||||||
|
func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
|
||||||
|
return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Precomputed quantized data - arange 256
|
||||||
|
// # For gguf-py supported types
|
||||||
|
// import gguf
|
||||||
|
// import numpy as np
|
||||||
|
// print(repr(gguf.quantize(np.arange(256, dtype=np.float16), gguf.GGMLQuantizationType.Q4_0)))
|
||||||
|
//
|
||||||
|
// For types not supported by gguf-py converted via ggml_fp32_to_fp16_row and quantize_XXX
|
||||||
|
//
|
||||||
|
// data := make([]byte, 256*2)
|
||||||
|
// fp32 := make([]float32, 256)
|
||||||
|
// for i := range 256 {
|
||||||
|
// fp32[i] = float32(i)
|
||||||
|
// }
|
||||||
|
// l := C.quantize_q6_K((*C.float)(&fp32[0]), unsafe.Pointer(&data[0]), 1, 256, nil)
|
||||||
|
// for i := range data[:int(l)] {
|
||||||
|
// fmt.Printf("%d, ", data[i])
|
||||||
|
// }
|
||||||
|
var (
|
||||||
|
quantBytes = map[fsggml.TensorType][]byte{
|
||||||
|
fsggml.TensorTypeQ4_0: {
|
||||||
|
192, 195, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21,
|
||||||
|
21, 21, 21, 4, 4, 224, 199, 36, 36, 36, 36, 19, 19,
|
||||||
|
19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 240, 201, 19,
|
||||||
|
19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2,
|
||||||
|
1, 1, 240, 203, 18, 18, 18, 18, 18, 18, 18, 18, 1,
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 248, 204, 18, 18, 17, 17,
|
||||||
|
17, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 248,
|
||||||
|
205, 17, 17, 17, 17, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
|
1, 1, 1, 1, 248, 206, 17, 17, 1, 1, 1, 1, 1,
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 248, 207, 1, 1,
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
|
1,
|
||||||
|
},
|
||||||
|
fsggml.TensorTypeQ4_1: {
|
||||||
|
34, 64, 0, 0, 128, 128, 145, 145, 162, 162, 179, 179, 196,
|
||||||
|
196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 80, 128, 128,
|
||||||
|
145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, 247,
|
||||||
|
247, 34, 64, 0, 84, 128, 128, 145, 145, 162, 162, 179, 179,
|
||||||
|
196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 86, 128,
|
||||||
|
128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230,
|
||||||
|
247, 247, 34, 64, 0, 88, 128, 128, 145, 145, 162, 162, 179,
|
||||||
|
179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 89,
|
||||||
|
128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230,
|
||||||
|
230, 247, 247, 34, 64, 0, 90, 128, 128, 145, 145, 162, 162,
|
||||||
|
179, 179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0,
|
||||||
|
91, 128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213,
|
||||||
|
230, 230, 247, 247,
|
||||||
|
},
|
||||||
|
fsggml.TensorTypeQ5_0: {
|
||||||
|
192, 191, 1, 0, 0, 0, 128, 127, 127, 110, 110, 93, 93,
|
||||||
|
76, 76, 59, 59, 42, 42, 25, 25, 8, 224, 195, 0, 0,
|
||||||
|
0, 0, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21,
|
||||||
|
21, 21, 21, 4, 4, 240, 197, 0, 0, 0, 0, 53, 37,
|
||||||
|
37, 37, 37, 36, 36, 20, 20, 20, 20, 19, 19, 3, 3,
|
||||||
|
3, 240, 199, 0, 0, 0, 0, 36, 36, 36, 36, 19, 19,
|
||||||
|
19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 248, 200, 0,
|
||||||
|
0, 0, 0, 35, 19, 19, 19, 19, 19, 19, 18, 18, 18,
|
||||||
|
18, 2, 2, 2, 2, 2, 248, 201, 0, 0, 0, 0, 19,
|
||||||
|
19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2,
|
||||||
|
1, 1, 248, 202, 0, 0, 0, 0, 18, 18, 18, 18, 18,
|
||||||
|
18, 18, 18, 18, 2, 2, 1, 1, 1, 1, 1, 248, 203,
|
||||||
|
0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 1,
|
||||||
|
1, 1, 1, 1, 1, 1, 1,
|
||||||
|
},
|
||||||
|
fsggml.TensorTypeQ5_1: {
|
||||||
|
0, 60, 0, 0, 0, 0, 255, 255, 0, 17, 34, 51, 68,
|
||||||
|
85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60,
|
||||||
|
0, 80, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102,
|
||||||
|
119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 84,
|
||||||
|
0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136,
|
||||||
|
153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 86, 0, 0,
|
||||||
|
255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170,
|
||||||
|
187, 204, 221, 238, 255, 0, 60, 0, 88, 0, 0, 255, 255,
|
||||||
|
0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204,
|
||||||
|
221, 238, 255, 0, 60, 0, 89, 0, 0, 255, 255, 0, 17,
|
||||||
|
34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238,
|
||||||
|
255, 0, 60, 0, 90, 0, 0, 255, 255, 0, 17, 34, 51,
|
||||||
|
68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0,
|
||||||
|
60, 0, 91, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85,
|
||||||
|
102, 119, 136, 153, 170, 187, 204, 221, 238, 255,
|
||||||
|
},
|
||||||
|
fsggml.TensorTypeQ8_0: {
|
||||||
|
208, 51, 0, 4, 8, 12, 16, 20, 25, 29, 33, 37, 41,
|
||||||
|
45, 49, 53, 57, 61, 66, 70, 74, 78, 82, 86, 90, 94,
|
||||||
|
98, 102, 107, 111, 115, 119, 123, 127, 240, 55, 65, 67, 69,
|
||||||
|
71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
|
||||||
|
97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121,
|
||||||
|
123, 125, 127, 252, 57, 86, 87, 88, 90, 91, 92, 94, 95,
|
||||||
|
96, 98, 99, 100, 102, 103, 104, 106, 107, 108, 110, 111, 112,
|
||||||
|
114, 115, 116, 118, 119, 120, 122, 123, 124, 126, 127, 0, 60,
|
||||||
|
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
|
||||||
|
109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
|
||||||
|
122, 123, 124, 125, 126, 127, 2, 61, 102, 103, 104, 105, 105,
|
||||||
|
106, 107, 108, 109, 109, 110, 111, 112, 113, 113, 114, 115, 116,
|
||||||
|
117, 117, 118, 119, 120, 121, 121, 122, 123, 124, 125, 125, 126,
|
||||||
|
127, 4, 62, 106, 107, 108, 108, 109, 110, 110, 111, 112, 112,
|
||||||
|
113, 114, 114, 115, 116, 116, 117, 118, 118, 119, 120, 120, 121,
|
||||||
|
122, 122, 123, 124, 124, 125, 126, 126, 127, 6, 63, 109, 110,
|
||||||
|
110, 111, 112, 112, 113, 113, 114, 114, 115, 116, 116, 117, 117,
|
||||||
|
118, 118, 119, 120, 120, 121, 121, 122, 122, 123, 124, 124, 125,
|
||||||
|
125, 126, 126, 127, 4, 64, 112, 112, 113, 113, 114, 114, 115,
|
||||||
|
115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121,
|
||||||
|
122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127,
|
||||||
|
},
|
||||||
|
fsggml.TensorTypeBF16: {
0, 0, 128, 63, 0, 64, 64, 64, 128, 64, 160, 64, 192,
64, 224, 64, 0, 65, 16, 65, 32, 65, 48, 65, 64, 65,
80, 65, 96, 65, 112, 65, 128, 65, 136, 65, 144, 65, 152,
65, 160, 65, 168, 65, 176, 65, 184, 65, 192, 65, 200, 65,
208, 65, 216, 65, 224, 65, 232, 65, 240, 65, 248, 65, 0,
66, 4, 66, 8, 66, 12, 66, 16, 66, 20, 66, 24, 66,
28, 66, 32, 66, 36, 66, 40, 66, 44, 66, 48, 66, 52,
66, 56, 66, 60, 66, 64, 66, 68, 66, 72, 66, 76, 66,
80, 66, 84, 66, 88, 66, 92, 66, 96, 66, 100, 66, 104,
66, 108, 66, 112, 66, 116, 66, 120, 66, 124, 66, 128, 66,
130, 66, 132, 66, 134, 66, 136, 66, 138, 66, 140, 66, 142,
66, 144, 66, 146, 66, 148, 66, 150, 66, 152, 66, 154, 66,
156, 66, 158, 66, 160, 66, 162, 66, 164, 66, 166, 66, 168,
66, 170, 66, 172, 66, 174, 66, 176, 66, 178, 66, 180, 66,
182, 66, 184, 66, 186, 66, 188, 66, 190, 66, 192, 66, 194,
66, 196, 66, 198, 66, 200, 66, 202, 66, 204, 66, 206, 66,
208, 66, 210, 66, 212, 66, 214, 66, 216, 66, 218, 66, 220,
66, 222, 66, 224, 66, 226, 66, 228, 66, 230, 66, 232, 66,
234, 66, 236, 66, 238, 66, 240, 66, 242, 66, 244, 66, 246,
66, 248, 66, 250, 66, 252, 66, 254, 66, 0, 67, 1, 67,
2, 67, 3, 67, 4, 67, 5, 67, 6, 67, 7, 67, 8,
67, 9, 67, 10, 67, 11, 67, 12, 67, 13, 67, 14, 67,
15, 67, 16, 67, 17, 67, 18, 67, 19, 67, 20, 67, 21,
67, 22, 67, 23, 67, 24, 67, 25, 67, 26, 67, 27, 67,
28, 67, 29, 67, 30, 67, 31, 67, 32, 67, 33, 67, 34,
67, 35, 67, 36, 67, 37, 67, 38, 67, 39, 67, 40, 67,
41, 67, 42, 67, 43, 67, 44, 67, 45, 67, 46, 67, 47,
67, 48, 67, 49, 67, 50, 67, 51, 67, 52, 67, 53, 67,
54, 67, 55, 67, 56, 67, 57, 67, 58, 67, 59, 67, 60,
67, 61, 67, 62, 67, 63, 67, 64, 67, 65, 67, 66, 67,
67, 67, 68, 67, 69, 67, 70, 67, 71, 67, 72, 67, 73,
67, 74, 67, 75, 67, 76, 67, 77, 67, 78, 67, 79, 67,
80, 67, 81, 67, 82, 67, 83, 67, 84, 67, 85, 67, 86,
67, 87, 67, 88, 67, 89, 67, 90, 67, 91, 67, 92, 67,
93, 67, 94, 67, 95, 67, 96, 67, 97, 67, 98, 67, 99,
67, 100, 67, 101, 67, 102, 67, 103, 67, 104, 67, 105, 67,
106, 67, 107, 67, 108, 67, 109, 67, 110, 67, 111, 67, 112,
67, 113, 67, 114, 67, 115, 67, 116, 67, 117, 67, 118, 67,
119, 67, 120, 67, 121, 67, 122, 67, 123, 67, 124, 67, 125,
67, 126, 67, 127, 67,
},
fsggml.TensorTypeF16: {
0, 0, 0, 60, 0, 64, 0, 66, 0, 68, 0, 69, 0, 70, 0, 71, 0,
72, 128, 72, 0, 73, 128, 73, 0, 74, 128, 74, 0, 75, 128, 75,
0, 76, 64, 76, 128, 76, 192, 76, 0, 77, 64, 77, 128, 77, 192,
77, 0, 78, 64, 78, 128, 78, 192, 78, 0, 79, 64, 79, 128, 79,
192, 79, 0, 80, 32, 80, 64, 80, 96, 80, 128, 80, 160, 80,
192, 80, 224, 80, 0, 81, 32, 81, 64, 81, 96, 81, 128, 81,
160, 81, 192, 81, 224, 81, 0, 82, 32, 82, 64, 82, 96, 82,
128, 82, 160, 82, 192, 82, 224, 82, 0, 83, 32, 83, 64, 83,
96, 83, 128, 83, 160, 83, 192, 83, 224, 83, 0, 84, 16, 84,
32, 84, 48, 84, 64, 84, 80, 84, 96, 84, 112, 84, 128, 84,
144, 84, 160, 84, 176, 84, 192, 84, 208, 84, 224, 84, 240,
84, 0, 85, 16, 85, 32, 85, 48, 85, 64, 85, 80, 85, 96, 85,
112, 85, 128, 85, 144, 85, 160, 85, 176, 85, 192, 85, 208,
85, 224, 85, 240, 85, 0, 86, 16, 86, 32, 86, 48, 86, 64,
86, 80, 86, 96, 86, 112, 86, 128, 86, 144, 86, 160, 86,
176, 86, 192, 86, 208, 86, 224, 86, 240, 86, 0, 87, 16,
87, 32, 87, 48, 87, 64, 87, 80, 87, 96, 87, 112, 87, 128,
87, 144, 87, 160, 87, 176, 87, 192, 87, 208, 87, 224, 87,
240, 87, 0, 88, 8, 88, 16, 88, 24, 88, 32, 88, 40, 88,
48, 88, 56, 88, 64, 88, 72, 88, 80, 88, 88, 88, 96, 88,
104, 88, 112, 88, 120, 88, 128, 88, 136, 88, 144, 88, 152,
88, 160, 88, 168, 88, 176, 88, 184, 88, 192, 88, 200, 88,
208, 88, 216, 88, 224, 88, 232, 88, 240, 88, 248, 88, 0,
89, 8, 89, 16, 89, 24, 89, 32, 89, 40, 89, 48, 89, 56, 89,
64, 89, 72, 89, 80, 89, 88, 89, 96, 89, 104, 89, 112, 89,
120, 89, 128, 89, 136, 89, 144, 89, 152, 89, 160, 89, 168,
89, 176, 89, 184, 89, 192, 89, 200, 89, 208, 89, 216, 89,
224, 89, 232, 89, 240, 89, 248, 89, 0, 90, 8, 90, 16, 90,
24, 90, 32, 90, 40, 90, 48, 90, 56, 90, 64, 90, 72, 90, 80,
90, 88, 90, 96, 90, 104, 90, 112, 90, 120, 90, 128, 90,
136, 90, 144, 90, 152, 90, 160, 90, 168, 90, 176, 90, 184,
90, 192, 90, 200, 90, 208, 90, 216, 90, 224, 90, 232, 90,
240, 90, 248, 90, 0, 91, 8, 91, 16, 91, 24, 91, 32, 91, 40,
91, 48, 91, 56, 91, 64, 91, 72, 91, 80, 91, 88, 91, 96, 91,
104, 91, 112, 91, 120, 91, 128, 91, 136, 91, 144, 91, 152,
91, 160, 91, 168, 91, 176, 91, 184, 91, 192, 91, 200, 91,
208, 91, 216, 91, 224, 91, 232, 91, 240, 91, 248, 91,
},
fsggml.TensorTypeF32: {
0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64, 0, 0, 128,
64, 0, 0, 160, 64, 0, 0, 192, 64, 0, 0, 224, 64, 0, 0, 0, 65, 0,
0, 16, 65, 0, 0, 32, 65, 0, 0, 48, 65, 0, 0, 64, 65, 0, 0, 80, 65,
0, 0, 96, 65, 0, 0, 112, 65, 0, 0, 128, 65, 0, 0, 136, 65, 0, 0,
144, 65, 0, 0, 152, 65, 0, 0, 160, 65, 0, 0, 168, 65, 0, 0, 176,
65, 0, 0, 184, 65, 0, 0, 192, 65, 0, 0, 200, 65, 0, 0, 208, 65, 0,
0, 216, 65, 0, 0, 224, 65, 0, 0, 232, 65, 0, 0, 240, 65, 0, 0, 248,
65, 0, 0, 0, 66, 0, 0, 4, 66, 0, 0, 8, 66, 0, 0, 12, 66, 0, 0, 16,
66, 0, 0, 20, 66, 0, 0, 24, 66, 0, 0, 28, 66, 0, 0, 32, 66, 0, 0,
36, 66, 0, 0, 40, 66, 0, 0, 44, 66, 0, 0, 48, 66, 0, 0, 52, 66, 0,
0, 56, 66, 0, 0, 60, 66, 0, 0, 64, 66, 0, 0, 68, 66, 0, 0, 72, 66,
0, 0, 76, 66, 0, 0, 80, 66, 0, 0, 84, 66, 0, 0, 88, 66, 0, 0, 92, 66,
0, 0, 96, 66, 0, 0, 100, 66, 0, 0, 104, 66, 0, 0, 108, 66, 0, 0, 112,
66, 0, 0, 116, 66, 0, 0, 120, 66, 0, 0, 124, 66, 0, 0, 128, 66, 0, 0,
130, 66, 0, 0, 132, 66, 0, 0, 134, 66, 0, 0, 136, 66, 0, 0, 138, 66,
0, 0, 140, 66, 0, 0, 142, 66, 0, 0, 144, 66, 0, 0, 146, 66, 0, 0, 148,
66, 0, 0, 150, 66, 0, 0, 152, 66, 0, 0, 154, 66, 0, 0, 156, 66, 0, 0,
158, 66, 0, 0, 160, 66, 0, 0, 162, 66, 0, 0, 164, 66, 0, 0, 166, 66,
0, 0, 168, 66, 0, 0, 170, 66, 0, 0, 172, 66, 0, 0, 174, 66, 0, 0, 176,
66, 0, 0, 178, 66, 0, 0, 180, 66, 0, 0, 182, 66, 0, 0, 184, 66, 0, 0,
186, 66, 0, 0, 188, 66, 0, 0, 190, 66, 0, 0, 192, 66, 0, 0, 194, 66, 0,
0, 196, 66, 0, 0, 198, 66, 0, 0, 200, 66, 0, 0, 202, 66, 0, 0, 204, 66,
0, 0, 206, 66, 0, 0, 208, 66, 0, 0, 210, 66, 0, 0, 212, 66, 0, 0, 214, 66,
0, 0, 216, 66, 0, 0, 218, 66, 0, 0, 220, 66, 0, 0, 222, 66, 0, 0, 224, 66,
0, 0, 226, 66, 0, 0, 228, 66, 0, 0, 230, 66, 0, 0, 232, 66, 0, 0, 234, 66,
0, 0, 236, 66, 0, 0, 238, 66, 0, 0, 240, 66, 0, 0, 242, 66, 0, 0, 244, 66,
0, 0, 246, 66, 0, 0, 248, 66, 0, 0, 250, 66, 0, 0, 252, 66, 0, 0, 254, 66,
0, 0, 0, 67, 0, 0, 1, 67, 0, 0, 2, 67, 0, 0, 3, 67, 0, 0, 4, 67, 0, 0, 5, 67,
0, 0, 6, 67, 0, 0, 7, 67, 0, 0, 8, 67, 0, 0, 9, 67, 0, 0, 10, 67, 0, 0, 11,
67, 0, 0, 12, 67, 0, 0, 13, 67, 0, 0, 14, 67, 0, 0, 15, 67, 0, 0, 16, 67,
0, 0, 17, 67, 0, 0, 18, 67, 0, 0, 19, 67, 0, 0, 20, 67, 0, 0, 21, 67, 0, 0,
22, 67, 0, 0, 23, 67, 0, 0, 24, 67, 0, 0, 25, 67, 0, 0, 26, 67, 0, 0, 27,
67, 0, 0, 28, 67, 0, 0, 29, 67, 0, 0, 30, 67, 0, 0, 31, 67, 0, 0, 32, 67,
0, 0, 33, 67, 0, 0, 34, 67, 0, 0, 35, 67, 0, 0, 36, 67, 0, 0, 37, 67, 0, 0,
38, 67, 0, 0, 39, 67, 0, 0, 40, 67, 0, 0, 41, 67, 0, 0, 42, 67, 0, 0, 43, 67,
0, 0, 44, 67, 0, 0, 45, 67, 0, 0, 46, 67, 0, 0, 47, 67, 0, 0, 48, 67, 0, 0,
49, 67, 0, 0, 50, 67, 0, 0, 51, 67, 0, 0, 52, 67, 0, 0, 53, 67, 0, 0, 54, 67,
0, 0, 55, 67, 0, 0, 56, 67, 0, 0, 57, 67, 0, 0, 58, 67, 0, 0, 59, 67, 0, 0,
60, 67, 0, 0, 61, 67, 0, 0, 62, 67, 0, 0, 63, 67, 0, 0, 64, 67, 0, 0, 65, 67,
0, 0, 66, 67, 0, 0, 67, 67, 0, 0, 68, 67, 0, 0, 69, 67, 0, 0, 70, 67, 0, 0, 71,
67, 0, 0, 72, 67, 0, 0, 73, 67, 0, 0, 74, 67, 0, 0, 75, 67, 0, 0, 76, 67, 0,
0, 77, 67, 0, 0, 78, 67, 0, 0, 79, 67, 0, 0, 80, 67, 0, 0, 81, 67, 0, 0, 82,
67, 0, 0, 83, 67, 0, 0, 84, 67, 0, 0, 85, 67, 0, 0, 86, 67, 0, 0, 87, 67, 0,
0, 88, 67, 0, 0, 89, 67, 0, 0, 90, 67, 0, 0, 91, 67, 0, 0, 92, 67, 0, 0, 93,
67, 0, 0, 94, 67, 0, 0, 95, 67, 0, 0, 96, 67, 0, 0, 97, 67, 0, 0, 98, 67, 0,
0, 99, 67, 0, 0, 100, 67, 0, 0, 101, 67, 0, 0, 102, 67, 0, 0, 103, 67, 0, 0,
104, 67, 0, 0, 105, 67, 0, 0, 106, 67, 0, 0, 107, 67, 0, 0, 108, 67, 0, 0, 109,
67, 0, 0, 110, 67, 0, 0, 111, 67, 0, 0, 112, 67, 0, 0, 113, 67, 0, 0, 114, 67,
0, 0, 115, 67, 0, 0, 116, 67, 0, 0, 117, 67, 0, 0, 118, 67, 0, 0, 119, 67, 0,
0, 120, 67, 0, 0, 121, 67, 0, 0, 122, 67, 0, 0, 123, 67, 0, 0, 124, 67, 0, 0,
125, 67, 0, 0, 126, 67, 0, 0, 127, 67,
},
fsggml.TensorTypeQ4_K: {
52, 52, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 8, 15, 128,
128, 129, 129, 146, 146, 147, 147, 164, 164, 165, 165, 166, 182,
183, 183, 184, 200, 201, 201, 202, 218, 218, 219, 219, 236, 236,
237, 237, 254, 254, 255, 202, 202, 202, 203, 203, 203, 219, 219,
219, 220, 220, 220, 220, 220, 236, 237, 237, 237, 237, 237,
237, 237, 238, 254, 254, 254, 254, 254, 255, 255, 255, 255, 220,
220, 220, 220, 221, 221, 221, 221, 221, 221, 221, 237, 237, 237,
238, 238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 255, 255,
255, 255, 255, 255, 255, 237, 237, 237, 237, 237, 237, 237, 238,
238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 254, 254, 254,
254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
},
fsggml.TensorTypeQ2_K: {
1, 2, 3, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 184, 184,
184, 185, 249, 249, 249, 249, 249, 250, 250, 254, 254, 254, 254,
255, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 171, 69, 0, 0,
},
fsggml.TensorTypeQ5_K: {
32, 48, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 7, 15, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 0, 1, 2, 19, 20, 37, 38, 55, 56, 73, 74,
91, 92, 109, 110, 127, 112, 128, 129, 146, 147, 164, 165, 182, 183,
200, 201, 218, 219, 236, 237, 254, 133, 133, 149, 150, 150, 150,
167, 167, 167, 168, 184, 184, 185, 185, 201, 202, 202, 202, 219,
219, 219, 219, 236, 236, 236, 237, 253, 253, 254, 254, 254, 255,
169, 169, 169, 169, 186, 186, 186, 186, 186, 187, 187, 203, 203,
203, 204, 204, 204, 220, 220, 221, 221, 221, 221, 237, 237, 238,
238, 238, 238, 254, 255, 255, 203, 203, 203, 204, 204, 204, 204,
204, 220, 220, 220, 221, 221, 221, 221, 221, 237, 237, 238, 238,
238, 238, 238, 238, 254, 255, 255, 255, 255, 255, 255, 255,
},
fsggml.TensorTypeQ6_K: {
96, 110, 92, 90, 88, 70, 68, 50, 48, 46, 44, 42, 24, 22, 4, 2, 80,
95, 78, 77, 76, 59, 58, 57, 40, 39, 38, 21, 20, 19, 2, 1, 75, 75,
74, 57, 57, 56, 55, 39, 38, 37, 21, 20, 20, 19, 2, 2, 72, 55, 55,
54, 54, 37, 37, 36, 36, 19, 19, 18, 18, 1, 1, 0, 35, 35, 35, 35,
34, 18, 18, 18, 17, 17, 17, 1, 1, 0, 0, 0, 35, 35, 34, 34, 18,
18, 18, 17, 17, 17, 17, 1, 0, 0, 0, 0, 35, 35, 35, 19, 19, 18, 18,
18, 18, 18, 1, 1, 1, 1, 1, 1, 34, 34, 18, 18, 18, 18, 17, 17, 17,
17, 1, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 248, 240, 231, 224, 216, 208, 200, 192, 184, 176,
166, 160, 152, 144, 136, 128, 235, 43,
},
fsggml.TensorTypeQ3_K: {
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 20, 23, 23, 7, 7, 6, 6, 6, 2,
1, 1, 1, 1, 0, 0, 22, 22, 6, 6, 5, 5, 5, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 238, 204, 170, 136, 102, 68,
34, 1, 5, 5, 5, 5, 189, 63,
},
}
)
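These byte tables are golden outputs: the F32 entry decodes to the ramp 0.0, 1.0, 2.0, ..., 255.0, so the natural reading is that every tensor type quantizes the same 256-value ramp and the raw block bytes must match exactly. A minimal sketch of how such a table might be consumed follows; the names "expected", "quantize", and "float16ToFloat32" are placeholders for illustration, not identifiers from this commit, and the sketch assumes the surrounding test file's "bytes" and "testing" imports.

// Hypothetical golden-bytes test. "quantize" is assumed to return the
// raw quantized block bytes for the given tensor type.
func TestQuantizeGoldenBytes(t *testing.T) {
	// Same ramp the F32 fixture encodes: 0.0 through 255.0.
	input := make([]float32, 256)
	for i := range input {
		input[i] = float32(i)
	}
	for tensorType, want := range expected {
		got := quantize(input, tensorType)
		if !bytes.Equal(got, want) {
			t.Errorf("%v: quantized bytes mismatch", tensorType)
		}
	}
}

The simpler fixtures can be sanity-checked by hand. A GGML Q8_0 block is an fp16 scale followed by 32 signed int8 quants (34 bytes per 32 values); the first two bytes of the Q8_0 entry above (208, 51 = 0x33D0 ≈ 0.2441) decode to d ≈ 31/127, the expected scale for the first ramp block 0..31, and the quants that follow (0, 4, 8, ..., 127) are the ramp values divided by d. A sketch of that decoding, under the same assumptions:

// Hypothetical sketch: dequantize one Q8_0 block under that layout.
// "float16ToFloat32" stands in for an fp16 bit-conversion helper.
func dequantizeQ8_0(block []byte) []float32 {
	d := float16ToFloat32(uint16(block[0]) | uint16(block[1])<<8)
	out := make([]float32, 32)
	for i, q := range block[2:34] {
		out[i] = float32(int8(q)) * d
	}
	return out
}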
@ -24,7 +24,7 @@ import (

var stream bool = false

func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) {
t.Helper()
t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))

@ -99,7 +99,7 @@ func TestGenerateChat(t *testing.T) {
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []ggml.Tensor{
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
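Note that only the slice's element type changes in these hunks; the tensor literals beneath are untouched. That is standard Go: composite literals may elide the element type, so []*ggml.Tensor{{Name: ...}} already means a slice of &ggml.Tensor{...} values. A small illustrative snippet, reusing a value from the test above:

// With a pointer element type, each inner literal is implicitly
// &ggml.Tensor{...}, so the test bodies need no further edits.
tensors := []*ggml.Tensor{
	{Name: "token_embd.weight", Shape: []uint64{1}},
}
_ = tensors[0].Name // elements are already *ggml.Tensor values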
@ -158,7 +158,7 @@ func TestGenerateChat(t *testing.T) {
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []ggml.Tensor{})
}, []*ggml.Tensor{})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
Files: map[string]string{"bert.gguf": digest},
@ -643,7 +643,7 @@ func TestGenerate(t *testing.T) {
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []ggml.Tensor{
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@ -698,7 +698,7 @@ func TestGenerate(t *testing.T) {
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "bert",
"bert.pooling_type": uint32(0),
}, []ggml.Tensor{})
}, []*ggml.Tensor{})

w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
@ -126,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
"tokenizer.ggml.tokens": []string{" "},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []ggml.Tensor{
}, []*ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}))