The quantization PR didn't block all unsupported file types; this PR fixes that. It also updates the API docs to reflect the now-reduced set of supported types.
Daniel Hiltgen, 2025-05-12 15:23:31 -07:00 (committed via GitHub)
parent 0cefd46f23
commit 9d6df90805
GPG Key ID: B5690EEEBB952194
4 changed files with 88 additions and 382 deletions

File 1 of 4: API documentation

@@ -19,7 +19,7 @@
 ### Model names
-Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
 ### Durations
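The `model:tag` convention above is simple enough to sketch in Go. This is an illustrative helper, not code from this commit; the `splitModelName` name is hypothetical:

```go
package main

import (
	"fmt"
	"strings"
)

// splitModelName illustrates the documented `model:tag` format: the tag
// is optional and defaults to "latest", and the model part may carry a
// namespace such as "example/model".
func splitModelName(name string) (model, tag string) {
	// Split on the last ':' so namespaced names like
	// "example/model:tag" are handled correctly.
	if i := strings.LastIndex(name, ":"); i >= 0 {
		return name[:i], name[i+1:]
	}
	return name, "latest"
}

func main() {
	fmt.Println(splitModelName("orca-mini:3b-q8_0")) // orca-mini 3b-q8_0
	fmt.Println(splitModelName("llama3:70b"))        // llama3 70b
	fmt.Println(splitModelName("example/model"))     // example/model latest
}
```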
@@ -952,19 +952,8 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
 | Type | Recommended |
 | --- | :-: |
-| q2_K | |
-| q3_K_L | |
-| q3_K_M | |
-| q3_K_S | |
-| q4_0 | |
-| q4_1 | |
 | q4_K_M | * |
 | q4_K_S | |
-| q5_0 | |
-| q5_1 | |
-| q5_K_M | |
-| q5_K_S | |
-| q6_K | |
 | q8_0 | * |
 ### Examples
@@ -1009,8 +998,8 @@ Quantize a non-quantized model.
 ```shell
 curl http://localhost:11434/api/create -d '{
-  "model": "llama3.1:quantized",
-  "from": "llama3.1:8b-instruct-fp16",
+  "model": "llama3.2:quantized",
+  "from": "llama3.2:3b-instruct-fp16",
   "quantize": "q4_K_M"
 }'
 ```
@@ -1020,12 +1009,14 @@ curl http://localhost:11434/api/create -d '{
 A stream of JSON objects is returned:
 ```json
-{"status":"quantizing F16 model to Q4_K_M"}
-{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
-{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
-{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
-{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
-{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
-{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
+{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
+{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
+{"status":"verifying conversion"}
+{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
 {"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
-{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
 {"status":"writing manifest"}
 {"status":"success"}
 ```
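For reference, the request/response pair above is straightforward to drive from Go. A minimal client sketch, assuming only what the docs show (the `/api/create` endpoint and a stream of JSON objects carrying `status` plus optional `digest`/`total`/`completed` progress fields); type and variable names are illustrative:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// createProgress holds the fields shown in the streamed response above.
type createProgress struct {
	Status    string `json:"status"`
	Digest    string `json:"digest,omitempty"`
	Total     int64  `json:"total,omitempty"`
	Completed int64  `json:"completed,omitempty"`
}

func main() {
	body, err := json.Marshal(map[string]string{
		"model":    "llama3.2:quantized",
		"from":     "llama3.2:3b-instruct-fp16",
		"quantize": "q4_K_M",
	})
	if err != nil {
		panic(err)
	}
	resp, err := http.Post("http://localhost:11434/api/create", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The body is a stream of newline-delimited JSON objects; a
	// json.Decoder consumes them one at a time until EOF.
	dec := json.NewDecoder(resp.Body)
	for dec.More() {
		var p createProgress
		if err := dec.Decode(&p); err != nil {
			panic(err)
		}
		if p.Total > 0 {
			fmt.Printf("%s (%d/%d)\n", p.Status, p.Completed, p.Total)
		} else {
			fmt.Println(p.Status)
		}
	}
}
```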
@@ -1163,29 +1154,37 @@ A single JSON object will be returned.
 {
   "models": [
     {
-      "name": "codellama:13b",
-      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
-      "size": 7365960935,
-      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
+      "name": "deepseek-r1:latest",
+      "model": "deepseek-r1:latest",
+      "modified_at": "2025-05-10T08:06:48.639712648-07:00",
+      "size": 4683075271,
+      "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
       "details": {
+        "parent_model": "",
         "format": "gguf",
-        "family": "llama",
-        "families": null,
-        "parameter_size": "13B",
-        "quantization_level": "Q4_0"
+        "family": "qwen2",
+        "families": [
+          "qwen2"
+        ],
+        "parameter_size": "7.6B",
+        "quantization_level": "Q4_K_M"
       }
     },
     {
-      "name": "llama3:latest",
-      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
-      "size": 3825819519,
-      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
+      "name": "llama3.2:latest",
+      "model": "llama3.2:latest",
+      "modified_at": "2025-05-04T17:37:44.706015396-07:00",
+      "size": 2019393189,
+      "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
       "details": {
+        "parent_model": "",
         "format": "gguf",
         "family": "llama",
-        "families": null,
-        "parameter_size": "7B",
-        "quantization_level": "Q4_0"
+        "families": [
+          "llama"
+        ],
+        "parameter_size": "3.2B",
+        "quantization_level": "Q4_K_M"
       }
     }
   ]
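The reshaped list response decodes cleanly into Go types along these lines; a sketch derived only from the JSON above, with illustrative (not the server's) type names:

```go
package apitypes

import "time"

// listResponse mirrors the JSON shown above.
type listResponse struct {
	Models []listModel `json:"models"`
}

type listModel struct {
	Name       string       `json:"name"`
	Model      string       `json:"model"`
	ModifiedAt time.Time    `json:"modified_at"` // RFC 3339, as shown above
	Size       int64        `json:"size"`
	Digest     string       `json:"digest"`
	Details    modelDetails `json:"details"`
}

type modelDetails struct {
	ParentModel       string   `json:"parent_model"`
	Format            string   `json:"format"`
	Family            string   `json:"family"`
	Families          []string `json:"families"`
	ParameterSize     string   `json:"parameter_size"`
	QuantizationLevel string   `json:"quantization_level"`
}
```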

File 2 of 4: GGML FileType definitions (Go)

@@ -12,42 +12,42 @@ type FileType uint32
 const (
 	FileTypeF32 FileType = iota
 	FileTypeF16
-	FileTypeQ4_0
-	FileTypeQ4_1
+	fileTypeQ4_0
+	fileTypeQ4_1
 	fileTypeQ4_1_F16 // unused by GGML
 	fileTypeQ4_2     // unused by GGML
 	fileTypeQ4_3     // unused by GGML
 	FileTypeQ8_0
-	FileTypeQ5_0
-	FileTypeQ5_1
-	FileTypeQ2_K
-	FileTypeQ3_K_S
-	FileTypeQ3_K_M
-	FileTypeQ3_K_L
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
 	FileTypeQ4_K_S
 	FileTypeQ4_K_M
-	FileTypeQ5_K_S
-	FileTypeQ5_K_M
-	FileTypeQ6_K
-	fileTypeIQ2_XXS // not supported by ollama
-	fileTypeIQ2_XS  // not supported by ollama
-	FileTypeQ2_K_S
-	fileTypeIQ3_XS  // not supported by ollama
-	fileTypeIQ3_XXS // not supported by ollama
-	fileTypeIQ1_S   // not supported by ollama
-	fileTypeIQ4_NL  // not supported by ollama
-	fileTypeIQ3_S   // not supported by ollama
-	fileTypeIQ3_M   // not supported by ollama
-	fileTypeIQ2_S   // not supported by ollama
-	fileTypeIQ2_M   // not supported by ollama
-	fileTypeIQ4_XS  // not supported by ollama
-	fileTypeIQ1_M   // not supported by ollama
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+	fileTypeIQ2_XXS
+	fileTypeIQ2_XS
+	fileTypeQ2_K_S
+	fileTypeIQ3_XS
+	fileTypeIQ3_XXS
+	fileTypeIQ1_S
+	fileTypeIQ4_NL
+	fileTypeIQ3_S
+	fileTypeIQ3_M
+	fileTypeIQ2_S
+	fileTypeIQ2_M
+	fileTypeIQ4_XS
+	fileTypeIQ1_M
 	FileTypeBF16
 	fileTypeQ4_0_4_4 // unused by GGML
 	fileTypeQ4_0_4_8 // unused by GGML
 	fileTypeQ4_0_8_8 // unused by GGML
-	fileTypeTQ1_0 // not supported by ollama
-	fileTypeTQ2_0 // not supported by ollama
+	fileTypeTQ1_0
+	fileTypeTQ2_0
 	FileTypeUnknown = 1024
 )
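The point of the lower-casing above is that Go only exports identifiers starting with an upper-case letter, so the renamed constants drop out of the package's public API while staying usable internally (for instance, so `String()` below can still label existing models). A sketch of the effect from an importing package; the import path here is hypothetical:

```go
package main

import (
	"fmt"

	// Hypothetical import path, for illustration only.
	fsggml "example.com/fs/ggml"
)

func main() {
	ft := fsggml.FileTypeQ4_K_M // OK: exported, still a valid quantize target
	// ft = fsggml.fileTypeQ6_K // compile error: unexported identifier
	fmt.Println(ft) // prints "Q4_K_M" via the String method
}
```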
@@ -60,36 +60,12 @@ func ParseFileType(s string) (FileType, error) {
 		return FileTypeF32, nil
 	case "F16":
 		return FileTypeF16, nil
-	case "Q4_0":
-		return FileTypeQ4_0, nil
-	case "Q4_1":
-		return FileTypeQ4_1, nil
 	case "Q8_0":
 		return FileTypeQ8_0, nil
-	case "Q5_0":
-		return FileTypeQ5_0, nil
-	case "Q5_1":
-		return FileTypeQ5_1, nil
-	case "Q2_K":
-		return FileTypeQ2_K, nil
-	case "Q3_K_S":
-		return FileTypeQ3_K_S, nil
-	case "Q3_K_M":
-		return FileTypeQ3_K_M, nil
-	case "Q3_K_L":
-		return FileTypeQ3_K_L, nil
 	case "Q4_K_S":
 		return FileTypeQ4_K_S, nil
 	case "Q4_K_M", "Q4_K":
 		return FileTypeQ4_K_M, nil
-	case "Q5_K_S":
-		return FileTypeQ5_K_S, nil
-	case "Q5_K_M", "Q5_K":
-		return FileTypeQ5_K_M, nil
-	case "Q6_K":
-		return FileTypeQ6_K, nil
-	case "Q2_K_S":
-		return FileTypeQ2_K_S, nil
 	case "BF16":
 		return FileTypeBF16, nil
 	default:
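With those cases gone, `ParseFileType` is the gate that actually blocks unsupported targets: anything other than F32, F16, BF16, Q8_0, Q4_K_S, or Q4_K_M (plus the `Q4_K` alias) falls through to the `default` branch and returns an error. A usage sketch, with the wrapper name hypothetical:

```go
// validateQuantizeTarget shows the gate in use.
func validateQuantizeTarget(s string) error {
	_, err := fsggml.ParseFileType(s)
	return err
}

// validateQuantizeTarget("Q4_K_M") == nil   (still supported)
// validateQuantizeTarget("Q6_K")   != nil   (now rejected)
```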
@@ -111,40 +87,41 @@ func ParseFileType(s string) (FileType, error) {
 }
 func (t FileType) String() string {
+	// Note: this routine will return a broader set of file types for existing models
 	switch t {
 	case FileTypeF32:
 		return "F32"
 	case FileTypeF16:
 		return "F16"
-	case FileTypeQ4_0:
+	case fileTypeQ4_0:
 		return "Q4_0"
-	case FileTypeQ4_1:
+	case fileTypeQ4_1:
 		return "Q4_1"
 	case FileTypeQ8_0:
 		return "Q8_0"
-	case FileTypeQ5_0:
+	case fileTypeQ5_0:
 		return "Q5_0"
-	case FileTypeQ5_1:
+	case fileTypeQ5_1:
 		return "Q5_1"
-	case FileTypeQ2_K:
+	case fileTypeQ2_K:
 		return "Q2_K"
-	case FileTypeQ3_K_S:
+	case fileTypeQ3_K_S:
 		return "Q3_K_S"
-	case FileTypeQ3_K_M:
+	case fileTypeQ3_K_M:
 		return "Q3_K_M"
-	case FileTypeQ3_K_L:
+	case fileTypeQ3_K_L:
 		return "Q3_K_L"
 	case FileTypeQ4_K_S:
 		return "Q4_K_S"
 	case FileTypeQ4_K_M:
 		return "Q4_K_M"
-	case FileTypeQ5_K_S:
+	case fileTypeQ5_K_S:
 		return "Q5_K_S"
-	case FileTypeQ5_K_M:
+	case fileTypeQ5_K_M:
 		return "Q5_K_M"
-	case FileTypeQ6_K:
+	case fileTypeQ6_K:
 		return "Q6_K"
-	case FileTypeQ2_K_S:
+	case fileTypeQ2_K_S:
 		return "Q2_K_S"
 	case FileTypeBF16:
 		return "BF16"
@@ -163,35 +140,35 @@ func (ftype FileType) ToTensorType() TensorType {
 		return TensorTypeF32
 	case FileTypeF16:
 		return TensorTypeF16
-	case FileTypeQ4_0:
+	case fileTypeQ4_0:
 		return TensorTypeQ4_0
-	case FileTypeQ4_1:
+	case fileTypeQ4_1:
 		return TensorTypeQ4_1
 	case FileTypeQ8_0:
 		return TensorTypeQ8_0
-	case FileTypeQ5_0:
+	case fileTypeQ5_0:
 		return TensorTypeQ5_0
-	case FileTypeQ5_1:
+	case fileTypeQ5_1:
 		return TensorTypeQ5_1
-	case FileTypeQ2_K:
+	case fileTypeQ2_K:
 		return TensorTypeQ2_K
-	case FileTypeQ3_K_S:
+	case fileTypeQ3_K_S:
 		return TensorTypeQ3_K
-	case FileTypeQ3_K_M:
+	case fileTypeQ3_K_M:
 		return TensorTypeQ3_K
-	case FileTypeQ3_K_L:
+	case fileTypeQ3_K_L:
 		return TensorTypeQ3_K
 	case FileTypeQ4_K_S:
 		return TensorTypeQ4_K
 	case FileTypeQ4_K_M:
 		return TensorTypeQ4_K
-	case FileTypeQ5_K_S:
+	case fileTypeQ5_K_S:
 		return TensorTypeQ5_K
-	case FileTypeQ5_K_M:
+	case fileTypeQ5_K_M:
 		return TensorTypeQ5_K
-	case FileTypeQ6_K:
+	case fileTypeQ6_K:
 		return TensorTypeQ6_K
-	case FileTypeQ2_K_S:
+	case fileTypeQ2_K_S:
 		return TensorTypeQ2_K
 	case FileTypeBF16:
 		return TensorTypeBF16

File 3 of 4: quantization type-selection logic (Go)

@@ -70,23 +70,7 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 			newType = fsggml.TensorTypeQ6_K
 		}
 	} else if strings.Contains(name, "attn_v.weight") {
-		if ftype == fsggml.FileTypeQ2_K {
-			if kv.GQA() >= 4 {
-				newType = fsggml.TensorTypeQ4_K
-			} else {
-				newType = fsggml.TensorTypeQ3_K
-			}
-		} else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 {
-			newType = fsggml.TensorTypeQ4_K
-		} else if ftype == fsggml.FileTypeQ3_K_M {
-			if qs.iAttnV < 2 {
-				newType = fsggml.TensorTypeQ5_K
-			} else {
-				newType = fsggml.TensorTypeQ4_K
-			}
-		} else if ftype == fsggml.FileTypeQ3_K_L {
-			newType = fsggml.TensorTypeQ5_K
-		} else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) &&
+		if (ftype == fsggml.FileTypeQ4_K_M) &&
 			useMoreBits(qs.iAttnV, qs.nAttnV) {
 			newType = fsggml.TensorTypeQ6_K
 		} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
@@ -114,54 +98,23 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 	} else if strings.Contains(name, "ffn_down") {
 		iLayer := qs.iFfnDown
 		n_layer := qs.nFfnDown
-		if ftype == fsggml.FileTypeQ2_K {
-			newType = fsggml.TensorTypeQ3_K
-		} else if ftype == fsggml.FileTypeQ2_K_S {
-			if iLayer < n_layer/8 {
-				newType = fsggml.TensorTypeQ4_K
-			}
-		} else if ftype == fsggml.FileTypeQ3_K_M {
-			if iLayer < n_layer/16 {
-				newType = fsggml.TensorTypeQ5_K
-			} else if useMoreBits(iLayer, n_layer) {
-				newType = fsggml.TensorTypeQ4_K
-			} else {
-				newType = fsggml.TensorTypeQ3_K
-			}
-		} else if ftype == fsggml.FileTypeQ3_K_L {
-			newType = fsggml.TensorTypeQ5_K
-		} else if ftype == fsggml.FileTypeQ4_K_M {
+		if ftype == fsggml.FileTypeQ4_K_M {
 			if useMoreBits(iLayer, n_layer) {
 				newType = fsggml.TensorTypeQ6_K
 			}
-		} else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) {
-			newType = fsggml.TensorTypeQ6_K
 		} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
 			newType = fsggml.TensorTypeQ5_K
 		}
 		qs.iFfnDown++
 	} else if strings.Contains(name, "attn_output.weight") {
 		if nExperts == 8 {
-			if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M ||
-				ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
-				newType = fsggml.TensorTypeQ5_K
-			}
-		} else {
-			if ftype == fsggml.FileTypeQ2_K {
-				newType = fsggml.TensorTypeQ3_K
-			} else if ftype == fsggml.FileTypeQ3_K_M {
-				newType = fsggml.TensorTypeQ4_K
-			} else if ftype == fsggml.FileTypeQ3_K_L {
+			if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
 				newType = fsggml.TensorTypeQ5_K
 			}
 		}
 	} else if strings.Contains(name, "attn_qkv.weight") {
-		if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L {
-			newType = fsggml.TensorTypeQ4_K
-		} else if ftype == fsggml.FileTypeQ4_K_M {
+		if ftype == fsggml.FileTypeQ4_K_M {
 			newType = fsggml.TensorTypeQ5_K
-		} else if ftype == fsggml.FileTypeQ5_K_M {
-			newType = fsggml.TensorTypeQ6_K
 		}
 	}
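Both surviving branches lean on `useMoreBits`, whose body isn't part of this diff. The sketch below mirrors the llama.cpp heuristic this code appears to be ported from (spend extra bits on the first and last eighth of the layers, plus every third layer in between); treat it as an assumption, not the exact source:

```go
// useMoreBits reports whether layer iLayer (of nLayers) should get a
// higher-precision quantization. Assumed to mirror llama.cpp's
// llama_tensor_quantize_use_more_bits; not taken from this commit.
func useMoreBits(iLayer, nLayers int) bool {
	return iLayer < nLayers/8 || iLayer >= 7*nLayers/8 ||
		(iLayer-nLayers/8)%3 == 2
}
```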

File 4 of 4: quantization type-selection tests (Go)

@@ -42,71 +42,6 @@ func TestGetTensorNewType(t *testing.T) {
 			ftype:       fsggml.FileTypeF32,
 			expected:    fsggml.TensorTypeQ6_K,
 		},
-		{
-			name: "attn_v.weight_q4_k",
-			kv: map[string]any{
-				"general.architecture":        "foo",
-				"foo.attention.head_count":    uint32(4),
-				"foo.attention.head_count_kv": uint32(1),
-			},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_v.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ2_K,
-			expected:    fsggml.TensorTypeQ4_K,
-		},
-		{
-			name:        "attn_v.weight_q3_k",
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_v.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ2_K,
-			expected:    fsggml.TensorTypeQ3_K,
-		},
-		{
-			name: "attn_v.weight_q2_k_s_q4_k",
-			kv: map[string]any{
-				"general.architecture":        "foo",
-				"foo.attention.head_count":    uint32(4),
-				"foo.attention.head_count_kv": uint32(1),
-			},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_v.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ2_K_S,
-			expected:    fsggml.TensorTypeQ4_K,
-		},
-		{
-			name:        "attn_v.weight_q3_k_m",
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_v.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_M,
-			expected:    fsggml.TensorTypeQ5_K,
-		},
-		{
-			name: "attn_v.weight_q3_k_m_i",
-			qs: quantizeState{
-				iAttnV: 2,
-			},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_v.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_M,
-			expected:    fsggml.TensorTypeQ4_K,
-		},
-		{
-			name:        "attn_v.weight_q3_k_l",
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_v.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_L,
-			expected:    fsggml.TensorTypeQ5_K,
-		},
 		{
 			name: "attn_v.weight_q4_k_m",
 			qs: quantizeState{
@@ -156,88 +91,6 @@ func TestGetTensorNewType(t *testing.T) {
 			ftype:       fsggml.FileTypeF32,
 			expected:    fsggml.TensorTypeQ8_0,
 		},
-		{
-			name:        "ffn_down_q2_k",
-			qs:          quantizeState{},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "ffn_down",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ2_K,
-			expected:    fsggml.TensorTypeQ3_K,
-		},
-		{
-			name:        "ffn_down_q2_k_s",
-			qs:          quantizeState{},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "ffn_down",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ2_K_S,
-			expected:    fsggml.TensorTypeQ4_0,
-		},
-		{
-			name: "ffn_down_q2_k_s_layers",
-			qs: quantizeState{
-				iFfnDown: 2,
-				nFfnDown: 3 * 8,
-			},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "ffn_down",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ2_K_S,
-			expected:    fsggml.TensorTypeQ4_K,
-		},
-		{
-			name: "ffn_down_q3_k_m_base",
-			qs: quantizeState{
-				iFfnDown: 1,
-				nFfnDown: 8,
-			},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "ffn_down",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_M,
-			expected:    fsggml.TensorTypeQ3_K,
-		},
-		{
-			name: "ffn_down_q3_k_m_16",
-			qs: quantizeState{
-				iFfnDown: 2,
-				nFfnDown: 3 * 16,
-			},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "ffn_down",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_M,
-			expected:    fsggml.TensorTypeQ5_K,
-		},
-		{
-			name: "ffn_down_q3_k_m_8",
-			qs: quantizeState{
-				iFfnDown: 2,
-				nFfnDown: 3 * 8,
-			},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "ffn_down",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_M,
-			expected:    fsggml.TensorTypeQ4_K,
-		},
-		{
-			name:        "ffn_down_q3_k_l",
-			qs:          quantizeState{},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "ffn_down",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_L,
-			expected:    fsggml.TensorTypeQ5_K,
-		},
 		{
 			name: "ffn_down_q4_k_m",
 			qs: quantizeState{
@@ -264,19 +117,6 @@ func TestGetTensorNewType(t *testing.T) {
 			ftype:       fsggml.FileTypeQ4_K_M,
 			expected:    fsggml.TensorTypeQ6_K,
 		},
-		{
-			name: "ffn_down_q5_k_m",
-			qs: quantizeState{
-				iFfnDown: 2,
-				nFfnDown: 3 * 8,
-			},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "ffn_down",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ5_K_M,
-			expected:    fsggml.TensorTypeQ6_K,
-		},
 		{
 			name: "ffn_down_q4_k_s",
 			qs: quantizeState{
@@ -290,59 +130,6 @@ func TestGetTensorNewType(t *testing.T) {
 			ftype:       fsggml.FileTypeQ4_K_S,
 			expected:    fsggml.TensorTypeQ5_K,
 		},
-		{
-			name: "attn_output.weight_8_expert",
-			qs:   quantizeState{},
-			kv: map[string]any{
-				"general.architecture": "foo",
-				"foo.expert_count":     uint32(8),
-			},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_output.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ2_K,
-			expected:    fsggml.TensorTypeQ5_K,
-		},
-		{
-			name:        "attn_output.weight_q2",
-			qs:          quantizeState{},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_output.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ2_K,
-			expected:    fsggml.TensorTypeQ3_K,
-		},
-		{
-			name:        "attn_output.weight_q3_k_m",
-			qs:          quantizeState{},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_output.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_M,
-			expected:    fsggml.TensorTypeQ4_K,
-		},
-		{
-			name:        "attn_output.weight_q3_k_l",
-			qs:          quantizeState{},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_output.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_L,
-			expected:    fsggml.TensorTypeQ5_K,
-		},
-		{
-			name:        "attn_qkv.weight_q3_k_m",
-			qs:          quantizeState{},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_qkv.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ3_K_M,
-			expected:    fsggml.TensorTypeQ4_K,
-		},
 		{
 			name: "attn_qkv.weight_q4_k_m",
 			qs:   quantizeState{},
@@ -353,16 +140,6 @@ func TestGetTensorNewType(t *testing.T) {
 			ftype:       fsggml.FileTypeQ4_K_M,
 			expected:    fsggml.TensorTypeQ5_K,
 		},
-		{
-			name:        "attn_qkv.weight_q5_k_m",
-			qs:          quantizeState{},
-			kv:          map[string]any{},
-			newType:     fsggml.TensorTypeQ4_0,
-			tensor_name: "blk.0.attn_qkv.weight",
-			shape:       []uint64{256},
-			ftype:       fsggml.FileTypeQ5_K_M,
-			expected:    fsggml.TensorTypeQ6_K,
-		},
 	}
 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
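The deleted cases all exercised source file types that can no longer be requested. A complementary test, sketched here as a suggestion rather than something in this commit (it assumes the test file's existing `fsggml` import), would pin down the new `ParseFileType` behavior directly:

```go
func TestParseFileTypeRejectsUnsupported(t *testing.T) {
	// These names parsed successfully before this change and should
	// now fall through to ParseFileType's error path.
	unsupported := []string{
		"Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q2_K", "Q2_K_S",
		"Q3_K_S", "Q3_K_M", "Q3_K_L", "Q5_K_S", "Q5_K_M", "Q6_K",
	}
	for _, s := range unsupported {
		if _, err := fsggml.ParseFileType(s); err == nil {
			t.Errorf("ParseFileType(%q) succeeded, want error", s)
		}
	}
}
```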