server: do not attempt to parse offset file as gguf

This logic caused problems for me when importing a GGUF file that had some padding at the end. The valid GGUF was read, but the loop then tried to parse the data at the resulting offset (the trailing padding) as a separate GGUF file. That is not correct, so decode the blob only once.
2025-04-09 09:41:46 -07:00
8 changed files with 91 additions and 289 deletions
--- a/4
+++ b/4
@@ -104,8 +104,8 @@ COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 FROM --platform=linux/arm64 scratch AS arm64
 COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
-COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
-COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
+COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 lib/ollama/cuda_jetpack5
+COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 lib/ollama/cuda_jetpack6

 FROM scratch AS rocm
 COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
--- a/api/types.go
+++ b/api/types.go
@@ -163,7 +163,6 @@ func (t *ToolCallFunctionArguments) String() string {

 type Tool struct {
 	Type     string       `json:"type"`
-	Items    any          `json:"items,omitempty"`
 	Function ToolFunction `json:"function"`
 }

@@ -214,12 +213,9 @@ type ToolFunction struct {
 	Description string `json:"description"`
 	Parameters  struct {
 		Type       string   `json:"type"`
-		Defs       any      `json:"$defs,omitempty"`
-		Items      any      `json:"items,omitempty"`
 		Required   []string `json:"required"`
 		Properties map[string]struct {
 			Type        PropertyType `json:"type"`
-			Items       any          `json:"items,omitempty"`
 			Description string       `json:"description"`
 			Enum        []any        `json:"enum,omitempty"`
 		} `json:"properties"`
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -6,7 +6,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"reflect"
 	"slices"
 	"strings"

@@ -53,80 +52,32 @@ func (kv KV) EmbeddingLength() uint64 {
 	return uint64(kv.Uint("embedding_length"))
 }

-func (kv KV) HeadCounts() []uint64 {
-	return kv.UintOrArrayAsArray("attention.head_count", kv.BlockCount(), 1)
+func (kv KV) HeadCount() uint64 {
+	return uint64(kv.Uint("attention.head_count"))
 }

-func (kv KV) HeadCountKVs() []uint64 {
-	return kv.UintOrArrayAsArray("attention.head_count_kv", kv.BlockCount(), 1)
+func (kv KV) HeadCountKV() uint64 {
+	return uint64(kv.Uint("attention.head_count_kv", 1))
 }

-func (kv KV) EmbeddingHeadCount() []uint64 {
-	headCount := kv.HeadCounts()
-	embeddingHeadCount := make([]uint64, len(headCount))
-	for i, heads := range headCount {
-		if heads == 0 {
-			embeddingHeadCount[i] = 0
-		} else {
-			embeddingHeadCount[i] = kv.EmbeddingLength() / heads
-		}
+func (kv KV) EmbeddingHeadCount() uint64 {
+	if heads := kv.HeadCount(); heads > 0 {
+		return kv.EmbeddingLength() / heads
 	}

-	return embeddingHeadCount
+	return 0
 }

-func (kv KV) FillArrayOrDefault(key string, defaultValue []uint64) []uint64 {
-	length := len(defaultValue)
-	if v, ok := keyValueUntyped(kv, key); ok {
-		switch v := v.(type) {
-		case uint32:
-			return FillArray(uint64(v), length)
-		case uint64:
-			return FillArray(v, length)
-		case int32:
-			return FillArray(uint64(v), length)
-		default:
-			slog.Warn("unsupported type", "key", key, "type", reflect.TypeOf(v))
-		}
-	}
-
-	return defaultValue
+func (kv KV) EmbeddingHeadCountK() uint64 {
+	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
 }

-func (kv KV) EmbeddingHeadCountK() []uint64 {
-	return kv.FillArrayOrDefault("attention.key_length", kv.EmbeddingHeadCount())
+func (kv KV) EmbeddingHeadCountV() uint64 {
+	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
 }

-func (kv KV) EmbeddingHeadCountV() []uint64 {
-	return kv.FillArrayOrDefault("attention.value_length", kv.EmbeddingHeadCount())
-}
-
-func (kv KV) GQAMax() uint64 {
-	heads := kv.HeadCounts()
-	headsKV := kv.HeadCountKVs()
-	if len(heads) != len(headsKV) {
-		slog.Warn("head count and head count kv are not the same length")
-		return 0
-	}
-	if len(heads) == 0 {
-		slog.Warn("head count is empty")
-		return 0
-	}
-
-	maxGQA := uint64(0)
-	for i := range heads {
-		head := heads[i]
-		headKV := headsKV[i]
-		if head == 0 || headKV == 0 {
-			return 0
-		}
-		gqa := head / headKV
-		if gqa > maxGQA {
-			maxGQA = gqa
-		}
-	}
-
-	return maxGQA
+func (kv KV) GQA() uint64 {
+	return kv.HeadCount() / kv.HeadCountKV()
 }

 func (kv KV) ContextLength() uint64 {
@@ -153,41 +104,6 @@ func (kv KV) Bool(key string, defaultValue ...bool) bool {
 	return keyValue(kv, key, append(defaultValue, false)...)
 }

-func (kv KV) UintOrArrayAsArray(key string, n uint64, defaultSingleValue ...uint64) []uint64 {
-	var singleValue *uint64
-	if v, ok := keyValueUntyped(kv, key); ok {
-		switch v := v.(type) {
-		case *array:
-			switch v.values[0].(type) {
-			case int32, uint32, uint64:
-				values, ok := AsUint64Array(v.values)
-				if ok {
-					return values
-				}
-			default:
-				slog.Warn("unexpected array value type", "key", key, "type", reflect.TypeOf(v))
-			}
-		case uint32:
-			val := uint64(v)
-			singleValue = &val
-		case int32:
-			val := uint64(v)
-			singleValue = &val
-		}
-	}
-	if singleValue == nil {
-		slog.Warn("falling back to default")
-		singleValue = &defaultSingleValue[0]
-	}
-
-	values := make([]uint64, n)
-	for i := range values {
-		values[i] = *singleValue
-	}
-
-	return values
-}
-
 func (kv KV) Strings(key string, defaultValue ...[]string) []string {
 	r := keyValue(kv, key, &array{})
 	s := make([]string, r.size)
@@ -225,24 +141,16 @@ func (kv KV) OllamaEngineRequired() bool {
 }

 func keyValue[T string | uint32 | uint64 | float32 | *array | bool](kv KV, key string, defaultValue ...T) T {
-	if val, ok := keyValueUntyped(kv, key); ok {
-		return val.(T)
-	}
-
-	slog.Warn("key not found", "key", key, "default", defaultValue[0])
-	return defaultValue[0]
-}
-
-func keyValueUntyped(kv KV, key string) (any, bool) {
 	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
 		key = kv.Architecture() + "." + key
 	}

 	if val, ok := kv[key]; ok {
-		return val, true
+		return val.(T)
 	}

-	return nil, false
+	slog.Warn("key not found", "key", key, "default", defaultValue[0])
+	return defaultValue[0]
 }

 type Tensors struct {
@@ -510,22 +418,12 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {

 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
 	embedding := f.KV().EmbeddingLength()
-	heads := f.KV().HeadCounts()
-	headsKV := f.KV().HeadCountKVs()
+	heads := f.KV().HeadCount()
+	headsKV := f.KV().HeadCountKV()
 	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)

 	embeddingHeads := f.KV().EmbeddingHeadCount()
-	maxEmbeddingHeads, ok := MaxValue(embeddingHeads)
-	if !ok {
-		maxEmbeddingHeads = 1
-		slog.Warn("failed to get max embedding heads")
-	}
 	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
-	maxEmbeddingHeadsK, ok := MaxValue(embeddingHeadsK)
-	if !ok {
-		maxEmbeddingHeadsK = 1
-		slog.Warn("failed to get max embedding headsK")
-	}
 	embeddingHeadsV := f.KV().EmbeddingHeadCountV()

 	layers := f.Tensors().GroupLayers()
@@ -533,30 +431,19 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
 	kv = make([]uint64, f.KV().BlockCount())
 	for i := range kv {
-		kv[i] = uint64(float64(context*(embeddingHeadsK[i]+embeddingHeadsV[i])*headsKV[i]) * bytesPerElement)
-	}
-
-	maxHeads, ok := MaxValue(heads)
-	if !ok {
-		maxHeads = 1
-		slog.Warn("failed to get max heads")
-	}
-	maxHeadsKV, ok := MaxValue(headsKV)
-	if !ok {
-		maxHeadsKV = 1
-		slog.Warn("failed to get max headsKV")
+		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
 	}

 	switch f.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
-			4*batch*(1+4*embedding+context*(1+maxHeads)),
+			4*batch*(1+4*embedding+context*(1+heads)),
 			4*batch*(embedding+vocab),
 		)

 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
-			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*maxHeads+maxEmbeddingHeads*maxHeadsKV),
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)

@@ -564,16 +451,16 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 			// mixtral 8x22b
 			ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
 			partialOffload = max(
-				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+maxHeadsKV+embedding+context+maxEmbeddingHeads*maxHeadsKV),
-				4*(context*batch*maxHeads+context*maxEmbeddingHeads*maxHeadsKV+batch*1024+maxEmbeddingHeads*maxHeadsKV*batch),
+				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
+				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
 			)
 		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
 			// mixtral 8x7b
 			ffnGateWeight1 := ffnGateWeight.Shape[1]
-			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+maxHeads) + 2*maxHeadsKV + ffnGateWeight1)
+			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
 			partialOffload = max(
-				4*batch*(3+maxEmbeddingHeads*maxHeadsKV+embedding+context*(1+maxHeads)+ffnGateWeight1)+(embedding*embedding+3*embedding*maxHeadsKV*ffnGateWeight1)*9/16,
-				4*batch*(1+2*embedding+context*(1+maxHeads))+embedding*(6*context*maxHeadsKV/maxHeads+embedding*9/16),
+				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
+				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
 			)
 		}
 	case "mllama":
@@ -582,7 +469,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 		crossAttentionLayers := f.KV().Uints("attention.cross_attention_layers")
 		for i := range kv {
 			if slices.Contains(crossAttentionLayers, uint32(i)) {
-				kv[i] = headsKV[i] * (embeddingHeadsK[i] + embeddingHeadsV[i]) *
+				kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
 					4 * // sizeof(float32)
 					visionTokens *
 					tiles
@@ -590,7 +477,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 		}

 		fullOffload = max(
-			4*batch*(2+3*embedding+maxEmbeddingHeadsK*maxHeads+context*(1+maxHeads)),
+			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
 			// vocab graph
 			4*batch*(embedding+vocab),
 		)
@@ -604,23 +491,23 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri

 		partialOffload = max(
 			4*(batch*
-				(2*embedding+1+context*(1+maxHeads)+maxEmbeddingHeadsK*maxHeads)+
+				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
 				ropeFreqsCount+
-				maxEmbeddingHeadsK*context*maxHeadsKV),
+				embeddingHeadsK*context*headsKV),
 			// vocab graph
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
 	case "gemma", "gemma2", "gemma3":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
-			4*batch*(2+context+context*maxHeads+2*embedding+2*maxEmbeddingHeadsK*maxHeads),
+			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
 		)

 		partialOffload = max(
 			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
-			4*batch*(2*embedding+1+2*maxEmbeddingHeadsK*maxHeads+context+context*maxHeads)+
-				4*maxEmbeddingHeadsK*context*8+
-				embedding*embedding*maxEmbeddingHeadsK*maxHeads*9/16,
+			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
+				4*embeddingHeadsK*context*8+
+				embedding*embeddingHeadsK*heads*9/16,
 		)

 		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
@@ -632,42 +519,42 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 				// Every 6th layer is a global layer, which is the full context size that has already been set. The other
 				// layers are the smaller local (sliding) layers.
 				if (i+1)%gemma3GlobalCacheCount != 0 {
-					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK[i]+embeddingHeadsV[i])*headsKV[i]) * bytesPerElement)
+					kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
 				}
 			}
 		}
 	case "command-r":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
-			4*batch*(2+4*embedding+context*(1+maxHeads)),
+			4*batch*(2+4*embedding+context*(1+heads)),
 		)

 		partialOffload = max(
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(1+2*embedding+context*(1+maxHeads))+4*embedding*context+embedding*embedding*9/16,
+			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
 		)
 	case "qwen2":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
-			4*batch*(1+2*embedding+context+context*maxHeads),
+			4*batch*(1+2*embedding+context+context*heads),
 		)

 		partialOffload = max(
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-			4*(batch*(1+2*embedding+context*(1+maxHeads))+embedding*(1+context)),
+			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
 		)
 	case "phi2":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
-			4*batch*(1+4*embedding+context+context*maxHeads),
+			4*batch*(1+4*embedding+context+context*heads),
 		)

 		partialOffload = max(
 			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(2+3*embedding+context+context*maxHeads),
+			4*batch*(2+3*embedding+context+context*heads),
 		)
 	case "stablelm":
-		fullOffload = 4 * batch * (context*(1+maxHeads) + 3*embedding + 2)
+		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
 		partialOffload = max(
 			4*batch*(vocab+2*embedding),
 			fullOffload,
@@ -675,12 +562,12 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 	case "deepseek2":
 		fullOffload = max(
 			4*batch*(3*embedding+vocab),
-			4*batch*(3*embedding+2+context*(1+maxHeadsKV)+2*maxEmbeddingHeadsK*maxHeadsKV),
+			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
 		)

 		partialOffload = max(
 			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(2*embedding+1+2*maxEmbeddingHeadsK*maxHeadsKV+context+context*maxHeadsKV)+4*maxEmbeddingHeadsK*context*maxHeadsKV+embedding*embedding*maxEmbeddingHeadsK*maxHeadsKV*9/16,
+			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
 		)
 	case "chatglm":
 		fullOffload = 4 * batch * (embedding + vocab)
@@ -691,8 +578,8 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 				4*batch*(2+
 					2*embedding+
 					context+
-					context*maxHeads+
-					maxEmbeddingHeadsK*maxHeads+
+					context*heads+
+					embeddingHeadsK*heads+
 					qkvBias.Shape[0]),
 			)

@@ -700,11 +587,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 				partialOffload,
 				4*batch*(1+
 					2*embedding+
-					maxEmbeddingHeadsK*maxHeads+
+					embeddingHeadsK*heads+
 					context+
-					context*maxHeads)+
-					4*maxEmbeddingHeadsK*context+
-					4*context*maxEmbeddingHeadsK+
+					context*heads)+
+					4*embeddingHeadsK*context+
+					4*context*embeddingHeadsK+
 					4*qkvBias.Shape[0],
 			)
 		}
@@ -776,15 +663,9 @@ func (f GGML) SupportsFlashAttention() bool {
 	}

 	// Check head counts match and are non-zero
-	headCount := f.KV().HeadCounts()
-	embeddingHeadCountK := f.KV().EmbeddingHeadCountK()
-	embeddingHeadCountV := f.KV().EmbeddingHeadCountV()
-	for i := range headCount {
-		if embeddingHeadCountK[i] != embeddingHeadCountV[i] {
-			return false
-		}
-	}
-	return true
+	headCountK := f.KV().EmbeddingHeadCountK()
+	headCountV := f.KV().EmbeddingHeadCountV()
+	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }

 // kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
@@ -798,54 +679,3 @@ func kvCacheBytesPerElement(cacheType string) float64 {
 		return 2 // f16 (default)
 	}
 }
-
-func AsUint64Array(v []any) ([]uint64, bool) {
-	switch v[0].(type) {
-	case uint32:
-		values := make([]uint64, len(v))
-		for i, v := range v {
-			values[i] = uint64(v.(uint32))
-		}
-		return values, true
-	case uint64:
-		values := make([]uint64, len(v))
-		for i, v := range v {
-			values[i] = v.(uint64)
-		}
-		return values, true
-	case int32:
-		values := make([]uint64, len(v))
-		for i, val := range v {
-			val := val.(int32)
-			if val < 0 {
-				slog.Warn("negative value in int32 array", "value", val)
-				return nil, false
-			}
-			values[i] = uint64(val)
-		}
-		return values, true
-	}
-	return nil, false
-}
-
-func MaxValue(values []uint64) (uint64, bool) {
-	if len(values) == 0 {
-		return 0, false
-	}
-
-	max := values[0]
-	for _, v := range values {
-		if v > max {
-			max = v
-		}
-	}
-	return max, true
-}
-
-func FillArray[T any](value T, n int) []T {
-	values := make([]T, n)
-	for i := range values {
-		values[i] = value
-	}
-	return values
-}
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -149,7 +149,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	if graphPartialOffload == 0 {
-		graphPartialOffload = f.KV().GQAMax() * kvTotal / 6
+		graphPartialOffload = f.KV().GQA() * kvTotal / 6
 	}
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -281,12 +281,9 @@ func TestChatMiddleware(t *testing.T) {
 							Description: "Get the current weather",
 							Parameters: struct {
 								Type       string   `json:"type"`
-								Defs       any      `json:"$defs,omitempty"`
-								Items      any      `json:"items,omitempty"`
 								Required   []string `json:"required"`
 								Properties map[string]struct {
 									Type        api.PropertyType `json:"type"`
-									Items       any              `json:"items,omitempty"`
 									Description string           `json:"description"`
 									Enum        []any            `json:"enum,omitempty"`
 								} `json:"properties"`
@@ -295,7 +292,6 @@ func TestChatMiddleware(t *testing.T) {
 								Required: []string{"location"},
 								Properties: map[string]struct {
 									Type        api.PropertyType `json:"type"`
-									Items       any              `json:"items,omitempty"`
 									Description string           `json:"description"`
 									Enum        []any            `json:"enum,omitempty"`
 								}{
--- a/server/create.go
+++ b/server/create.go
@@ -497,43 +497,37 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 		return nil, err
 	}

-	var offset int64
-	for offset < stat.Size() {
-		f, n, err := ggml.Decode(blob, 0)
-		if errors.Is(err, io.EOF) {
-			break
-		} else if err != nil {
+	f, n, err := ggml.Decode(blob, 0)
+	if err != nil {
+		return nil, err
+	}
+
+	mediatype := "application/vnd.ollama.image.model"
+	if f.KV().Kind() == "adapter" {
+		mediatype = "application/vnd.ollama.image.adapter"
+	} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
+		mediatype = "application/vnd.ollama.image.projector"
+	}
+
+	var layer Layer
+	if digest != "" && n == stat.Size() {
+		layer, err = NewLayerFromLayer(digest, mediatype, blob.Name())
+		if err != nil {
+			slog.Debug("could not create new layer from layer", "error", err)
 			return nil, err
 		}
-
-		mediatype := "application/vnd.ollama.image.model"
-		if f.KV().Kind() == "adapter" {
-			mediatype = "application/vnd.ollama.image.adapter"
-		} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
-			mediatype = "application/vnd.ollama.image.projector"
-		}
-
-		var layer Layer
-		if digest != "" && n == stat.Size() && offset == 0 {
-			layer, err = NewLayerFromLayer(digest, mediatype, blob.Name())
-			if err != nil {
-				slog.Debug("could not create new layer from layer", "error", err)
-				return nil, err
-			}
-		}
-
-		// Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size())
-		if layer.Digest == "" {
-			layer, err = NewLayer(io.NewSectionReader(blob, offset, n), mediatype)
-			if err != nil {
-				return nil, err
-			}
-		}
-
-		layers = append(layers, &layerGGML{layer, f})
-		offset = n
 	}

+	// Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size())
+	if layer.Digest == "" {
+		layer, err = NewLayer(io.NewSectionReader(blob, 0, n), mediatype)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	layers = append(layers, &layerGGML{layer, f})
+
 	return detectChatTemplate(layers)
 }

--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -370,12 +370,9 @@ func TestGenerateChat(t *testing.T) {
 					Description: "Get the current weather",
 					Parameters: struct {
 						Type       string   `json:"type"`
-						Defs       any      `json:"$defs,omitempty"`
-						Items      any      `json:"items,omitempty"`
 						Required   []string `json:"required"`
 						Properties map[string]struct {
 							Type        api.PropertyType `json:"type"`
-							Items       any              `json:"items,omitempty"`
 							Description string           `json:"description"`
 							Enum        []any            `json:"enum,omitempty"`
 						} `json:"properties"`
@@ -384,7 +381,6 @@ func TestGenerateChat(t *testing.T) {
 						Required: []string{"location"},
 						Properties: map[string]struct {
 							Type        api.PropertyType `json:"type"`
-							Items       any              `json:"items,omitempty"`
 							Description string           `json:"description"`
 							Enum        []any            `json:"enum,omitempty"`
 						}{
@@ -471,12 +467,9 @@ func TestGenerateChat(t *testing.T) {
 					Description: "Get the current weather",
 					Parameters: struct {
 						Type       string   `json:"type"`
-						Defs       any      `json:"$defs,omitempty"`
-						Items      any      `json:"items,omitempty"`
 						Required   []string `json:"required"`
 						Properties map[string]struct {
 							Type        api.PropertyType `json:"type"`
-							Items       any              `json:"items,omitempty"`
 							Description string           `json:"description"`
 							Enum        []any            `json:"enum,omitempty"`
 						} `json:"properties"`
@@ -485,7 +478,6 @@ func TestGenerateChat(t *testing.T) {
 						Required: []string{"location"},
 						Properties: map[string]struct {
 							Type        api.PropertyType `json:"type"`
-							Items       any              `json:"items,omitempty"`
 							Description string           `json:"description"`
 							Enum        []any            `json:"enum,omitempty"`
 						}{
--- a/server/sched.go
+++ b/server/sched.go
@@ -667,19 +667,13 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any {
 	return finished
 }

-type ByDurationAndName []*runnerRef
+type ByDuration []*runnerRef

-func (a ByDurationAndName) Len() int      { return len(a) }
-func (a ByDurationAndName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
-func (a ByDurationAndName) Less(i, j int) bool {
-	// Primary sort by session duration (uint64 to handle negatives)
-	d1 := uint64(a[i].sessionDuration)
-	d2 := uint64(a[j].sessionDuration)
-	if d1 != d2 {
-		return d1 < d2
-	}
-	// Secondary sort by model path lex order
-	return a[i].modelPath < a[j].modelPath
+func (a ByDuration) Len() int      { return len(a) }
+func (a ByDuration) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ByDuration) Less(i, j int) bool {
+	// uint64 to turn negative time (never unload) to largest
+	return uint64(a[i].sessionDuration) < uint64(a[j].sessionDuration)
 }

 // TODO - future consideration to pick runners based on size
@@ -781,7 +775,7 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {

 	// In the future we can enhance the algorithm to be smarter about picking the optimal runner to unload
 	// e.g., if we have multiple options, will one make room for the request?
-	sort.Sort(ByDurationAndName(runnerList))
+	sort.Sort(ByDuration(runnerList))

 	// First try to find a runner that's already idle
 	for _, runner := range runnerList {