add tests, organize, comments

tools package and utils
renaming and splitting stuff up
2025-05-13 17:44:47 -07:00 · 2025-05-13 17:44:45 -07:00 · 2025-05-13 17:43:15 -07:00 · 2025-05-13 17:43:15 -07:00 · 2025-05-13 17:43:15 -07:00 · 2025-05-13 17:43:15 -07:00
45 changed files with 1913 additions and 1948 deletions
--- a/convert/convert.go
+++ b/convert/convert.go
@ -191,8 +191,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
 		conv = &qwen2Model{}
 	case "Qwen2_5_VLForConditionalGeneration":
 		conv = &qwen25VLModel{}
 	case "BertModel":
 		conv = &bertModel{}
 	case "CohereForCausalLM":
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@ -15,7 +15,6 @@ type qwen2Model struct {
 		Type                          string     `json:"type"`
 		Factor                        ropeFactor `json:"factor"`
 		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
 		MropeSection                  []int32    `json:"mrope_section"`
 	} `json:"rope_scaling"`
 	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@ -40,8 +39,6 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	case "yarn":
 		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
 		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
 	case "mrope", "default":
 		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
 	default:
 		panic("unknown rope scaling type")
 	}
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@ -1,102 +0,0 @@
 package convert
 import (
 	"cmp"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 )
 // qwen25VLModel is the converter state for Qwen2.5-VL checkpoints. It embeds
 // qwen2Model for the text-model settings and adds the vision settings decoded
 // from the "vision_config" JSON object.
 type qwen25VLModel struct {
 	qwen2Model
 	// VisionModel mirrors "vision_config". Fields absent from the JSON keep
 	// their zero value; KV substitutes defaults for several of them.
 	VisionModel struct {
 		Depth               uint32  `json:"depth"`
 		HiddenSize          uint32  `json:"hidden_size"`
 		NumHeads            uint32  `json:"num_heads"`
 		InChannels          uint32  `json:"in_chans"`
 		PatchSize           uint32  `json:"patch_size"`
 		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
 		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
 		WindowSize          uint32  `json:"window_size"`
 		RMSNormEps          float32 `json:"layer_norm_epsilon"`
 		RopeTheta           float32 `json:"rope_theta"`
 		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
 		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
 	} `json:"vision_config"`
 }
 var _ ModelConverter = (*qwen25VLModel)(nil)
 // KV returns the ggml.KV metadata for a Qwen2.5-VL model.
 //
 // It starts from the shared ModelParameters keys, sets the architecture to
 // "qwen25vl", copies every "qwen2."-prefixed key produced by the embedded
 // qwen2Model under the "qwen25vl." prefix, and then adds the vision keys.
 // Vision settings that are zero in the source config fall back to the
 // defaults given below (cmp.Or returns its first non-zero argument).
 func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen25vl"
 	// Re-home the text-model keys under this architecture's prefix; keys
 	// without the "qwen2." prefix are intentionally not copied.
 	for k, v := range q.qwen2Model.KV(t) {
 		if strings.HasPrefix(k, "qwen2.") {
 			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
 		}
 	}
 	// Resolve the full-attention block list once. Previously the default was
 	// written first and then unconditionally overwritten by the (nil) config
 	// value, so the fallback never reached the output.
 	fullAttentionBlocks := q.VisionModel.FullAttentionBlocks
 	if fullAttentionBlocks == nil {
 		fullAttentionBlocks = []int32{7, 15, 23, 31}
 	}
 	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
 	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
 	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
 	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
 	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
 	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
 	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
 	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
 	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
 	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
 	kv["qwen25vl.vision.fullatt_block_indexes"] = fullAttentionBlocks
 	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
 	return kv
 }
 // Tensors converts the source tensors into ggml tensors.
 //
 // Tensors whose name contains "patch_embed.proj" are split in two along
 // dimension 2 (renamed to patch_embd_0 / patch_embd_1) and have every
 // size-1 dimension dropped from their shape. Tensors whose name contains
 // "attn.qkv" are split in three along dimension 0 into attn_q, attn_k and
 // attn_v. Everything else is passed through unchanged.
 func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	for _, src := range ts {
 		switch {
 		case strings.Contains(src.Name(), "patch_embed.proj"):
 			halves := splitDim(src, 2,
 				strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
 				strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
 			)
 			for part := range halves {
 				// drop the singleton dimensions left over after the split
 				part.Shape = slices.DeleteFunc(part.Shape, func(d uint64) bool { return d == 1 })
 				out = append(out, part)
 			}
 		case strings.Contains(src.Name(), "attn.qkv"):
 			thirds := splitDim(src, 0,
 				strings.NewReplacer("attn.qkv", "attn_q"),
 				strings.NewReplacer("attn.qkv", "attn_k"),
 				strings.NewReplacer("attn.qkv", "attn_v"),
 			)
 			for part := range thirds {
 				out = append(out, part)
 			}
 		default:
 			passthrough := &ggml.Tensor{
 				Name:     src.Name(),
 				Kind:     src.Kind(),
 				Shape:    src.Shape(),
 				WriterTo: src,
 			}
 			out = append(out, passthrough)
 		}
 	}
 	return out
 }
 // Replacements returns the name-substitution list of the embedded qwen2Model
 // followed by the vision-specific entries below. The entries are listed as
 // consecutive from/to strings (presumably consumed as replacer pairs; verify
 // against the caller).
 func (q *qwen25VLModel) Replacements() []string {
 	return append(
 		q.qwen2Model.Replacements(),
 		"visual", "v",
 		"blocks", "blk",
 		"attn.proj", "attn_out",
 		"norm1", "ln1",
 		"norm2", "ln2",
 	)
 }
--- a/convert/tensor.go
+++ b/convert/tensor.go
@ -1,56 +0,0 @@
 package convert
 import (
 	"iter"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 )
 // splitDim returns an iterator that splits t along dimension dim into
 // len(replacers) tensors, one per replacer. Each yielded tensor is named by
 // applying its replacer to t.Name() and keeps t's kind.
 //
 // The size of dim is divided by len(replacers) using integer division, so any
 // remainder is silently dropped; callers are expected to pass a dimension that
 // divides evenly. The data itself is not touched here: each part wraps a clone
 // of t whose repacker extracts the matching slice when the tensor is written.
 func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
 	return func(yield func(*ggml.Tensor) bool) {
 		for i, replacer := range replacers {
 			// shape of one part: same as t except dim is divided by the part count
 			shape := slices.Clone(t.Shape())
 			shape[dim] = shape[dim] / uint64(len(replacers))
 			// nil entries leave a dimension unsliced; only dim gets the i-th range
 			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
 			slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
 			tt := t.Clone()
 			// NOTE: the repacker's shape parameter shadows the outer shape; dims is
 			// built from the argument (presumably the unsplit tensor's shape — confirm
 			// against SetRepacker's contract), while slice is captured from this iteration.
 			tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
 				dims := make([]int, len(shape))
 				for i := range shape {
 					dims[i] = int(shape[i])
 				}
 				var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 				t, err := t.Slice(slice...)
 				if err != nil {
 					return nil, err
 				}
 				// materialize the sliced result before reshaping it
 				t = tensor.Materialize(t)
 				// flatten tensor so it can be written as a vector
 				if err := t.Reshape(t.Shape().TotalSize()); err != nil {
 					return nil, err
 				}
 				return native.VectorF32(t.(*tensor.Dense))
 			})
 			// stop producing parts as soon as the consumer stops iterating
 			if !yield(&ggml.Tensor{
 				Name:     replacer.Replace(t.Name()),
 				Kind:     t.Kind(),
 				Shape:    shape,
 				WriterTo: tt,
 			}) {
 				break
 			}
 		}
 	}
 }
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@ -6,7 +6,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
 	"math"
 	"slices"
 	"strings"
@ -127,7 +126,6 @@ func (kv KV) OllamaEngineRequired() bool {
 		"mistral3",
 		"llama4",
 		"mllama",
 		"qwen25vl",
 	}, kv.Architecture())
 }
@ -651,29 +649,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 		graphSize = 4 * (imageSize*imageSize*numChannels +
 			embeddingLength*patchSize +
 			numPatches*numPatches*headCount)
 	case "qwen25vl":
 		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
 		mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
 		temporalPatchSize := uint64(2)
 		// Calculate max possible patches based on max_pixels
 		maxHeight := uint64(math.Sqrt(float64(maxPixels)))
 		maxWidth := maxPixels / maxHeight
 		maxGridHeight := maxHeight / patchSize
 		maxGridWidth := maxWidth / patchSize
 		// Account for merged patches (2x2 grid)
 		numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
 		// Calculate graph size based on typical operations in ProcessImage and createPatches
 		graphSize = 4 * (maxPixels*numChannels + // Original image storage
 			// Normalized pixels
 			maxPixels*numChannels +
 			// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
 			numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
 			// Self-attention calculations (similar to other architectures)
 			numPatches*numPatches*headCount +
 			// Additional buffer for processing
 			embeddingLength*numPatches)
 	case "llama4":
 		// vision graph is computed independently in the same schedule
 		// and is negligible compared to the worst case text graph
--- a/go.mod
+++ b/go.mod
@ -19,6 +19,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
 	github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1
 	github.com/google/go-cmp v0.6.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
--- a/go.sum
+++ b/go.sum
@ -69,6 +69,8 @@ github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3
 github.com/go-fonts/liberation v0.1.1/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2HYqyqAO9z7GY=
 github.com/go-fonts/stix v0.1.0/go.mod h1:w/c1f0ldAUlJmLBvlbkvVXLAD+tAMqobIIQpmnUIzUY=
 github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
 github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1 h1:+VexzzkMLb1tnvpuQdGT/DicIRW7MN8ozsXqBMgp0Hk=
 github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1/go.mod h1:TiCD2a1pcmjd7YnhGH0f/zKNcCD06B029pHhzV23c2M=
 github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U=
 github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
 github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
--- a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
+++ b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
@ -1,277 +0,0 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Michael Yang <git@mxy.ng>
 Date: Thu, 1 May 2025 13:45:12 -0700
 Subject: [PATCH] add argsort and cuda copy for i32
 ---
 ggml/src/ggml-cpu/ops.cpp     |  43 ++++++++++++++
 ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++-
 ggml/src/ggml-cuda/cpy.cu     |  49 ++++++++++++++++
 3 files changed, 192 insertions(+), 2 deletions(-)
 diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
 index becdae07..7a44b6cf 100644
 --- a/ggml/src/ggml-cpu/ops.cpp
 +++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
     }
 }
 +static void ggml_compute_forward_argsort_i32(
 +    const ggml_compute_params * params,
 +    ggml_tensor * dst) {
 +
 +    const ggml_tensor * src0 = dst->src[0];
 +
 +    GGML_TENSOR_UNARY_OP_LOCALS
 +
 +    GGML_ASSERT(nb0 == sizeof(int32_t));
 +
 +    const int ith = params->ith;
 +    const int nth = params->nth;
 +
 +    const int64_t nr = ggml_nrows(src0);
 +
 +    ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
 +
 +    for (int64_t i = ith; i < nr; i += nth) {
 +        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
 +        const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
 +
 +        for (int64_t j = 0; j < ne0; j++) {
 +            dst_data[j] = j;
 +        }
 +
 +        // C doesn't have a functional sort, so we do a bubble sort instead
 +        for (int64_t j = 0; j < ne0; j++) {
 +            for (int64_t k = j + 1; k < ne0; k++) {
 +                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
 +                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
 +                    int32_t tmp = dst_data[j];
 +                    dst_data[j] = dst_data[k];
 +                    dst_data[k] = tmp;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 void ggml_compute_forward_argsort(
     const ggml_compute_params * params,
     ggml_tensor * dst) {
@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
             {
                 ggml_compute_forward_argsort_f32(params, dst);
             } break;
 +        case GGML_TYPE_I32:
 +            {
 +                ggml_compute_forward_argsort_i32(params, dst);
 +            } break;
         default:
             {
                 GGML_ABORT("fatal error");
 diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
 index 607ded85..53b02634 100644
 --- a/ggml/src/ggml-cuda/argsort.cu
 +++ b/ggml/src/ggml-cuda/argsort.cu
@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
     }
 }
 +
 +template<ggml_sort_order order>
 +static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
 +    extern __shared__ int shared_mem[];
 +    int * indices = shared_mem;
 +
 +    const int tid = threadIdx.x;
 +    const int row = blockIdx.y;
 +
 +    // Initialize all indices, handling the case where threads < ncols_pad
 +    for (int i = tid; i < ncols_pad; i += blockDim.x) {
 +        indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
 +    }
 +    __syncthreads();
 +
 +    // Bitonic sort
 +    for (int k = 2; k <= ncols_pad; k *= 2) {
 +        for (int j = k/2; j > 0; j /= 2) {
 +            for (int i = tid; i < ncols_pad; i += blockDim.x) {
 +                const int ij = i ^ j;
 +                if (ij > i) {
 +                    // Only compare values within the actual data range
 +                    if (i < ncols && ij < ncols) {
 +                        if ((i & k) == 0) {
 +                            if (order == GGML_SORT_ORDER_ASC) {
 +                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
 +                                    int tmp = indices[i];
 +                                    indices[i] = indices[ij];
 +                                    indices[ij] = tmp;
 +                                }
 +                            } else {
 +                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
 +                                    int tmp = indices[i];
 +                                    indices[i] = indices[ij];
 +                                    indices[ij] = tmp;
 +                                }
 +                            }
 +                        } else {
 +                            if (order == GGML_SORT_ORDER_ASC) {
 +                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
 +                                    int tmp = indices[i];
 +                                    indices[i] = indices[ij];
 +                                    indices[ij] = tmp;
 +                                }
 +                            } else {
 +                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
 +                                    int tmp = indices[i];
 +                                    indices[i] = indices[ij];
 +                                    indices[ij] = tmp;
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            __syncthreads();
 +        }
 +    }
 +
 +    // Write sorted indices to output, only threads handling valid data
 +    for (int i = tid; i < ncols; i += blockDim.x) {
 +        dst[row * ncols + i] = indices[i];
 +    }
 +}
 +
 +static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
 +    // Bitonic sort requires ncols to be power of 2
 +    const int ncols_pad = next_power_of_2(ncols);
 +
 +    // Ensure thread count doesn't exceed maximum (typically 1024)
 +    const int max_threads = 1024;  // This is the typical max for most GPUs
 +    const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
 +
 +    const dim3 block_dims(threads_per_block, 1, 1);
 +    const dim3 block_nums(1, nrows, 1);
 +    const size_t shared_mem = ncols_pad * sizeof(int);
 +
 +    // Check if shared memory size is within limits
 +    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
 +
 +    // Instead of logging an error, use GGML_ASSERT with a descriptive message
 +    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
 +
 +    // Launch kernels with the updated thread configuration
 +    if (order == GGML_SORT_ORDER_ASC) {
 +        k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
 +    } else if (order == GGML_SORT_ORDER_DESC) {
 +        k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
 +    } else {
 +        GGML_ABORT("fatal error");
 +    }
 +}
 +
 +
 void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 -    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 +    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
     GGML_ASSERT( dst->type == GGML_TYPE_I32);
     GGML_ASSERT(ggml_is_contiguous(src0));
@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
 -    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
 +    if (src0->type == GGML_TYPE_I32) {
 +        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
 +    } else {
 +        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
 +    }
 }
 diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
 index 2d46176e..47383486 100644
 --- a/ggml/src/ggml-cuda/cpy.cu
 +++ b/ggml/src/ggml-cuda/cpy.cu
@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
     *dsti = *xi;
 }
 +static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
 +    const int32_t * xi = (const int32_t *) cxi;
 +    int32_t * dsti = (int32_t *) cdsti;
 +
 +    *dsti = *xi;
 +}
 +
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 +// First, add this template function after the other template functions
 +template <cpy_kernel_t cpy_1>
 +static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
 +                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 +                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
 +                                 const int nb12, const int nb13) {
 +    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
 +
 +    if (i >= ne) {
 +        return;
 +    }
 +
 +    const int64_t i03 = i/(ne00 * ne01 * ne02);
 +    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
 +    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
 +    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
 +    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
 +
 +    const int64_t i13 = i/(ne10 * ne11 * ne12);
 +    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
 +    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
 +    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
 +    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
 +
 +    cpy_1(cx + x_offset, cdst + dst_offset);
 +}
 +
 +// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
 +static void ggml_cpy_i32_i32_cuda(
 +    const char * cx, char * cdst, const int ne,
 +    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 +    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
 +
 +    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
 +    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
 +        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 +}
 +
 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_q8_0 * dsti = (block_q8_0 *) cdsti;
@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
 +    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
 +        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else {
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
 +    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
 +        return (void*) cpy_i32_i32<cpy_1_i32_i32>;
     } else {
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
--- a/ml/backend.go
+++ b/ml/backend.go
@ -119,21 +119,6 @@ type Context interface {
 	Layer(int) Context
 }
 // RopeOptions contains optional parameters for the RoPE function.
 type RopeOptions struct {
 	// OriginalContextLen is the context length handed to the backend RoPE
 	// implementation. Implementations choose their own default when no
 	// option overrides it.
 	OriginalContextLen uint32
 }
 // RopeOption is a functional option that modifies a RopeOptions value in place.
 type RopeOption func(*RopeOptions)
 // WithContextLen returns a RopeOption that sets OriginalContextLen to n.
 // The parameter is named n rather than len so it does not shadow the builtin.
 func WithContextLen(n uint32) RopeOption {
 	return func(opts *RopeOptions) {
 		opts.OriginalContextLen = n
 	}
 }
 type Tensor interface {
 	Dim(n int) int
 	Stride(n int) int
@ -159,7 +144,7 @@ type Tensor interface {
 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
 	Sin(ctx Context) Tensor
@ -187,7 +172,6 @@ type Tensor interface {
 	Duplicate(ctx Context) Tensor
 	TopK(ctx Context, k int) Tensor
 	Argsort(ctx Context) Tensor
 }
 // ScaledDotProductAttention implements a fused attention
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@ -1060,17 +1060,7 @@ const (
 	ropeTypeVision C.int = 24
 )
-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
 	// Default options
 	opts := &ml.RopeOptions{
 		OriginalContextLen: 131072,
 	}
 	// Apply any provided options
 	for _, option := range options {
 		option(opts)
 	}
 	if ropeFactors == nil {
 		ropeFactors = &Tensor{b: t.b}
 	}
@ -1083,19 +1073,16 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 	return &Tensor{
 		b: t.b,
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx,
+			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
 			dequant,
 			positionIDs.(*Tensor).t,
 			ropeFactors.(*Tensor).t,
 			C.int(ropeDim),
 			C.int(ropeType),
-			C.int(opts.OriginalContextLen),
+			131072, // YaRN n_ctx_train
 			C.float(ropeBase),
 			C.float(ropeScale),
-			C.float(0.0),
+			0.,  // YaRN ext_factor
-			C.float(1.0),
+			1.,  // YaRN attn_factor
-			C.float(32.0),
+			32., // YaRN beta_fast
-			C.float(1.0),
+			1.,  // YaRN beta_slow
 		),
 	}
 }
@ -1189,10 +1176,3 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
 		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
 	}
 }
 // Argsort wraps ggml_argsort with ascending sort order and returns the
 // resulting tensor, bound to the same backend as t.
 func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
 	sorted := C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC)
 	return &Tensor{b: t.b, t: sorted}
 }
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
@ -6822,45 +6822,6 @@ static void ggml_compute_forward_argsort_f32(
    }
 }
 // Argsort for int32 sources: for every row of src0, writes into dst the
 // element indices ordered by the row's values (ascending or descending
 // according to op param 0). Rows are strided across workers: this call
 // handles rows ith, ith + nth, ith + 2*nth, ...
 static void ggml_compute_forward_argsort_i32(
     const ggml_compute_params * params,
     ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     GGML_TENSOR_UNARY_OP_LOCALS
     // destination elements must be tightly packed int32 values
     GGML_ASSERT(nb0 == sizeof(int32_t));
     const int ith = params->ith;
     const int nth = params->nth;
     const int64_t nr = ggml_nrows(src0);
     ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
     for (int64_t i = ith; i < nr; i += nth) {
         int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
         const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
         // start from the identity permutation (int64_t j is narrowed to int32_t here)
         for (int64_t j = 0; j < ne0; j++) {
             dst_data[j] = j;
         }
         // simple O(ne0^2) exchange sort on the index array: position j is compared
         // against every later position k and swapped when the pair is out of order;
         // ordering among equal values is not guaranteed to be preserved
         for (int64_t j = 0; j < ne0; j++) {
             for (int64_t k = j + 1; k < ne0; k++) {
                 if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
                     (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
                     int32_t tmp = dst_data[j];
                     dst_data[j] = dst_data[k];
                     dst_data[k] = tmp;
                 }
             }
         }
     }
 }
 void ggml_compute_forward_argsort(
    const ggml_compute_params * params,
    ggml_tensor * dst) {
@ -6872,10 +6833,6 @@ void ggml_compute_forward_argsort(
            {
                ggml_compute_forward_argsort_f32(params, dst);
            } break;
        case GGML_TYPE_I32:
            {
                ggml_compute_forward_argsort_i32(params, dst);
            } break;
        default:
            {
                GGML_ABORT("fatal error");
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
@ -85,107 +85,13 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
    }
 }
 // Per-row argsort kernel for int32 input. Each block handles the row selected
 // by blockIdx.y: it keeps an index array of ncols_pad entries in dynamically
 // sized shared memory, permutes it with a bitonic compare-exchange network
 // keyed on x, and writes the first ncols indices to dst.
 //
 // NOTE(review): pairs that touch a padded slot (i or ij >= ncols) are skipped
 // entirely rather than treating padding as an extreme value. Confirm that this
 // still produces a fully sorted result when ncols is not a power of two.
 template<ggml_sort_order order>
 static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
     extern __shared__ int shared_mem[];
     int * indices = shared_mem;
     const int tid = threadIdx.x;
     const int row = blockIdx.y;
     // Initialize all indices, handling the case where threads < ncols_pad
     for (int i = tid; i < ncols_pad; i += blockDim.x) {
         indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
     }
     __syncthreads();
     // Bitonic sort
     // k is the size of the sequences being merged, j the compare distance
     for (int k = 2; k <= ncols_pad; k *= 2) {
         for (int j = k/2; j > 0; j /= 2) {
             for (int i = tid; i < ncols_pad; i += blockDim.x) {
                 const int ij = i ^ j;
                 // each pair (i, ij) is handled once, by its lower position
                 if (ij > i) {
                     // Only compare values within the actual data range
                     if (i < ncols && ij < ncols) {
                         // (i & k) selects the direction of this sub-sequence
                         if ((i & k) == 0) {
                             if (order == GGML_SORT_ORDER_ASC) {
                                 if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
                                     int tmp = indices[i];
                                     indices[i] = indices[ij];
                                     indices[ij] = tmp;
                                 }
                             } else {
                                 if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
                                     int tmp = indices[i];
                                     indices[i] = indices[ij];
                                     indices[ij] = tmp;
                                 }
                             }
                         } else {
                             if (order == GGML_SORT_ORDER_ASC) {
                                 if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
                                     int tmp = indices[i];
                                     indices[i] = indices[ij];
                                     indices[ij] = tmp;
                                 }
                             } else {
                                 if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
                                     int tmp = indices[i];
                                     indices[i] = indices[ij];
                                     indices[ij] = tmp;
                                 }
                             }
                         }
                     }
                 }
             }
             // all exchanges of this step must be visible before the next step reads them
             __syncthreads();
         }
     }
     // Write sorted indices to output, only threads handling valid data
     for (int i = tid; i < ncols; i += blockDim.x) {
         dst[row * ncols + i] = indices[i];
     }
 }
 // Host-side launcher for k_argsort_i32_i32. Launches one block per row
 // (grid = 1 x nrows) with min(ncols_pad, 1024) threads and
 // ncols_pad * sizeof(int) bytes of dynamic shared memory, on the given stream.
 // Aborts if the shared-memory requirement exceeds the current device's smpb
 // value or if order is neither ASC nor DESC.
 static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
     // Bitonic sort requires ncols to be power of 2
     const int ncols_pad = next_power_of_2(ncols);
     // Ensure thread count doesn't exceed maximum (typically 1024)
     const int max_threads = 1024;  // This is the typical max for most GPUs
     const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
     const dim3 block_dims(threads_per_block, 1, 1);
     const dim3 block_nums(1, nrows, 1);
     // one int index per (padded) column
     const size_t shared_mem = ncols_pad * sizeof(int);
     // Check if shared memory size is within limits
     const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
     // Instead of logging an error, use GGML_ASSERT with a descriptive message
     GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
     // Launch kernels with the updated thread configuration
     // the sort order is a template parameter, so dispatch on it here
     if (order == GGML_SORT_ORDER_ASC) {
         k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
     } else if (order == GGML_SORT_ORDER_DESC) {
         k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
     } else {
         GGML_ABORT("fatal error");
     }
 }
 void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));
@ -194,9 +100,5 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-    if (src0->type == GGML_TYPE_I32) {
+    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
    } else {
        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
    }
 }
--- a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
@ -38,13 +38,6 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
    *dsti = *xi;
 }
 // Copies one int32 element from cxi to cdsti.
 static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
    *((int32_t *) cdsti) = *((const int32_t *) cxi);
 }
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@ -75,44 +68,6 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
    cpy_1(cx + x_offset, cdst + dst_offset);
 }
 // Element-wise copy kernel for int32 tensors. Each thread handles one of the
 // ne elements: the flat index i is decomposed into 4D source indices using
 // ne00..ne02 and into 4D destination indices using ne10..ne12, the byte
 // offsets are formed from the nb0x / nb1x strides, and cpy_1 copies the element.
 // NOTE(review): the ne00*ne01*ne02 style products are evaluated in int before
 // being combined with the int64 index - confirm they cannot overflow for the
 // tensor sizes used.
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                 const int nb12, const int nb13) {
    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
    // threads past the last element have nothing to do
    if (i >= ne) {
        return;
    }
    // source indices and byte offset
    const int64_t i03 = i/(ne00 * ne01 * ne02);
    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
    // destination indices and byte offset
    const int64_t i13 = i/(ne10 * ne11 * ne12);
    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
    cpy_1(cx + x_offset, cdst + dst_offset);
 }
 // Host-side launcher for the int32 -> int32 copy: runs
 // cpy_i32_i32<cpy_1_i32_i32> on the given stream with enough blocks of
 // CUDA_CPY_BLOCK_SIZE threads to cover all ne elements.
 // cdst_indirect and graph_cpynode_index are accepted but not used here.
 static void ggml_cpy_i32_i32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
    // round up so a partial final block is still launched
    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q8_0 * dsti = (block_q8_0 *) cdsti;
@ -678,8 +633,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
@ -735,8 +688,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
        return (void*) cpy_i32_i32<cpy_1_i32_i32>;
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
--- a/model/models/models.go
+++ b/model/models/models.go
@ -7,5 +7,4 @@ import (
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
 )
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@ -1,187 +0,0 @@
 package qwen25vl
 import (
 	"bytes"
 	"fmt"
 	"image"
 	"slices"
 	"sync"
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )
 // Model is the Qwen 2.5 VL model. It embeds the byte-pair tokenizer, the text
 // model, the vision model and the image processor.
 type Model struct {
 	model.Base
 	model.BytePairEncoding
 	*TextModel
 	*VisionModel `gguf:"v,vision"`
 	ImageProcessor
 }
 // Compile-time assertion that *Model implements model.MultimodalProcessor.
 var _ model.MultimodalProcessor = (*Model)(nil)
 // New builds a Model from the given config: the tokenizer vocabulary and
 // pretokenizer pattern are read from the tokenizer.ggml.* keys, the text
 // model, vision model and image processor are created from the same config,
 // and a causal KV cache is attached using the text model's Shift function.
 func New(c fs.Config) (model.Model, error) {
 	m := &Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
 				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
 				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
 				// EOT is populated from the same keys as EOS
 				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		TextModel:      NewTextModel(c),
 		VisionModel:    newVisionModel(c),
 		ImageProcessor: newImageProcessor(c),
 	}
 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
 	return m, nil
 }
 // PixelValues decodes the raw image bytes, runs the image through the image
 // processor and returns the processed values as a tensor together with the
 // grid reported by the processor.
 func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
 	img, _, err := image.Decode(bytes.NewReader(multimodalData))
 	if err != nil {
 		return nil, nil, err
 	}
 	values, grid, err := m.ImageProcessor.ProcessImage(img)
 	if err != nil {
 		return nil, nil, err
 	}
 	// Tensor shape: values per patch by number of patches
 	proc := &m.ImageProcessor
 	patchDim := proc.numChannels * proc.temporalPatchSize * proc.patchSize * proc.patchSize
 	numPatches := grid.Temporal * grid.Height * grid.Width
 	tensor, err := ctx.Input().FromFloatSlice(values, patchDim, numPatches)
 	if err != nil {
 		return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err)
 	}
 	return tensor, grid, nil
 }
 // EncodeMultimodal converts raw image bytes into vision model outputs. It
 // returns model.ErrNoVisionModel when the vision model has no layers;
 // otherwise the result is a *chunks value wrapping the output tensor.
 func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
 	pixelValues, grid, err := m.PixelValues(ctx, multimodalData)
 	if err != nil {
 		return nil, err
 	}
 	return &chunks{
 		Model:  m,
 		Tensor: m.VisionModel.Forward(ctx, pixelValues, grid),
 	}, nil
 }
 // chunks holds the vision output tensor for one image together with the model
 // that produced it. data caches the tensor's float values; dataOnce guards
 // the one-time read-back performed in chunk.floats.
 type chunks struct {
 	*Model
 	ml.Tensor
 	dataOnce sync.Once
 	data     []float32
 }
 // chunk is a window onto a chunks value: n entries starting at entry s, where
 // each entry spans Dim(0) floats of the underlying tensor (see floats).
 type chunk struct {
 	*chunks
 	s, n int
 }
 // floats returns the float32 values covered by this chunk. The backing tensor
 // is computed and read back only once (guarded by dataOnce); the cached slice
 // lives on the embedded chunks value, so chunks sharing it reuse the result.
 func (r *chunk) floats() []float32 {
 	r.dataOnce.Do(func() {
 		// compute the tensor in a temporary context that is closed afterwards
 		temp := r.Backend().NewContext()
 		defer temp.Close()
 		temp.Forward(r.Tensor).Compute(r.Tensor)
 		r.data = r.Floats()
 	})
 	// n entries of Dim(0) floats each, starting at entry s
 	return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
 }
 // PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass. Inputs
 // without multimodal data pass through unchanged. Each multimodal input is
 // expanded into the encoded " Picture N: " prefix, a vision start token,
 // patchesPerChunk image tokens (the first one carrying the multimodal data)
 // and a vision end token.
 //
 // It returns an error if the prefix cannot be encoded, if an input's
 // multimodal payload is not a *chunks value, or if the payload reports fewer
 // than one patch.
 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	var (
 		imageToken       int32 = 151655
 		visionStartToken int32 = 151652
 		visionEndToken   int32 = 151653
 	)
 	nImg := 0
 	for _, inp := range inputs {
 		if inp.Multimodal == nil {
 			// If not a multimodal input, add it to the result unchanged
 			result = append(result, inp)
 			continue
 		}
 		// Validate the payload first so a bad input yields an error instead of a panic
 		chunksData, ok := inp.Multimodal.(*chunks)
 		if !ok {
 			return nil, fmt.Errorf("unexpected multimodal input type %T", inp.Multimodal)
 		}
 		patchesPerChunk := chunksData.Dim(1)
 		if patchesPerChunk < 1 {
 			// slices.Repeat below panics when given a negative count
 			return nil, fmt.Errorf("image input has no patches (got %d)", patchesPerChunk)
 		}
 		// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
 		// the image tokens with a prompt, so we add a prefix here
 		nImg++
 		pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
 		if err != nil {
 			return nil, fmt.Errorf("failed to encode image prompt: %w", err)
 		}
 		for i := range pre {
 			result = append(result, input.Input{Token: pre[i]})
 		}
 		// First add the vision start token
 		result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2})
 		// Add the image token with the multimodal tensor data at the first position,
 		// as a chunk covering all patches
 		result = append(result, input.Input{
 			Token:          imageToken,
 			Multimodal:     &chunk{chunks: chunksData, s: 0, n: patchesPerChunk},
 			MultimodalHash: inp.MultimodalHash,
 			SameBatch:      patchesPerChunk,
 		})
 		// Add the placeholder tokens for the remaining positions (patchesPerChunk-1)
 		result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
 		result = append(result, input.Input{Token: visionEndToken})
 	}
 	return result, nil
 }
 // Forward converts the batch's positions and output indices into input
 // tensors and runs the text model over the batch using the model's cache.
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	positionIDs, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
 	if err != nil {
 		return nil, err
 	}
 	outputIDs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
 	if err != nil {
 		return nil, err
 	}
 	return m.TextModel.Forward(ctx, batch.Inputs, positionIDs, outputIDs, batch, m.Cache)
 }
 // init registers the constructor under the "qwen25vl" name.
 func init() {
 	model.Register("qwen25vl", New)
 }
--- a/model/models/qwen25vl/model_text.go
+++ b/model/models/qwen25vl/model_text.go
@ -1,155 +0,0 @@
 package qwen25vl
 import (
 	"math"
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 	"github.com/ollama/ollama/model/input"
 )
 // TextOptions holds the text model hyperparameters; NewTextModel fills them
 // from the model config.
 type TextOptions struct {
 	ctxLen, hiddenSize, numHeads, numKVHeads int
 	eps, ropeBase, ropeScale                 float32
 	ropeDim, defaultContextLen               uint32
 }
 // TextModel is the text decoder: token embedding, a stack of transformer
 // layers, a final RMS norm and the output projection. The gguf tag on Output
 // names token_embd as its alternative.
 type TextModel struct {
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
 	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
 	*TextOptions
 }
 // NewTextModel creates a TextModel whose layer count and options are read
 // from the given config.
 func NewTextModel(c fs.Config) *TextModel {
 	opts := &TextOptions{
 		ctxLen:            int(c.Uint("context_length")),
 		hiddenSize:        int(c.Uint("embedding_length")),
 		numHeads:          int(c.Uint("attention.head_count")),
 		numKVHeads:        int(c.Uint("attention.head_count_kv")),
 		eps:               c.Float("attention.layer_norm_rms_epsilon"),
 		ropeBase:          c.Float("rope.freq_base"),
 		ropeScale:         c.Float("rope.freq_scale", 1),
 		ropeDim:           c.Uint("rope.dimension_count", 128),
 		defaultContextLen: c.Uint("context_length", 128000),
 	}
 	return &TextModel{
 		Layers:      make([]Layer, c.Uint("block_count")),
 		TextOptions: opts,
 	}
 }
 // SelfAttention holds the query, key, value and output projections used by
 // the text model's multi-head self-attention.
 type SelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_output"`
 }
 // Forward runs self-attention over hiddenState: query and key are projected,
 // split into heads and rotated with RoPE at positionIDs; value is projected
 // and split into heads; the attention result is flattened back to hiddenSize
 // and passed through the output projection.
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
 	// rope applies the rotary embedding with the text model's settings
 	rope := func(t ml.Tensor) ml.Tensor {
 		return t.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
 	}
 	query := rope(sa.Query.Forward(ctx, hiddenState).Reshape(ctx, headDim, opts.numHeads, batchSize))
 	key := rope(sa.Key.Forward(ctx, hiddenState).Reshape(ctx, headDim, opts.numKVHeads, batchSize))
 	value := sa.Value.Forward(ctx, hiddenState).Reshape(ctx, headDim, opts.numKVHeads, batchSize)
 	attn := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
 	return sa.Output.Forward(ctx, attn.Reshape(ctx, opts.hiddenSize, batchSize))
 }
 // Shift applies RoPE to key using shift as the positions, with the same
 // rope settings as self-attention. The layer argument is not used. It is
 // passed to kvcache.NewCausalCache as the cache's shift function.
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil
 }
 // MLP holds the up, down and gate projections of the feed-forward network;
 // Forward combines them as down(silu(gate(x)) * up(x)).
 type MLP struct {
 	Up   *nn.Linear `gguf:"ffn_up"`
 	Down *nn.Linear `gguf:"ffn_down"`
 	Gate *nn.Linear `gguf:"ffn_gate"`
 }
 // Forward computes down(silu(gate(x)) * up(x)). opts is not used.
 func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
 	// Gated activation: the SiLU of the gate projection scales the up projection
 	gated := mlp.Gate.Forward(ctx, hiddenState).SILU(ctx)
 	up := mlp.Up.Forward(ctx, hiddenState)
 	// Project the product back with the down projection
 	return mlp.Down.Forward(ctx, gated.Mul(ctx, up))
 }
 // Layer is a single transformer layer: an RMS norm followed by
 // self-attention, then an RMS norm followed by the MLP.
 type Layer struct {
 	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
 	SelfAttention *SelfAttention
 	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
 	MLP           *MLP
 }
 // Forward applies the attention and feed-forward branches, each with a
 // residual connection. When outputs is non-nil, only the rows it selects are
 // kept after attention, so the feed-forward branch runs on those rows only.
 func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	// Attention branch
 	skip := hiddenState
 	attn := l.SelfAttention.Forward(ctx, l.AttentionNorm.Forward(ctx, hiddenState, opts.eps), positionIDs, cache, opts)
 	// The caller passes outputs only for the final layer: prune both the
 	// attention result and the residual to the requested positions.
 	if outputs != nil {
 		attn = attn.Rows(ctx, outputs)
 		skip = skip.Rows(ctx, outputs)
 	}
 	hiddenState = attn.Add(ctx, skip)
 	// Feed-forward branch
 	ffn := l.MLP.Forward(ctx, l.MLPNorm.Forward(ctx, hiddenState, opts.eps), opts)
 	return ffn.Add(ctx, hiddenState)
 }
 // Forward runs the text model over a batch. Token embeddings are looked up
 // first, then the float values of every multimodal entry in the batch are
 // copied over the embeddings starting at that entry's index. The result is
 // passed through all layers (outputs is handed to the last layer only), the
 // final norm and the output projection.
 //
 // It returns an error if a multimodal tensor cannot be created.
 func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) {
 	// Initial token embedding
 	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
 	for _, mi := range batch.Multimodal {
 		f32s := mi.Multimodal.(*chunk).floats()
 		img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
 		if err != nil {
 			// report the failure through the error return instead of panicking
 			return nil, err
 		}
 		// overwrite the embeddings from position mi.Index onwards with the image values
 		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
 	}
 	// Process through transformer layers
 	for i, layer := range m.Layers {
 		cache.SetLayer(i)
 		// only the last layer receives the output indices
 		var lastLayerOutputs ml.Tensor
 		if i == len(m.Layers)-1 {
 			lastLayerOutputs = outputs
 		}
 		hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, m.TextOptions)
 	}
 	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
 	return m.Output.Forward(ctx, hiddenStates), nil
 }
--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@ -1,391 +0,0 @@
 package qwen25vl
 import (
 	"fmt"
 	"math"
 	"slices"
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 )
 // batchSize is the batch dimension used when reshaping vision tensors.
 // We only support batch size of 1
 var batchSize int = 1
 // rotateHalf splits t into two halves along dimension 0 and returns the
 // negated second half concatenated with the first half along dimension 0.
 func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
 	// x1: first half of dimension 0 (view starting at offset 0)
 	x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
 	// x2: second half of dimension 0, made contiguous before negation
 	x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
 	return x2.Neg(ctx).Concat(ctx, x1, 0)
 }
 // applyRotaryPositionalEmbedding returns t*cos + rotateHalf(t)*sin.
 func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
 	return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin))
 }
 // blockDiagonalMask builds a seqLength x seqLength additive attention mask.
 // Entries are 0 where row and column fall inside the same
 // [bounds[i-1], bounds[i]) range and -inf everywhere else. The result is
 // reshaped to (seqLength, seqLength, 1). numHeads is not used. It panics if
 // the tensor cannot be created.
 func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int) ml.Tensor {
 	// Start fully blocked: every entry is negative infinity
 	negInf := float32(math.Inf(-1))
 	values := make([]float32, seqLength*seqLength)
 	for i := range values {
 		values[i] = negInf
 	}
 	// Open up each block: 0 allows attention, -inf blocks it
 	for b := 1; b < len(bounds); b++ {
 		lo, hi := bounds[b-1], bounds[b]
 		for r := lo; r < hi; r++ {
 			for c := lo; c < hi; c++ {
 				values[r*seqLength+c] = 0.0
 			}
 		}
 	}
 	mask, err := ctx.Input().FromFloatSlice(values, seqLength, seqLength)
 	if err != nil {
 		panic(err)
 	}
 	// Trailing dimension of 1 so the mask can broadcast
 	return mask.Reshape(ctx, seqLength, seqLength, 1)
 }
 // VisionSelfAttention holds the query, key, value and output projections of
 // a vision encoder layer's self-attention.
 type VisionSelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_out"`
 }
 // Forward computes self-attention over hiddenStates. Query and key are
 // split into heads and rotated with the supplied cos/sin tensors before the
 // scaled dot product; when mask is non-nil it is added to the scores before
 // the softmax. The attention result is reshaped back to hiddenSize and passed
 // through the output projection.
 func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	query := sa.Query.Forward(ctx, hiddenStates)
 	key := sa.Key.Forward(ctx, hiddenStates)
 	value := sa.Value.Forward(ctx, hiddenStates)
 	// Split each projection into (headDim, numHeads, Dim(1), batchSize)
 	query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1), batchSize)
 	key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1), batchSize)
 	value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1), batchSize)
 	// Only query and key receive the rotary embedding
 	query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
 	key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
 	// Scale factor for scaled dot-product attention
 	scale := 1.0 / math.Sqrt(float64(opts.headDim))
 	// Scaled dot-product attention
 	query = query.Permute(ctx, 0, 2, 1, 3)
 	key = key.Permute(ctx, 0, 2, 1, 3)
 	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
 	kq := key.MulmatFullPrec(ctx, query)
 	kq = kq.Scale(ctx, scale)
 	// A nil mask leaves the scores unmasked
 	if mask != nil {
 		kq = kq.Add(ctx, mask)
 	}
 	kq = kq.Softmax(ctx)
 	kqv := value.Mulmat(ctx, kq)
 	// Merge the heads back into a single hiddenSize dimension
 	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
 	return sa.Output.Forward(ctx, attention)
 }
 // VisionMLP holds the gate, up and down projections of a vision encoder
 // layer's feed-forward network.
 type VisionMLP struct {
 	Gate *nn.Linear `gguf:"ffn_gate"`
 	Up   *nn.Linear `gguf:"ffn_up"`
 	Down *nn.Linear `gguf:"ffn_down"`
 }
 // Forward computes down(silu(gate(x)) * up(x)). The activation is always
 // SiLU; opts is not used.
 func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	gate := mlp.Gate.Forward(ctx, hiddenStates)
 	up := mlp.Up.Forward(ctx, hiddenStates)
 	return mlp.Down.Forward(ctx, gate.SILU(ctx).Mul(ctx, up))
 }
 // VisionEncoderLayer is one vision encoder layer: an RMS norm followed by
 // self-attention, then an RMS norm followed by the MLP.
 type VisionEncoderLayer struct {
 	Norm1         *nn.RMSNorm `gguf:"ln1"`
 	SelfAttention *VisionSelfAttention
 	Norm2         *nn.RMSNorm `gguf:"ln2"`
 	MLP           *VisionMLP
 }
 // Forward applies normalized self-attention and the normalized MLP, adding
 // the branch input back as a residual after each.
 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	// Attention branch
 	attn := e.SelfAttention.Forward(ctx, e.Norm1.Forward(ctx, hiddenStates, opts.eps), cos, sin, mask, opts)
 	hiddenStates = attn.Add(ctx, hiddenStates)
 	// Feed-forward branch
 	ffn := e.MLP.Forward(ctx, e.Norm2.Forward(ctx, hiddenStates, opts.eps), opts)
 	return ffn.Add(ctx, hiddenStates)
 }
 // VisionModelOptions contains the vision model's configuration; the values
 // are filled in by newVisionModel.
 type VisionModelOptions struct {
 	hiddenSize        int
 	numHeads          int
 	headDim           int // set to hiddenSize / numHeads by newVisionModel
 	patchSize         int
 	numChannels       int
 	eps               float32
 	ropeTheta         float32
 	spatialMergeSize  int
 	windowSize        int
 	fullAttnBlocks    []int32 // indices of layers that run without the window mask
 	temporalPatchSize int
 }
 // PatchEmbedding holds the two patch convolutions; Forward applies one to
 // each of the first two temporal slices of the input and sums the results.
 type PatchEmbedding struct {
 	PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
 	PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
 }
 // Forward embeds the patches in pixelValues. The input is split along the
 // temporal dimension, slice 0 goes through PatchConv0 and slice 1 through
 // PatchConv1, and the two outputs are added and reshaped to
 // (hiddenSize, numPatches).
 // NOTE(review): only temporal slices 0 and 1 are read, so this appears to
 // assume temporalPatchSize == 2 - confirm.
 func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	numPatches := pixelValues.Shape()[1]
 	// Reshape the input tensor to match the expected dimensions
 	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
 	// Permute the tensor to bring the temporal dimension to the front
 	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
 	// Split the tensor into parts for the temporal convolutions
 	// in0 is the slice at temporal index 0 (view offset 0)
 	in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
 	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
 	// in1 is the slice at temporal index 1 (view offset of one Stride(0))
 	in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
 	in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
 	s0, s1 := opts.patchSize, opts.patchSize // Use full stride
 	p0, p1 := 0, 0                           // padding
 	d0, d1 := 1, 1                           // dilation
 	out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
 	out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
 	// Add the outputs from the two temporal convolutions
 	out := out0.Add(ctx, out1)
 	// Reshape the output tensor to match the expected dimensions
 	return out.Reshape(ctx, opts.hiddenSize, numPatches)
 }
 // VisionPatchMerger implements patch merging for the Qwen vision model: an
 // RMS norm followed by two linear layers (see Forward).
 type VisionPatchMerger struct {
 	LNQ  *nn.RMSNorm `gguf:"ln_q"`
 	MLP0 *nn.Linear  `gguf:"mlp.0"`
 	MLP2 *nn.Linear  `gguf:"mlp.2"`
 }
 // Forward merges patches: the input is normalized, every
 // spatialMergeSize*spatialMergeSize consecutive columns are folded into one
 // wider column, and the result goes through MLP0, GELU and MLP2.
 func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	mergeUnit := opts.spatialMergeSize * opts.spatialMergeSize
 	normed := pm.LNQ.Forward(ctx, visionOutputs, opts.eps)
 	// Widen dimension 0 by mergeUnit and shrink dimension 1 by the same factor
 	merged := normed.Reshape(ctx, visionOutputs.Dim(0)*mergeUnit, normed.Dim(1)/mergeUnit, batchSize)
 	return pm.MLP2.Forward(ctx, pm.MLP0.Forward(ctx, merged).GELU(ctx))
 }
 // VisionModel implements the Qwen vision model: patch embedding, a stack of
 // encoder layers and the patch merger, configured by VisionModelOptions.
 type VisionModel struct {
 	PatchEmbedding *PatchEmbedding
 	Layers         []VisionEncoderLayer `gguf:"blk"`
 	PatchMerger    *VisionPatchMerger   `gguf:"merger"`
 	*VisionModelOptions
 }
 // Forward computes the vision model for an input tensor. Patch embeddings
 // and positional embeddings are both reordered by the window index, the
 // encoder layers are applied (layers listed in fullAttnBlocks without a mask,
 // all others with the block-diagonal window mask), the patch merger is
 // applied, and the rows are finally reordered by the argsort of the window index.
 func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
 	// Extract patch embeddings
 	hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions)
 	positionEmbedding := m.PositionalEmbedding(ctx, grid)
 	windowIndex, bounds := m.WindowIndex(ctx, grid)
 	spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize
 	// Group spatialMergeUnit columns together, reorder the groups by window index, then ungroup
 	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit)
 	hiddenStates = hiddenStates.Rows(ctx, windowIndex)
 	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit)
 	// Apply the same reordering to the positional embedding
 	positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)*spatialMergeUnit, positionEmbedding.Dim(1)/spatialMergeUnit)
 	positionEmbedding = positionEmbedding.Rows(ctx, windowIndex)
 	positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)/spatialMergeUnit, positionEmbedding.Dim(1)*spatialMergeUnit)
 	// Duplicate the embedding along dimension 0 before taking cos and sin
 	positionEmbedding = positionEmbedding.Concat(ctx, positionEmbedding, 0)
 	cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
 	// Insert a singleton dimension at position 1
 	cos = cos.Reshape(ctx, cos.Dim(0), 1, cos.Dim(1))
 	sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1))
 	mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads)
 	// Apply encoder layers
 	for i, layer := range m.Layers {
 		if slices.Contains(m.fullAttnBlocks, int32(i)) {
 			// full-attention layer: no mask
 			hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions)
 		} else {
 			hiddenStates = layer.Forward(
 				ctx,
 				hiddenStates,
 				cos,
 				sin,
 				mask,
 				m.VisionModelOptions,
 			)
 		}
 	}
 	hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions)
 	// Reorder the merged rows by the argsort of the window index
 	reverseWindowIndex := windowIndex.Argsort(ctx)
 	return hiddenStates.Rows(ctx, reverseWindowIndex)
 }
 // WindowIndex partitions the merged grid (grid size divided by
 // spatialMergeSize) into square windows and returns:
 //  1. a tensor with the flattened grid indices listed window by window
 //  2. the cumulative window end positions, multiplied by spatialMergeSize
 //     squared, preceded by a leading 0
 //
 // Windows at the right and bottom edges are clipped to the grid rather than
 // padded. It panics if the index tensor cannot be created.
 func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int) {
 	winSize := m.windowSize / m.spatialMergeSize / m.patchSize
 	gridH := grid.Height / m.spatialMergeSize
 	gridW := grid.Width / m.spatialMergeSize
 	mergeUnit := m.spatialMergeSize * m.spatialMergeSize
 	// Number of windows needed to cover each axis
 	windowRows := int(math.Ceil(float64(gridH) / float64(winSize)))
 	windowCols := int(math.Ceil(float64(gridW) / float64(winSize)))
 	var index []int32
 	bounds := []int{0}
 	covered := 0
 	for wh := 0; wh < windowRows; wh++ {
 		top := wh * winSize
 		bottom := min(top+winSize, gridH)
 		for ww := 0; ww < windowCols; ww++ {
 			left := ww * winSize
 			right := min(left+winSize, gridW)
 			// Row-major indices of the cells inside this window
 			for h := top; h < bottom; h++ {
 				for w := left; w < right; w++ {
 					index = append(index, int32(h*gridW+w))
 				}
 			}
 			// Record where this window ends in the flattened order
 			covered += (bottom - top) * (right - left)
 			bounds = append(bounds, covered*mergeUnit)
 		}
 	}
 	t, err := ctx.Input().FromIntSlice(index, len(index))
 	if err != nil {
 		panic(err)
 	}
 	return t, bounds
 }
 // PositionalEmbedding generates the rotary position angles for the grid. It
 // builds a table of position/theta^(2j/dim) values for every position up to
 // the larger grid side, builds (y, x) coordinate pairs for every grid cell,
 // reorders the coordinates in spatialMergeSize-sized groups, and looks the
 // coordinates up in the table. It panics if a tensor cannot be created.
 func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor {
 	dim := m.headDim / 2
 	freq := dim / 2
 	theta := float64(m.ropeTheta)
 	merge := m.spatialMergeSize
 	// Create frequency patterns for position encoding
 	maxGridSize := max(grid.Height, grid.Width)
 	freqVals := make([]float32, freq*maxGridSize)
 	for i := range maxGridSize {
 		for j := range freq {
 			// entry (i, j) is position i divided by theta^(2j/dim)
 			freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
 		}
 	}
 	freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
 	if err != nil {
 		panic(fmt.Errorf("failed to create tensor from frequencies: %w", err))
 	}
 	// Create position coordinates (y,x pairs) for the grid
 	// In PyTorch: Equivalent to generating position ids with torch.arange()
 	coords := make([]int32, 0, grid.Height*grid.Width*2)
 	for y := range grid.Height {
 		for x := range grid.Width {
 			coords = append(coords, int32(y), int32(x))
 		}
 	}
 	pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
 	if err != nil {
 		panic(fmt.Errorf("failed to create tensor from positions: %w", err))
 	}
 	// Reshape and permute positions to match spatial merging pattern
 	pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
 	pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	pos = pos.Reshape(ctx, 2, merge, merge, grid.Width/merge*grid.Height/merge)
 	pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	// Flatten to a single list of coordinates
 	pos = pos.Reshape(ctx, 2*merge*merge*grid.Width/merge*grid.Height/merge)
 	// Use position indices to look up corresponding frequency values
 	positionalEmbedding := freqs.Rows(ctx, pos)
 	// Pair up consecutive rows: the y and x lookups of a cell end up in one column
 	positionalEmbedding = positionalEmbedding.Reshape(ctx, positionalEmbedding.Dim(0)*2, positionalEmbedding.Dim(1)/2)
 	return positionalEmbedding
 }
 // newVisionModel creates a new instance of the Qwen vision model, reading
 // every option from the config and falling back to the defaults given here
 // when a key is absent.
 func newVisionModel(c fs.Config) *VisionModel {
 	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
 	numHeads := int(c.Uint("vision.attention.head_count", 16))
 	opts := &VisionModelOptions{
 		hiddenSize:       hiddenSize,
 		numHeads:         numHeads,
 		headDim:          hiddenSize / numHeads,
 		patchSize:        int(c.Uint("vision.patch_size", 14)),
 		numChannels:      int(c.Uint("vision.num_channels", 3)),
 		eps:              c.Float("vision.attention.layer_norm_epsilon", 1e-6),
 		ropeTheta:        c.Float("vision.rope.freq_base", 10000.0),
 		spatialMergeSize: int(c.Uint("vision.spatial_merge_size", 2)),
 		windowSize:       int(c.Uint("vision.window_size", 112)),
 		// NOTE(review): this is the only key spelled with the "qwen25vl."
 		// prefix; confirm the config lookup resolves it like the others.
 		fullAttnBlocks:    c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31}),
 		temporalPatchSize: int(c.Uint("vision.temporal_patch_size", 2)),
 	}
 	return &VisionModel{
 		Layers:             make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
 		VisionModelOptions: opts,
 	}
 }
--- a/model/models/qwen25vl/process_image.go
+++ b/model/models/qwen25vl/process_image.go
@ -1,184 +0,0 @@
 package qwen25vl
 import (
 	"fmt"
 	"image"
 	"math"
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/model/imageproc"
 )
 // ImageProcessor contains configuration for the Qwen 2.5 VL image processing
 // pipeline: resizing (SmartResize), normalization and patch extraction
 // (ProcessImage / createPatches).
 type ImageProcessor struct {
 	// numChannels is the number of channel planes createPatches reads from
 	// the channel-first pixel buffer.
 	numChannels       int
 	// patchSize is the side length, in pixels, of one square patch.
 	patchSize         int
 	// temporalPatchSize is how many frame slots each patch holds per
 	// channel; createPatches fills the first and duplicates it into the rest.
 	temporalPatchSize int
 	// mergeSize is the side length, in patches, of the block of patches
 	// that createPatches emits consecutively.
 	mergeSize         int
 	// minPixels and maxPixels bound the resized image area in SmartResize.
 	minPixels         int
 	maxPixels         int
 	// factor is patchSize * mergeSize; SmartResize returns dimensions that
 	// are multiples of it.
 	factor            int
 	// rescaleFactor is initialised to 1/255 by newImageProcessor.
 	// NOTE(review): no code visible in this file reads it — confirm whether
 	// it is still needed.
 	rescaleFactor     float32
 	// imageMean and imageStd are the per-channel normalization constants
 	// passed to imageproc.Normalize; only the first three entries are used.
 	imageMean         []float32
 	imageStd          []float32
 }
 // newImageProcessor creates an image processor from the model configuration
 // c, falling back to the defaults shown below for any key that is not set.
 // The resize factor is derived as patchSize * mergeSize so that resized
 // images always divide evenly into merged patch blocks.
 func newImageProcessor(c fs.Config) ImageProcessor {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
 	return ImageProcessor{
 		numChannels: int(c.Uint("vision.num_channels", 3)), // not set
 		patchSize:   patchSize,
 		// Read the same key, with the same default, that newVisionModel
 		// uses, so the patches produced here always have the temporal depth
 		// the vision model is configured for.
 		temporalPatchSize: int(c.Uint("vision.temporal_patch_size", 2)),
 		mergeSize:         mergeSize,
 		minPixels:         56 * 56,
 		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
 		factor:            patchSize * mergeSize,
 		rescaleFactor:     1.0 / 255.0,
 		imageMean:         imageproc.ClipDefaultMean[:],
 		imageStd:          imageproc.ClipDefaultSTD[:],
 	}
 }
 // SmartResize implements the smart resize algorithm: it returns a
 // (height, width) pair in which both values are multiples of p.factor and
 // whose area is steered into the [p.minPixels, p.maxPixels] range while
 // roughly preserving the input aspect ratio.
 //
 // Each side is first rounded (half to even) to the nearest multiple of the
 // factor. If the resulting area is above maxPixels both sides are scaled
 // down and floored to a multiple of the factor; if it is below minPixels
 // both sides are scaled up and ceiled to a multiple of the factor.
 //
 // It panics if either side is smaller than the factor or if the ratio of the
 // longer side to the shorter side exceeds 200.
 func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
 	factor := p.factor
 	if height < factor || width < factor {
 		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
 	}
 	// Compare the ratio in floating point: integer division truncates, so a
 	// ratio such as 401:2 (200.5) would otherwise slip past the limit.
 	if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
 		panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
 	}
 	round := func(x float64) int { return int(math.RoundToEven(x)) }
 	hBar := round(float64(height)/float64(factor)) * factor
 	wBar := round(float64(width)/float64(factor)) * factor
 	if hBar*wBar > p.maxPixels {
 		// Too large: shrink both sides by the same ratio, rounding down so
 		// the area stays within the limit.
 		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
 		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
 		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
 	} else if hBar*wBar < p.minPixels {
 		// Too small: grow both sides by the same ratio, rounding up so the
 		// area reaches the minimum.
 		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
 		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
 		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
 	}
 	return hBar, wBar
 }
 // Grid describes how a processed image is divided into patches.
 type Grid struct {
 	// Height is the number of patch rows (resized height / patch size).
 	Height   int
 	// Width is the number of patch columns (resized width / patch size).
 	Width    int
 	// Temporal is the number of temporal steps; ProcessImage sets it to 1
 	// for single images.
 	Temporal int
 }
 // ProcessImage resizes img to the dimensions chosen by SmartResize,
 // normalizes it with the processor's mean and standard deviation, and splits
 // it into flattened patches.
 //
 // It returns the patch data, the patch grid describing how many patches the
 // resized image contains in each dimension, and an error if patch creation
 // fails. The underlying error is wrapped so callers can inspect it with
 // errors.Is / errors.As.
 //
 // NOTE: SmartResize panics instead of returning an error when the image is
 // smaller than the resize factor or its aspect ratio is too extreme, so
 // such inputs are not reported through the error return.
 func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
 	origWidth := img.Bounds().Dx()
 	origHeight := img.Bounds().Dy()
 	// Calculate smart resize dimensions
 	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
 	// Resize image using existing functions
 	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
 	normalizedPixels := imageproc.Normalize(
 		resizedImg,
 		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
 		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
 		true, // rescale
 		true, // channelFirst
 	)
 	// Calculate grid dimensions
 	grid := &Grid{
 		Height:   resizedHeight / p.patchSize,
 		Width:    resizedWidth / p.patchSize,
 		Temporal: 1, // For single images, temporal dimension is 1
 	}
 	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
 	if err != nil {
 		// %w keeps the cause in the error chain instead of flattening it.
 		return nil, nil, fmt.Errorf("failed to create patches: %w", err)
 	}
 	// Return patches and grid dimensions
 	return patches, grid, nil
 }
 // createPatches rearranges a channel-first (CHW) pixel buffer of the given
 // height and width into a flat sequence of patches.
 //
 // Patches are emitted block by block: the grid is walked in steps of
 // mergeSize, and the mergeSize x mergeSize patches of each block are written
 // consecutively (row-major within the block). Within one patch the layout is
 // channel, then temporal frame, then patch row, then patch column; the first
 // temporal frame holds the image data and every further frame is a copy of it.
 //
 // Source or destination indexes that fall outside their slice are skipped,
 // leaving zeros in the output. The returned error is always nil.
 func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
 	side := p.patchSize
 	// Strides of the output layout, innermost first.
 	frameLen := side * side
 	channelLen := p.temporalPatchSize * frameLen
 	patchLen := p.numChannels * channelLen
 	out := make([]float32, grid.Temporal*grid.Height*grid.Width*patchLen)
 	// patchStart is the offset in out of the patch currently being written.
 	patchStart := 0
 	for frame := 0; frame < grid.Temporal; frame++ {
 		for row := 0; row < grid.Height; row += p.mergeSize {
 			for col := 0; col < grid.Width; col += p.mergeSize {
 				// Visit every patch of the current merge block.
 				for dr := 0; dr < p.mergeSize; dr++ {
 					for dc := 0; dc < p.mergeSize; dc++ {
 						// Pixel coordinates of the patch's top-left corner.
 						top := (row + dr) * side
 						left := (col + dc) * side
 						for ch := 0; ch < p.numChannels; ch++ {
 							dst := patchStart + ch*channelLen
 							plane := ch * height * width
 							// Fill the first temporal frame from the image.
 							for py := 0; py < side; py++ {
 								for px := 0; px < side; px++ {
 									from := plane + (top+py)*width + left + px
 									to := dst + py*side + px
 									if from >= len(pixels) || to >= len(out) {
 										continue
 									}
 									out[to] = pixels[from]
 								}
 							}
 							// Duplicate that frame into the remaining temporal slots.
 							for tp := 1; tp < p.temporalPatchSize; tp++ {
 								copy(out[dst+tp*frameLen:dst+(tp+1)*frameLen], out[dst:dst+frameLen])
 							}
 						}
 						patchStart += patchLen
 					}
 				}
 			}
 		}
 	}
 	return out, nil
 }
--- a/server/model.go
+++ b/server/model.go
@ -10,9 +10,6 @@ import (
 	"log/slog"
 	"net/http"
 	"os"
 	"slices"
 	"strings"
 	"text/template/parse"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/fs/ggml"
@ -129,123 +126,19 @@ func detectContentType(r io.Reader) (string, error) {
 	return "unknown", nil
 }
-func parseObjects(s string) []map[string]any {
+// func ToolTemplate(m *Model) (*gotmpl.Template, bool) {
-	var objs []map[string]any
+// 	// create a subtree from the node that ranges over .ToolCalls
-	for offset := 0; offset < len(s); {
+// 	tmpl := m.Template.Subtree(func(n parse.Node) bool {
-		var obj map[string]any
+// 		if t, ok := n.(*parse.RangeNode); ok {
-		decoder := json.NewDecoder(strings.NewReader(s[offset:]))
+// 			return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls")
-		if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
+// 		}
 			break
 		} else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
 			// skip over any syntax errors
 			offset += int(syntax.Offset)
 		} else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) {
 			// skip over any unmarshalable types
 			offset += int(unmarshalType.Offset)
 		} else if err != nil {
 			return nil
 		} else {
 			offset += int(decoder.InputOffset())
 			objs = append(objs, obj)
 		}
 	}
-	return objs
+// 		return false
-}
+// 	})
-// parseToolCalls attempts to parse a JSON string into a slice of ToolCalls.
+// 	if tmpl == nil {
-// mxyng: this only really works if the input contains tool calls in some JSON format
+// 		return nil, false
-func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
+// 	}
 	// create a subtree from the node that ranges over .ToolCalls
 	tmpl := m.Template.Subtree(func(n parse.Node) bool {
 		if t, ok := n.(*parse.RangeNode); ok {
 			return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls")
 		}
-		return false
+// 	return tmpl, true
-	})
+// }
 	if tmpl == nil {
 		return nil, false
 	}
 	var b bytes.Buffer
 	if err := tmpl.Execute(&b, map[string][]api.ToolCall{
 		"ToolCalls": {
 			{
 				Function: api.ToolCallFunction{
 					Name: "@@name@@",
 					Arguments: api.ToolCallFunctionArguments{
 						"@@argument@@": 1,
 					},
 				},
 			},
 		},
 	}); err != nil {
 		return nil, false
 	}
 	templateObjects := parseObjects(b.String())
 	if len(templateObjects) == 0 {
 		return nil, false
 	}
 	// find the keys that correspond to the name and arguments fields
 	var name, arguments string
 	for k, v := range templateObjects[0] {
 		switch v.(type) {
 		case string:
 			name = k
 		case map[string]any:
 			arguments = k
 		}
 	}
 	if name == "" || arguments == "" {
 		return nil, false
 	}
 	responseObjects := parseObjects(s)
 	if len(responseObjects) == 0 {
 		return nil, false
 	}
 	// collect all nested objects
 	var collect func(any) []map[string]any
 	collect = func(obj any) (all []map[string]any) {
 		switch o := obj.(type) {
 		case map[string]any:
 			all = append(all, o)
 			for _, v := range o {
 				all = append(all, collect(v)...)
 			}
 		case []any:
 			for _, v := range o {
 				all = append(all, collect(v)...)
 			}
 		}
 		return all
 	}
 	var objs []map[string]any
 	for _, p := range responseObjects {
 		objs = append(objs, collect(p)...)
 	}
 	var toolCalls []api.ToolCall
 	for _, kv := range objs {
 		n, nok := kv[name].(string)
 		a, aok := kv[arguments].(map[string]any)
 		if nok && aok {
 			toolCalls = append(toolCalls, api.ToolCall{
 				Function: api.ToolCallFunction{
 					Name:      n,
 					Arguments: a,
 				},
 			})
 		}
 	}
 	return toolCalls, len(toolCalls) > 0
 }
--- a/server/model_test.go
+++ b/server/model_test.go
@ -1,179 +1,185 @@
 package server
-import (
+// import (
-	"bytes"
+// 	"testing"
-	"encoding/json"
+// 	gotmpl "text/template"
-	"fmt"
+// )
 	"os"
 	"path/filepath"
 	"testing"
-	"github.com/google/go-cmp/cmp"
+// func TestToolToken(t *testing.T) {
 // 	cases := []struct {
 // 		name     string
 // 		template string
 // 		want     string
 // 		ok       bool
 // 	}{
 // 		{
 // 			name:     "basic tool call with action prefix",
 // 			template: "{{if .ToolCalls}}Action: ```json{{end}}",
 // 			want:     "Action:",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "incomplete functools bracket",
 // 			template: "{{if .ToolCalls}}functools[{{end}}",
 // 			want:     "functools",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "tool call with angle brackets",
 // 			template: "{{if .ToolCalls}}Hello, world! <tool_call>{{end}}",
 // 			want:     "<tool_call>",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "multiple tool call formats",
 // 			template: "{{if .ToolCalls}}[tool_call] <tool_call>{{end}}",
 // 			want:     "[tool_call]",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "single angle bracket tool call",
 // 			template: "{{if .ToolCalls}}<tool_call>{{end}}",
 // 			want:     "<tool_call>",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "incomplete angle bracket after tool call",
 // 			template: "{{if .ToolCalls}}[tool_call] <{{end}}",
 // 			want:     "[tool_call]",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "angle bracket prefix with tool call",
 // 			template: "{{if .ToolCalls}}> <tool_call>{{end}}",
 // 			want:     "<tool_call>",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "uppercase tool call with incomplete bracket",
 // 			template: "{{if .ToolCalls}}[TOOL_CALL] [{{end}}",
 // 			want:     "[TOOL_CALL]",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "uppercase tool call with adjacent bracket",
 // 			template: "{{if .ToolCalls}}[TOOL_CALL][{{end}}",
 // 			want:     "[TOOL_CALL]",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "tool call with pipe delimiters",
 // 			template: "{{if .ToolCalls}}<|tool_call|>{{end}}",
 // 			want:     "<|tool_call|>",
 // 			ok:       true,
 // 		},
 // 	}
-	"github.com/ollama/ollama/api"
+// 	for _, tt := range cases {
-	"github.com/ollama/ollama/template"
+// 		t.Run(tt.name, func(t *testing.T) {
-)
+// 			tmpl, err := gotmpl.New("test").Parse(tt.template)
 // 			if err != nil {
 // 				t.Fatalf("failed to parse template: %v", err)
 // 			}
 // 			got, ok := ToolPrefix(tmpl)
 // 			if got != tt.want {
 // 				t.Errorf("ToolToken(%q) = %q; want %q", tt.template, got, tt.want)
 // 			}
 // 			if ok != tt.ok {
 // 				t.Errorf("ToolToken(%q) = %v; want %v", tt.template, ok, tt.ok)
 // 			}
 // 		})
 // 	}
 // }
-func readFile(t *testing.T, base, name string) *bytes.Buffer {
+// func TestTextAfterToolCalls(t *testing.T) {
-	t.Helper()
+// 	cases := []struct {
 // 		name     string
 // 		template string
 // 		want     string
 // 		ok       bool
 // 	}{
 // 		{
 // 			name:     "basic tool call with text after",
 // 			template: `{{if .ToolCalls}}tool response{{end}}`,
 // 			want:     "tool response",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "tool call with mixed content after",
 // 			template: `{{if .ToolCalls}}<tool_call>{{.Something}}{{end}}`,
 // 			want:     "<tool_call>",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "tool call with no text after",
 // 			template: `{{if .ToolCalls}}{{.Something}}{{end}}`,
 // 			want:     "",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "nested tool call",
 // 			template: `{{if .Something}}{{if .ToolCalls}}[TOOL_CALL]{{end}}{{end}}`,
 // 			want:     "[TOOL_CALL]",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "no tool calls",
 // 			template: `{{if .Something}}no tools here{{end}}`,
 // 			want:     "",
 // 			ok:       false,
 // 		},
 // 		{
 // 			name:     "empty template",
 // 			template: ``,
 // 			want:     "",
 // 			ok:       false,
 // 		},
 // 		{
 // 			name:     "multiple tool calls sections",
 // 			template: `{{if .ToolCalls}}first{{end}}{{if .ToolCalls}}second{{end}}`,
 // 			want:     "first",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "range over tool calls",
 // 			template: `{{if .ToolCalls}}{{range .ToolCalls}}tool{{end}}{{end}}`,
 // 			want:     "",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "tool calls with pipe delimiters",
 // 			template: `{{if .ToolCalls}}<|tool|>{{end}}`,
 // 			want:     "<|tool|>",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "tool calls with nested template",
 // 			template: `{{if .ToolCalls}}{{template "tool" .}}{{end}}`,
 // 			want:     "",
 // 			ok:       true,
 // 		},
 // 		{
 // 			name:     "tool calls with whitespace variations",
 // 			template: `{{if .ToolCalls}}  tool  {{end}}`,
 // 			want:     "  tool  ",
 // 			ok:       true,
 // 		},
 // 	}
-	bts, err := os.ReadFile(filepath.Join(base, name))
+// 	for _, tt := range cases {
-	if err != nil {
+// 		t.Run(tt.name, func(t *testing.T) {
-		t.Fatal(err)
+// 			tmpl, err := gotmpl.New("test").Parse(tt.template)
-	}
+// 			if err != nil {
 // 				t.Fatalf("failed to parse template: %v", err)
 // 			}
-	return bytes.NewBuffer(bts)
+// 			got, ok := extractToolCallsTemplate(tmpl)
-}
+// 			if got != tt.want {
-
+// 				t.Errorf("TextAfterToolCalls() got = %q, want %q", got, tt.want)
-func TestExecuteWithTools(t *testing.T) {
+// 			}
-	p := filepath.Join("testdata", "tools")
+// 			if ok != tt.ok {
-	cases := []struct {
+// 				t.Errorf("TextAfterToolCalls() ok = %v, want %v", ok, tt.ok)
-		model  string
+// 			}
-		output string
+// 		})
-		ok     bool
+// 	}
-	}{
+// }
 		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
 		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]
 The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true},
 		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"To }]`, false},
 		{"mistral", `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function:
 		[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
 		{"mistral", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
 		{"command-r-plus", "Action: ```json" + `
 [
    {
        "tool_name": "get_current_weather",
        "parameters": {
            "format": "fahrenheit",
            "location": "San Francisco, CA"
        }
    },
    {
        "tool_name": "get_current_weather",
        "parameters": {
            "format": "celsius",
            "location": "Toronto, Canada"
        }
    }
 ]
 ` + "```", true},
 		{"command-r-plus", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
 		{"firefunction", ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
 		{"firefunction", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
 		{"llama3-groq-tool-use", `<tool_call>
 {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
 {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
 </tool_call>`, true},
 		{"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true},
 		{"nemotron", `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]} </toolcall>`, true},
 	}
 	var tools []api.Tool
 	if err := json.Unmarshal(readFile(t, p, "tools.json").Bytes(), &tools); err != nil {
 		t.Fatal(err)
 	}
 	var messages []api.Message
 	if err := json.Unmarshal(readFile(t, p, "messages.json").Bytes(), &messages); err != nil {
 		t.Fatal(err)
 	}
 	calls := []api.ToolCall{
 		{
 			Function: api.ToolCallFunction{
 				Name: "get_current_weather",
 				Arguments: api.ToolCallFunctionArguments{
 					"format":   "fahrenheit",
 					"location": "San Francisco, CA",
 				},
 			},
 		},
 		{
 			Function: api.ToolCallFunction{
 				Name: "get_current_weather",
 				Arguments: api.ToolCallFunctionArguments{
 					"format":   "celsius",
 					"location": "Toronto, Canada",
 				},
 			},
 		},
 	}
 	for _, tt := range cases {
 		t.Run(tt.model, func(t *testing.T) {
 			tmpl, err := template.Parse(readFile(t, p, fmt.Sprintf("%s.gotmpl", tt.model)).String())
 			if err != nil {
 				t.Fatal(err)
 			}
 			t.Run("template", func(t *testing.T) {
 				var actual bytes.Buffer
 				if err := tmpl.Execute(&actual, template.Values{Tools: tools, Messages: messages}); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(actual.String(), readFile(t, p, fmt.Sprintf("%s.out", tt.model)).String()); diff != "" {
 					t.Errorf("mismatch (-got +want):\n%s", diff)
 				}
 			})
 			t.Run("parse", func(t *testing.T) {
 				m := &Model{Template: tmpl}
 				actual, ok := m.parseToolCalls(tt.output)
 				if ok != tt.ok {
 					t.Fatalf("expected %t, got %t", tt.ok, ok)
 				}
 				if tt.ok {
 					if diff := cmp.Diff(actual, calls); diff != "" {
 						t.Errorf("mismatch (-got +want):\n%s", diff)
 					}
 				}
 			})
 		})
 	}
 }
 func TestParseObjects(t *testing.T) {
 	tests := []struct {
 		input string
 		want  []map[string]any
 	}{
 		{
 			input: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			want: []map[string]any{
 				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
 				{"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, Canada"}},
 			},
 		},
 		{
 			input: `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </toolcall>`,
 			want: []map[string]any{
 				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
 			},
 		},
 		{
 			input: `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </toolcall> <toolcall>{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, ON"}} </toolcall>`,
 			want: []map[string]any{
 				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
 				{"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, ON"}},
 			},
 		},
 		{
 			input: `{"name": "get_current_weather", "arguments": `,
 			want:  nil,
 		},
 	}
 	for _, tc := range tests {
 		t.Run(tc.input, func(t *testing.T) {
 			got := parseObjects(tc.input)
 			if diff := cmp.Diff(got, tc.want); diff != "" {
 				t.Errorf("mismatch (-got +want):\n%s", diff)
 			}
 		})
 	}
 }
--- a/server/routes.go
+++ b/server/routes.go
@ -38,6 +38,7 @@ import (
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/server/internal/registry"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/tools"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
@ -1482,11 +1483,22 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}
 	slog.Debug("chat request", "images", len(images), "prompt", prompt)
 	var toolParser *tools.Parser
 	if len(req.Tools) > 0 {
 		toolParser, err = tools.NewParser(m.Template.Template)
 		if err != nil {
 			slog.Error("failed to create tool parser", "error", err)
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			return
 		}
 	}
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
-		var sb strings.Builder
+
 		var toolCallIndex int = 0
 		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
 			Prompt:  prompt,
 			Images:  images,
@ -1512,37 +1524,21 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
 			}
-			// TODO: tool call checking and filtering should be moved outside of this callback once streaming
+			if len(req.Tools) > 0 && !toolParser.Done {
-			// however this was a simple change for now without reworking streaming logic of this (and other)
+				toolCalls, content, err := toolParser.Add(r.Content)
-			// handlers
+				if err == nil {
-			if req.Stream != nil && !*req.Stream || len(req.Tools) == 0 {
+					if len(content) > 0 {
-				ch <- res
+						res.Message.Content = content
-				return
+						slog.Debug("tools: setting content to", "content", content)
-			}
+					} else if len(toolCalls) > 0 {
-
+						res.Message.ToolCalls = toolCalls
-			// Streaming tool calls:
+						res.Message.Content = ""
-			// If tools are recognized, use a flag to track the sending of a tool downstream
+					} else {
-			// This ensures that content is cleared from the message on the last chunk sent
+						return
-			sb.WriteString(r.Content)
+					}
 			if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
 				res.Message.ToolCalls = toolCalls
 				for i := range toolCalls {
 					toolCalls[i].Function.Index = toolCallIndex
 					toolCallIndex++
 				}
 				res.Message.Content = ""
 				sb.Reset()
 				ch <- res
 				return
 			}
 			if r.Done {
 				// Send any remaining content if no tool calls were detected
 				if toolCallIndex == 0 {
 					res.Message.Content = sb.String()
 				}
 				ch <- res
 			}
 			ch <- res
 		}); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
@ -1551,11 +1547,15 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if req.Stream != nil && !*req.Stream {
 		var resp api.ChatResponse
 		var sb strings.Builder
 		var toolCalls []api.ToolCall
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.ChatResponse:
 				sb.WriteString(t.Message.Content)
 				resp = t
 				if len(req.Tools) > 0 {
 					toolCalls = append(toolCalls, t.Message.ToolCalls...)
 				}
 			case gin.H:
 				msg, ok := t["error"].(string)
 				if !ok {
@ -1571,12 +1571,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 		resp.Message.Content = sb.String()
-
+		if len(toolCalls) > 0 {
-		if len(req.Tools) > 0 {
+			resp.Message.ToolCalls = toolCalls
 			if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
 				resp.Message.ToolCalls = toolCalls
 				resp.Message.Content = ""
 			}
 		}
 		c.JSON(http.StatusOK, resp)
--- a/server/testdata/tools/command-r-plus.gotmpl
+++ b/server/testdata/tools/command-r-plus.gotmpl
--- a/server/testdata/tools/command-r-plus.out
+++ b/server/testdata/tools/command-r-plus.out
--- a/server/testdata/tools/firefunction.gotmpl
+++ b/server/testdata/tools/firefunction.gotmpl
--- a/server/testdata/tools/firefunction.out
+++ b/server/testdata/tools/firefunction.out
--- a/server/testdata/tools/llama3-groq-tool-use.gotmpl
+++ b/server/testdata/tools/llama3-groq-tool-use.gotmpl
--- a/server/testdata/tools/llama3-groq-tool-use.out
+++ b/server/testdata/tools/llama3-groq-tool-use.out
--- a/tools/testdata/llama3.2.gotmpl
+++ b/tools/testdata/llama3.2.gotmpl
@ -0,0 +1,44 @@
 <|start_header_id|>system<|end_header_id|>
 Cutting Knowledge Date: December 2023
 {{ if .System }}{{ .System }}
 {{- end }}
 {{- if .Tools }}When you receive a tool call response, use the output to format an answer to the orginal user question.
 You are a helpful assistant with tool calling capabilities.
 {{- end }}<|eot_id|>
 {{- range $i, $_ := .Messages }}
 {{- $last := eq (len (slice $.Messages $i)) 1 }}
 {{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
 {{- if and $.Tools $last }}
 Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
 Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.
 {{ range $.Tools }}
 {{- . }}
 {{ end }}
 {{ .Content }}<|eot_id|>
 {{- else }}
 {{ .Content }}<|eot_id|>
 {{- end }}{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
 {{ end }}
 {{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|>
 {{- if .ToolCalls }}
 {{ range .ToolCalls }}
 {"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }}
 {{- else }}
 {{ .Content }}
 {{- end }}{{ if not $last }}<|eot_id|>{{ end }}
 {{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|>
 {{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
 {{ end }}
 {{- end }}
 {{- end }}
--- a/tools/testdata/llama3.2.out
+++ b/tools/testdata/llama3.2.out
@ -0,0 +1,24 @@
 <|start_header_id|>system<|end_header_id|>
 Cutting Knowledge Date: December 2023
 You are a knowledgeable assistant. You can answer questions and perform tasks.When you receive a tool call response, use the output to format an answer to the orginal user question.
 You are a helpful assistant with tool calling capabilities.<|eot_id|><|start_header_id|>user<|end_header_id|>
 What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Paris, France"}}<|eot_id|><|start_header_id|>ipython<|end_header_id|>
 22<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|>
 Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
 Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.
 {"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}
 What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
--- a/server/testdata/tools/messages.json
+++ b/server/testdata/tools/messages.json
--- a/server/testdata/tools/mistral.gotmpl
+++ b/server/testdata/tools/mistral.gotmpl
--- a/server/testdata/tools/mistral.out
+++ b/server/testdata/tools/mistral.out
--- a/server/testdata/tools/nemotron.gotmpl
+++ b/server/testdata/tools/nemotron.gotmpl
--- a/server/testdata/tools/nemotron.out
+++ b/server/testdata/tools/nemotron.out
--- a/tools/testdata/qwen2.5-coder.gotmpl
+++ b/tools/testdata/qwen2.5-coder.gotmpl
@ -0,0 +1,51 @@
 {{- if .Suffix }}<|fim_prefix|>{{ .Prompt }}<|fim_suffix|>{{ .Suffix }}<|fim_middle|>
 {{- else if .Messages }}
 {{- if or .System .Tools }}<|im_start|>system
 {{- if .System }}
 {{ .System }}
 {{- end }}
 {{- if .Tools }}
 # Tools
 You may call one or more functions to assist with the user query.
 You are provided with function signatures within <tools></tools> XML tags:
 <tools>
 {{- range .Tools }}
 {"type": "function", "function": {{ .Function }}}
 {{- end }}
 </tools>
 For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
 <tool_call>
 {"name": <function-name>, "arguments": <args-json-object>}
 </tool_call>
 {{- end }}<|im_end|>
 {{ end }}
 {{- range $i, $_ := .Messages }}
 {{- $last := eq (len (slice $.Messages $i)) 1 -}}
 {{- if eq .Role "user" }}<|im_start|>user
 {{ .Content }}<|im_end|>
 {{ else if eq .Role "assistant" }}<|im_start|>assistant
 {{ if .Content }}{{ .Content }}
 {{- else if .ToolCalls }}<tool_call>
 {{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
 {{ end }}</tool_call>
 {{- end }}{{ if not $last }}<|im_end|>
 {{ end }}
 {{- else if eq .Role "tool" }}<|im_start|>user
 <tool_response>
 {{ .Content }}
 </tool_response><|im_end|>
 {{ end }}
 {{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
 {{ end }}
 {{- end }}
 {{- else }}
 {{- if .System }}<|im_start|>system
 {{ .System }}<|im_end|>
 {{ end }}{{ if .Prompt }}<|im_start|>user
 {{ .Prompt }}<|im_end|>
 {{ end }}<|im_start|>assistant
 {{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}
--- a/tools/testdata/qwen2.5-coder.out
+++ b/tools/testdata/qwen2.5-coder.out
@ -0,0 +1,31 @@
 <|im_start|>system
 You are a knowledgeable assistant. You can answer questions and perform tasks.
 # Tools
 You may call one or more functions to assist with the user query.
 You are provided with function signatures within <tools></tools> XML tags:
 <tools>
 {"type": "function", "function": {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}
 </tools>
 For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
 <tool_call>
 {"name": <function-name>, "arguments": <args-json-object>}
 </tool_call><|im_end|>
 <|im_start|>user
 What's the weather like today in Paris?<|im_end|>
 <|im_start|>assistant
 <tool_call>
 {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}
 </tool_call><|im_end|>
 <|im_start|>user
 <tool_response>
 22
 </tool_response><|im_end|>
 <|im_start|>assistant
 The current temperature in Paris, France is 22 degrees Celsius.<|im_end|>
 <|im_start|>user
 What's the weather like today in San Francisco and Toronto?<|im_end|>
 <|im_start|>assistant
--- a/tools/testdata/qwen3.gotmpl
+++ b/tools/testdata/qwen3.gotmpl
@ -0,0 +1,50 @@
 {{- if .Messages }}
 {{- if or .System .Tools }}<|im_start|>system
 {{- if .System }}
 {{ .System }}
 {{- end }}
 {{- if .Tools }}
 # Tools
 You may call one or more functions to assist with the user query.
 You are provided with function signatures within <tools></tools> XML tags:
 <tools>
 {{- range .Tools }}
 {"type": "function", "function": {{ .Function }}}
 {{- end }}
 </tools>
 For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
 <tool_call>
 {"name": <function-name>, "arguments": <args-json-object>}
 </tool_call>
 {{- end }}<|im_end|>
 {{ end }}
 {{- range $i, $_ := .Messages }}
 {{- $last := eq (len (slice $.Messages $i)) 1 -}}
 {{- if eq .Role "user" }}<|im_start|>user
 {{ .Content }}<|im_end|>
 {{ else if eq .Role "assistant" }}<|im_start|>assistant
 {{ if .Content }}{{ .Content }}
 {{- else if .ToolCalls }}<tool_call>
 {{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
 {{ end }}</tool_call>
 {{- end }}{{ if not $last }}<|im_end|>
 {{ end }}
 {{- else if eq .Role "tool" }}<|im_start|>user
 <tool_response>
 {{ .Content }}
 </tool_response><|im_end|>
 {{ end }}
 {{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
 {{ end }}
 {{- end }}
 {{- else }}
 {{- if .System }}<|im_start|>system
 {{ .System }}<|im_end|>
 {{ end }}{{ if .Prompt }}<|im_start|>user
 {{ .Prompt }}<|im_end|>
 {{ end }}<|im_start|>assistant
 {{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}
--- a/tools/testdata/qwen3.out
+++ b/tools/testdata/qwen3.out
@ -0,0 +1,31 @@
 <|im_start|>system
 You are a knowledgeable assistant. You can answer questions and perform tasks.
 # Tools
 You may call one or more functions to assist with the user query.
 You are provided with function signatures within <tools></tools> XML tags:
 <tools>
 {"type": "function", "function": {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}
 </tools>
 For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
 <tool_call>
 {"name": <function-name>, "arguments": <args-json-object>}
 </tool_call><|im_end|>
 <|im_start|>user
 What's the weather like today in Paris?<|im_end|>
 <|im_start|>assistant
 <tool_call>
 {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}
 </tool_call><|im_end|>
 <|im_start|>user
 <tool_response>
 22
 </tool_response><|im_end|>
 <|im_start|>assistant
 The current temperature in Paris, France is 22 degrees Celsius.<|im_end|>
 <|im_start|>user
 What's the weather like today in San Francisco and Toronto?<|im_end|>
 <|im_start|>assistant
--- a/server/testdata/tools/tools.json
+++ b/server/testdata/tools/tools.json
--- a/server/testdata/tools/xlam.gotmpl
+++ b/server/testdata/tools/xlam.gotmpl
--- a/server/testdata/tools/xlam.out
+++ b/server/testdata/tools/xlam.out
--- a/tools/tools.go
+++ b/tools/tools.go
@ -0,0 +1,242 @@
 package tools
 import (
 	"errors"
 	"io"
 	"log/slog"
 	"strings"
 	gotmpl "text/template"
 	jsonv2 "github.com/go-json-experiment/json"
 	jsontext "github.com/go-json-experiment/json/jsontext"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/template"
 )
 // Parser incrementally extracts tool calls from text handed to Add.
 // Use NewParser to build one from a template.
 type Parser struct {
 	// tmpl is the tool-call template NewParser obtained from toolTemplate.
 	tmpl *gotmpl.Template
 	// prefix is the whitespace-trimmed result of toolPrefix; checkPrefix looks
 	// for it in the buffered input. It may be empty.
 	prefix string
 	// name is the JSON object key parseJSONToolCalls reads the function name from.
 	name string
 	// arguments is the JSON object key parseJSONToolCalls reads the function
 	// arguments from.
 	arguments string
 	// sb accumulates input across calls to Add until it is consumed or reset.
 	sb *strings.Builder
 	// index is the running counter Add advances for every tool call it returns.
 	index int
 	// greedyParse lets Add attempt JSON parsing before any prefix has been seen.
 	// NewParser enables it; Add disables it after the first parse attempt that
 	// is not waiting on more input.
 	greedyParse bool
 	// prefixFound records that checkPrefix has matched prefix at the start of
 	// the buffered input at least once.
 	prefixFound bool
 	// prefixPartial is set by checkPrefix when the buffered input ends with part
 	// of prefix or contains prefix after other text, and cleared when checkPrefix
 	// finds no trace of prefix.
 	prefixPartial bool
 	// Done is set by Add once a parse attempt has completed on a parser whose
 	// prefix is empty.
 	Done bool
 }
 // parseJSONToolCalls decodes s as JSON and builds a tool call from every
 // collected object that has a string under the parser's name key and an object
 // under its arguments key.
 //
 // When the decoder can read a leading JSON value from s, only that value is
 // unmarshalled; otherwise s is unmarshalled as given.
 //
 // Returns:
 //   - []api.ToolCall: the tool calls found, or nil when there are none
 //   - bool: true only when the JSON ended early and more input is needed
 func (p *Parser) parseJSONToolCalls(s string) ([]api.ToolCall, bool) {
 	// Narrow s to its first complete JSON value when one is available.
 	if first, readErr := jsontext.NewDecoder(strings.NewReader(s)).ReadValue(); readErr == nil {
 		s = first.String()
 	}
 	var decoded any
 	if err := jsonv2.Unmarshal([]byte(s), &decoded); err != nil {
 		truncated := errors.Is(err, io.ErrUnexpectedEOF) || err.Error() == "unexpected end of JSON input"
 		if !truncated {
 			slog.Debug("failed to unmarshal response", "error", err)
 			return nil, false
 		}
 		slog.Debug("incomplete JSON detected", "input", s)
 		return nil, true
 	}
 	// calls stays nil unless at least one object qualifies, so valid JSON
 	// without tool calls yields (nil, false).
 	var calls []api.ToolCall
 	for _, obj := range collect(decoded) {
 		fnName, hasName := obj[p.name].(string)
 		fnArgs, hasArgs := obj[p.arguments].(map[string]any)
 		if !hasName || !hasArgs {
 			continue
 		}
 		calls = append(calls, api.ToolCall{
 			Function: api.ToolCallFunction{
 				Name:      fnName,
 				Arguments: fnArgs,
 			},
 		})
 	}
 	return calls, false
 }
 // checkPrefix looks for the parser's prefix in s and decides how the caller
 // should proceed.
 //
 // Returns:
 //   - the text to act on: the input after the prefix, the content that precedes
 //     a (possibly partial) prefix, or the trimmed input
 //   - true when the trimmed input starts with the prefix
 //   - false when the caller should stop and either emit the returned text as
 //     content or wait for more input
 //
 // When a partial or embedded prefix is detected, the parser's buffer is
 // replaced with the text from that point on so the next pass starts there.
 func (p *Parser) checkPrefix(s string) (string, bool, bool) {
 	// raw keeps the untrimmed input; overlap and index checks run against it.
 	raw := s
 	trimmed := strings.TrimSpace(raw)
 	switch {
 	case trimmed == "":
 		return "", false, true
 	case p.prefix == "":
 		// Nothing to look for without a prefix.
 		return trimmed, false, true
 	case strings.HasPrefix(trimmed, p.prefix):
 		// Prefix leads the input: hand back what follows it.
 		return trimmed[len(p.prefix):], true, true
 	}
 	// The input may end with the beginning of the prefix.
 	if n := suffixOverlap(raw, p.prefix); n > 0 {
 		cut := len(raw) - n
 		p.prefixPartial = true
 		// Keep only the overlapping tail buffered; everything before it is returned.
 		p.sb.Reset()
 		p.sb.WriteString(raw[cut:])
 		return raw[:cut], false, false
 	}
 	// The prefix may sit after other text.
 	if at := strings.Index(raw, p.prefix); at != -1 {
 		p.prefixPartial = true
 		// Buffer from the prefix onward for the next pass.
 		p.sb.Reset()
 		p.sb.WriteString(strings.TrimSpace(raw[at:]))
 		return raw[:at], false, false
 	}
 	// No trace of the prefix.
 	p.prefixPartial = false
 	return trimmed, false, true
 }
 // Add appends s to the parser's buffer and tries to extract tool calls from
 // everything buffered so far.
 //
 // Returns:
 //   - tools: tool calls parsed from the buffer, each stamped with the parser's
 //     running index
 //   - content: text that is not part of a tool call
 //   - err: "prefix not found" when greedy parsing is off and no prefix has been
 //     seen; the buffer is discarded in that case
 //
 // All three results are zero when more input is needed, or when input that
 // followed a prefix turned out not to be a tool call and was dropped.
 func (p *Parser) Add(s string) (tools []api.ToolCall, content string, err error) {
 	p.sb.WriteString(s)
 	s = p.sb.String()
 	if len(s) == 0 {
 		return nil, "", nil
 	}
 	// Check for prefix pattern in input
 	s, prefixFound, shouldContinue := p.checkPrefix(s)
 	if !shouldContinue {
 		if s != "" {
 			// Return content before prefix
 			return nil, s, nil
 		}
 		// Need more input to complete prefix
 		return nil, "", nil
 	}
 	// Update prefix found state
 	if prefixFound {
 		p.prefixFound = true
 	}
 	// Exit if greedy parsing is off and the prefix has not been found
 	if !p.greedyParse && !p.prefixFound {
 		p.sb.Reset()
 		return nil, "", errors.New("prefix not found")
 	}
 	toolCalls, isPartial := p.parseJSONToolCalls(s)
 	if isPartial {
 		// Need more input to complete JSON
 		return nil, "", nil
 	}
 	// Do not try greedy parsing if partial JSON not found
 	p.greedyParse = false
 	// Handle invalid tool call format
 	if len(toolCalls) == 0 {
 		p.sb.Reset()
 		if p.prefix == "" {
 			p.Done = true
 		}
 		if p.prefixFound {
 			// Drop tokens since prefix was found
 			return nil, "", nil
 		}
 		return nil, s, nil
 	}
 	// Index into the slice: a range value variable is a copy, so assigning
 	// through it would leave every returned tool call with Index 0.
 	for i := range toolCalls {
 		toolCalls[i].Function.Index = p.index
 		p.index++
 	}
 	// Mark as done if no prefix needed
 	if p.prefix == "" {
 		p.Done = true
 	}
 	p.sb.Reset()
 	return toolCalls, "", nil
 }
 // NewParser creates a new tool call parser from a template. It extracts the tool call format,
 // prefix, and field names from the template to use for parsing tool calls from model output.
 //
 // The returned parser starts with greedy parsing enabled and an empty buffer.
 //
 // Returns an error if the template is nil or has no parse tree, cannot be
 // re-parsed, or does not contain valid tool call formatting.
 func NewParser(templateToProcess *gotmpl.Template) (*Parser, error) {
 	// Root is promoted from the embedded parse tree, so reading it on a nil or
 	// never-parsed template would panic instead of returning an error.
 	if templateToProcess == nil || templateToProcess.Tree == nil || templateToProcess.Root == nil {
 		return nil, errors.New("template is nil or has no parse tree")
 	}
 	parsed, err := template.Parse(templateToProcess.Root.String())
 	if err != nil {
 		return nil, err
 	}
 	if parsed == nil {
 		return nil, errors.New("failed to parse template")
 	}
 	tt, tc := toolTemplate(parsed)
 	if !tc {
 		return nil, errors.New("failed to find tool calls in template")
 	}
 	if tt == nil {
 		return nil, errors.New("failed to find tool template")
 	}
 	// The prefix is matched against trimmed model output, so trim it as well.
 	tp := strings.TrimSpace(toolPrefix(templateToProcess))
 	name, arguments, err := extractToolArgs(tt)
 	if err != nil {
 		return nil, err
 	}
 	return &Parser{
 		tmpl:        tt,
 		sb:          &strings.Builder{},
 		prefix:      tp,
 		greedyParse: true,
 		name:        name,
 		arguments:   arguments,
 	}, nil
 }
--- a/tools/tools_test.go
+++ b/tools/tools_test.go
@ -0,0 +1,482 @@
 package tools
 import (
 	"bytes"
 	"encoding/json"
 	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/template"
 )
 // readFile loads the file called name from directory base and returns its
 // contents wrapped in a buffer. Any read error fails the test immediately.
 func readFile(t *testing.T, base, name string) *bytes.Buffer {
 	t.Helper()
 	path := filepath.Join(base, name)
 	data, readErr := os.ReadFile(path)
 	if readErr != nil {
 		t.Fatal(readErr)
 	}
 	return bytes.NewBuffer(data)
 }
 func TestParseToolCalls(t *testing.T) {
 	p := filepath.Join("testdata")
 	t1 := api.ToolCall{
 		Function: api.ToolCallFunction{
 			Name: "get_current_weather",
 			Arguments: api.ToolCallFunctionArguments{
 				"format":   "fahrenheit",
 				"location": "San Francisco, CA",
 			},
 		},
 	}
 	t2 := api.ToolCall{
 		Function: api.ToolCallFunction{
 			Name: "get_current_weather",
 			Arguments: api.ToolCallFunctionArguments{
 				"format":   "celsius",
 				"location": "Toronto, Canada",
 			},
 		},
 	}
 	cases := []struct {
 		name             string
 		model            string
 		output           string
 		expectedToolCall []api.ToolCall
 		expectedTokens   string
 	}{
 		{
 			name:             "mistral malformed json with tool calls prefix",
 			model:            "mistral",
 			output:           `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_curren}]`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   "",
 		},
 		{
 			name:             "mistral multiple tool calls without prefix",
 			model:            "mistral",
 			output:           `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:  "mistral tool calls with text between no prefix",
 			model: "mistral",
 			output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] 
 			model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   `model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 		},
 		{
 			name:             "mistral valid json with tool calls prefix",
 			model:            "mistral",
 			output:           `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:  "mistral multiple tool calls with text between and prefix",
 			model: "mistral",
 			output: `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]
 			model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{t1, t2, t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:             "mistral incomplete json with tool calls prefix",
 			model:            "mistral",
 			output:           `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, `,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   "",
 		},
 		{
 			name:  "mistral invalid tool call with explanatory text no prefix",
 			model: "mistral",
 			output: `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function:
 		[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function: [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 		},
 		{
 			name:             "mistral tool calls without prefix",
 			model:            "mistral",
 			output:           `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:  "command r plus tool calls with json block format",
 			model: "command-r-plus",
 			output: "Action: ```json" + `
 		[
 		    {
 		        "tool_name": "get_current_weather",
 		        "parameters": {
 		            "format": "fahrenheit",
 		            "location": "San Francisco, CA"
 		        }
 		    },
 		    {
 		        "tool_name": "get_current_weather",
 		        "parameters": {
 		            "format": "celsius",
 		            "location": "Toronto, Canada"
 		        }
 		    }
 		]
 		` + "```",
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:             "firefunction tool calls with functools prefix",
 			model:            "firefunction",
 			output:           ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:  "llama3 groq single tool call with xml tags",
 			model: "llama3-groq-tool-use",
 			output: `<tool_call>
 		{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
 		</tool_call>`,
 			expectedToolCall: []api.ToolCall{t1},
 			expectedTokens:   "",
 		},
 		{
 			name:             "xlam tool calls with wrapper object",
 			model:            "xlam",
 			output:           `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:             "qwen2.5-coder single tool call with prefix",
 			model:            "qwen2.5-coder",
 			output:           `<tool_call>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}</tool_call>`,
 			expectedToolCall: []api.ToolCall{t1},
 			expectedTokens:   "",
 		},
 		{
 			name:             "qwen2.5-coder multiple tool calls with and without prefix",
 			model:            "qwen2.5-coder",
 			output:           `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} <tool_call>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}</tool_call> <tool_call>{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}</tool_call>`,
 			expectedToolCall: []api.ToolCall{t1, t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:             "qwen2.5-coder multiple tool calls without prefix",
 			model:            "qwen2.5-coder",
 			output:           `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:             "qwen2.5-coder plain text response no tool calls",
 			model:            "qwen2.5-coder",
 			output:           "The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.",
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   "The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.",
 		},
 		{
 			name:             "qwen2.5-coder tool calls with trailing text",
 			model:            "qwen2.5-coder",
 			output:           `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after call`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "some tokens after call",
 		},
 		{
 			name:             "qwen2.5 tool calls with prefix and trailing text",
 			model:            "qwen2.5-coder",
 			output:           `<tool_call> [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] </tool_call> some tokens after call`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:             "qwen2.5 tool calls without prefix and valid tool call",
 			model:            "qwen2.5-coder",
 			output:           `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
 			expectedToolCall: []api.ToolCall{t1, t2},
 			expectedTokens:   "",
 		},
 		{
 			name:             "qwen2.5 tool calls without prefix and invalid tool call",
 			model:            "qwen2.5-coder",
 			output:           `[{"options": "foo"}]`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   `[{"options": "foo"}]`,
 		},
 		{
 			name:             "qwen2.5 tool calls with prefix and invalid tool call",
 			model:            "qwen2.5-coder",
 			output:           `<tool_call> [{"options": "foo"}] </tool_call> `,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   ``,
 		},
 		{
 			name:             "qwen3 tool call with think prefix and tool prefix (sent as a single token)",
 			model:            "qwen3",
 			output:           `<think>Okay, let me think what tool we should use...</think><tool_call>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}</tool_call>`,
 			expectedToolCall: []api.ToolCall{t1},
 			expectedTokens:   "<think>Okay, let me think what tool we should use...</think>",
 		},
 		{
 			name:             "qwen3 tool call with think prefix, tool prefix, and whitespace (sent as separate tokens)",
 			model:            "qwen3",
 			output:           `<think>Okay, let me think what tool we should use...</think> <tool_call> {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 			expectedToolCall: []api.ToolCall{t1},
 			expectedTokens:   "<think>Okay, let me think what tool we should use...</think>",
 		},
 		{
 			name:             "qwen3 empty think prefix without tool prefix and invalid tool call",
 			model:            "qwen3",
 			output:           `<think></think>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   `<think></think>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 		},
 		{
 			name:             "qwen3 empty think prefix with tool prefix and valid tool call",
 			model:            "qwen3",
 			output:           `<think></think><tool_call>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}  </tool_call>`,
 			expectedToolCall: []api.ToolCall{t1},
 			expectedTokens:   `<think></think>`,
 		},
 		{
 			name:             "qwen3 invalid tool call with fake tool prefix (single rune suffix match)",
 			model:            "qwen3",
 			output:           `<think></think>< fakeout{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   `<think></think>< fakeout{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 		},
 		{
 			name:             "qwen3 invalid tool call with partial tool prefix (multiple rune suffix match)",
 			model:            "qwen3",
 			output:           `<think></think><tool_c fakeout{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   `<think></think><tool_c fakeout{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 		},
 		{
 			name:             "qwen3 invalid tool call with malformed tool prefix",
 			model:            "qwen3",
 			output:           `<think></think><tool_cfakeout {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   `<think></think><tool_cfakeout {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </tool_call>`,
 		},
 		{
 			name:             "llama3.2 valid tool call without prefix",
 			model:            "llama3.2",
 			output:           `{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}}`,
 			expectedToolCall: []api.ToolCall{t1},
 			expectedTokens:   "",
 		},
 		{
 			name:             "llama3.2 incomplete tool call without prefix",
 			model:            "llama3.2",
 			output:           `{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, `,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   "",
 		},
 		{
 			name:             "llama3.2 tool call with leading text",
 			model:            "llama3.2",
 			output:           `some non json text{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}}`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   `some non json text{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}}`,
 		},
 		{
 			name:             "llama3.2 tool call with invalid tool prefix (no prefix in template)",
 			model:            "llama3.2",
 			output:           `<tool_call>{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}}`,
 			expectedToolCall: []api.ToolCall{},
 			expectedTokens:   `<tool_call>{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}}`,
 		},
 	}
 	var tools []api.Tool
 	if err := json.Unmarshal(readFile(t, p, "tools.json").Bytes(), &tools); err != nil {
 		t.Fatal(err)
 	}
 	var messages []api.Message
 	if err := json.Unmarshal(readFile(t, p, "messages.json").Bytes(), &messages); err != nil {
 		t.Fatal(err)
 	}
 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
 			tmpl, err := template.Parse(readFile(t, p, fmt.Sprintf("%s.gotmpl", tt.model)).String())
 			if err != nil {
 				t.Fatal(err)
 			}
 			t.Run("template", func(t *testing.T) {
 				actual := &bytes.Buffer{} // Create new buffer for each test
 				if err := tmpl.Execute(actual, template.Values{Tools: tools, Messages: messages}); err != nil {
 					t.Fatal(err)
 				}
 				if diff := cmp.Diff(actual.String(), readFile(t, p, fmt.Sprintf("%s.out", tt.model)).String()); diff != "" {
 					t.Errorf("mismatch (-got +want):\n%s", diff)
 				}
 			})
 			t.Run("parse", func(t *testing.T) {
 				// fmt.Printf("tmpl: %s\n", tmpl.Root.String())
 				tp, err := NewParser(tmpl.Template)
 				if err != nil {
 					t.Fatal(err)
 				}
 				got := []api.ToolCall{}
 				var gotTokens strings.Builder
 				var add bool
 				tokens := strings.Fields(tt.output)
 				for _, tok := range tokens {
 					s := " " + tok
 					add = true
 					if !tp.Done {
 						toolCalls, content, err := tp.Add(s)
 						if err == nil {
 							if content != "" {
 								fmt.Printf("content: %q\n", content)
 								gotTokens.WriteString(content)
 								add = false
 							} else if len(toolCalls) > 0 {
 								got = append(got, toolCalls...)
 								add = false
 							} else {
 								add = false
 							}
 						}
 					}
 					if add {
 						gotTokens.WriteString(s)
 					}
 				}
 				// Compare tool calls if we expect any
 				if diff := cmp.Diff(got, tt.expectedToolCall); diff != "" {
 					t.Errorf("tool calls mismatch (-got +want):\n%s", diff)
 				}
 				// Compare tokens if we expect any
 				stripped := strings.TrimSpace(gotTokens.String())
 				if diff := cmp.Diff(stripped, tt.expectedTokens); diff != "" {
 					t.Log("actualTokens", stripped, "expectedTokens", tt.expectedTokens)
 					t.Errorf("tokens mismatch (-got +want):\n%s", diff)
 				}
 			})
 		})
 	}
 }
 // TestParseJSONToolCalls exercises parseJSONToolCalls directly with parsers
 // that only have their name and arguments keys set, checking the returned
 // tool calls and the "needs more input" flag.
 func TestParseJSONToolCalls(t *testing.T) {
 	tests := []struct {
 		name          string
 		input         string
 		parser        *Parser
 		wantToolCalls []api.ToolCall
 		wantPartial   bool
 		wantValid     bool
 	}{
 		{
 			name:   "valid single tool call",
 			input:  `{"name": "test_tool", "arguments": {"arg1": "value1"}}`,
 			parser: &Parser{name: "name", arguments: "arguments"},
 			wantToolCalls: []api.ToolCall{
 				{
 					Function: api.ToolCallFunction{
 						Name: "test_tool",
 						Arguments: map[string]any{
 							"arg1": "value1",
 						},
 					},
 				},
 			},
 			wantPartial: false,
 			wantValid:   true,
 		},
 		{
 			name:          "incomplete JSON",
 			input:         `{"name": "test_tool", "arguments": {"arg1": `,
 			parser:        &Parser{name: "name", arguments: "arguments"},
 			wantToolCalls: nil,
 			wantPartial:   true,
 			wantValid:     false,
 		},
 		{
 			name:          "invalid JSON",
 			input:         `not json at all`,
 			parser:        &Parser{name: "name", arguments: "arguments"},
 			wantToolCalls: nil,
 			wantPartial:   false,
 			wantValid:     false,
 		},
 		{
 			name:          "missing required fields",
 			input:         `{"other": "field"}`,
 			parser:        &Parser{name: "name", arguments: "arguments"},
 			wantToolCalls: nil,
 			wantPartial:   false,
 			wantValid:     false,
 		},
 		{
 			name: "multiple tool calls in array",
 			input: `[
 				{"name": "tool1", "arguments": {"arg1": 1}},
 				{"name": "tool2", "arguments": {"arg2": "value"}}
 			]`,
 			parser: &Parser{name: "name", arguments: "arguments"},
 			wantToolCalls: []api.ToolCall{
 				{
 					Function: api.ToolCallFunction{
 						Name: "tool1",
 						Arguments: map[string]any{
 							"arg1": float64(1),
 						},
 					},
 				},
 				{
 					Function: api.ToolCallFunction{
 						Name: "tool2",
 						Arguments: map[string]any{
 							"arg2": "value",
 						},
 					},
 				},
 			},
 			wantPartial: false,
 			wantValid:   true,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			gotCalls, gotPartial := tt.parser.parseJSONToolCalls(tt.input)
 			if gotPartial != tt.wantPartial {
 				t.Errorf("parseJSONToolCalls() partial = %v, want %v", gotPartial, tt.wantPartial)
 			}
 			// A result is valid when at least one tool call was extracted. Name
 			// the value so the failure message reports what was compared.
 			gotValid := len(gotCalls) > 0
 			if gotValid != tt.wantValid {
 				t.Errorf("parseJSONToolCalls() valid = %v, want %v", gotValid, tt.wantValid)
 			}
 			if diff := cmp.Diff(gotCalls, tt.wantToolCalls); diff != "" {
 				t.Errorf("parseJSONToolCalls() tool calls mismatch (-got +want):\n%s", diff)
 			}
 		})
 	}
 }
--- a/tools/utils.go
+++ b/tools/utils.go
@ -0,0 +1,257 @@
 package tools
 import (
 	"bytes"
 	"errors"
 	"log/slog"
 	"slices"
 	"strings"
 	gotmpl "text/template"
 	"text/template/parse"
 	jsonv2 "github.com/go-json-experiment/json"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/template"
 )
 // extractToolCallsFormat walks a template AST looking for the first if-statement whose
 // condition references ".ToolCalls" and returns the literal text at the start of its body.
 // The search is depth-first in source order through if, range and with bodies (including
 // their else branches) and stops at the first match. Only the leading run of text nodes in
 // the matching if-body is collected, so the text is empty when the body starts with an
 // action. The else branch of the matching if-statement is not inspected.
 // This is used to identify tool call prefixes and formatting.
 //
 // Returns:
 //   - string: The leading text of the first ".ToolCalls" if-body found
 //   - bool: Whether a ".ToolCalls" condition was found in the template
 func extractToolCallsFormat(tmpl *gotmpl.Template) (string, bool) {
 	if tmpl == nil || tmpl.Tree == nil || tmpl.Tree.Root == nil {
 		slog.Debug("extractToolCallsFormat: template or tree is nil")
 		return "", false
 	}
 	var result string
 	var found bool
 	var walk func(list *parse.ListNode)
 	walk = func(list *parse.ListNode) {
 		// Else branches are optional, so a nil list simply has nothing to visit.
 		if list == nil {
 			return
 		}
 		for _, node := range list.Nodes {
 			// A match in an earlier sibling (or one of its descendants) ends the search.
 			if found {
 				return
 			}
 			switch n := node.(type) {
 			case *parse.IfNode:
 				if isToolCallsNode(n) {
 					// Collect the text nodes at the start of the body, stopping at the
 					// first node that is not plain text.
 					var sb strings.Builder
 					if n.List != nil {
 						for _, inner := range n.List.Nodes {
 							text, ok := inner.(*parse.TextNode)
 							if !ok {
 								break
 							}
 							sb.Write(text.Text)
 						}
 					}
 					result = sb.String()
 					found = true
 					return
 				}
 				walk(n.List)
 				walk(n.ElseList)
 			case *parse.ListNode:
 				walk(n)
 			case *parse.RangeNode:
 				walk(n.List)
 				walk(n.ElseList)
 			case *parse.WithNode:
 				walk(n.List)
 				walk(n.ElseList)
 			}
 		}
 	}
 	walk(tmpl.Tree.Root)
 	return result, found
 }
 // isToolCallsNode reports whether the if-node's condition references a field chain that
 // contains the identifier "ToolCalls" (for example ".ToolCalls" or ".Message.ToolCalls").
 // Only field arguments that appear directly in a command of the pipeline are inspected.
 func isToolCallsNode(n *parse.IfNode) bool {
 	mentionsToolCalls := func(arg parse.Node) bool {
 		fieldNode, isField := arg.(*parse.FieldNode)
 		return isField && slices.Contains(fieldNode.Ident, "ToolCalls")
 	}
 	for _, command := range n.Pipe.Cmds {
 		if slices.ContainsFunc(command.Args, mentionsToolCalls) {
 			return true
 		}
 	}
 	return false
 }
 // TODO(parthsareen): get full prefix from the template instead of just the first token
 // toolPrefix returns the prefix for the tool call if it exists from a template.
 //
 // The text at the start of the template's ".ToolCalls" block is trimmed and scanned for a
 // bracketed tag. Both "<" and "[" count as openers and both ">" and "]" as closers; the
 // two kinds are not matched against each other. Each opener seen before the first closer
 // replaces the previous one, and a closer seen before any opener is ignored.
 //
 // Returns:
 //   - the tag including its brackets, when an opener is followed by a closer
 //   - the text preceding the last opener, when the tag is never closed
 //   - the first whitespace-separated word, when the text contains no opener
 //   - "" when the template has no ".ToolCalls" block or the block starts with no text
 func toolPrefix(tmpl *gotmpl.Template) string {
 	tokenText, ok := extractToolCallsFormat(tmpl)
 	if !ok {
 		return ""
 	}
 	tokenText = strings.TrimSpace(tokenText)
 	if tokenText == "" {
 		return ""
 	}
 	start, end := -1, -1
 	for i, r := range tokenText {
 		if r == '<' || r == '[' {
 			start = i
 		}
 		// A closer only counts once an opener has been seen, so end != -1 always
 		// implies start != -1.
 		if (r == '>' || r == ']') && start != -1 {
 			end = i
 			break
 		}
 	}
 	switch {
 	case end != -1:
 		// return the token including the [ or < and the ] or >
 		return tokenText[start : end+1]
 	case start != -1:
 		// get until the [ or < - in the case tag was not closed
 		return tokenText[:start]
 	default:
 		// No tag at all: fall back to the first word. tokenText is non-empty after
 		// trimming, so Fields yields at least one element.
 		return strings.Fields(tokenText)[0]
 	}
 }
 // toolTemplate asks t.Subtree for the part of the template selected by a predicate that
 // accepts range nodes whose pipeline identifiers include "ToolCalls".
 //
 // Returns:
 //   - *gotmpl.Template: The subtree returned by t.Subtree, or nil when there is none
 //   - bool: Whether a subtree was returned
 func toolTemplate(t *template.Template) (*gotmpl.Template, bool) {
 	rangesOverToolCalls := func(n parse.Node) bool {
 		rangeNode, isRange := n.(*parse.RangeNode)
 		if !isRange {
 			return false
 		}
 		return slices.Contains(template.Identifiers(rangeNode.Pipe), "ToolCalls")
 	}
 	if sub := t.Subtree(rangesOverToolCalls); sub != nil {
 		return sub, true
 	}
 	return nil, false
 }
 // suffixOverlap returns the length of the longest prefix of delim that is also a suffix
 // of s, measured in bytes.
 //
 // Returns:
 //   - int: The overlap length, or 0 when s does not end with any non-empty prefix of delim
 func suffixOverlap(s, delim string) int {
 	// The overlap can never be longer than the shorter of the two strings.
 	n := len(delim)
 	if len(s) < n {
 		n = len(s)
 	}
 	// Try the longest candidate first so the first hit is the answer.
 	for ; n > 0; n-- {
 		if strings.HasSuffix(s, delim[:n]) {
 			return n
 		}
 	}
 	return 0
 }
 // extractToolArgs renders the template with a single placeholder tool call, decodes the
 // output as JSON and inspects the first object found to learn which keys the template
 // uses for the tool name and for the tool arguments.
 //
 // A key whose value is a string is taken as the name key and a key whose value is an
 // object is taken as the arguments key. Map iteration order is unspecified, so if the
 // object holds several keys of the same kind the last one visited wins.
 //
 // Returns:
 //   - string: The key under which the template renders the tool name
 //   - string: The key under which the template renders the tool arguments
 //   - error: Error if the template fails to execute, its output is not valid JSON,
 //     no object is found, or either key is missing
 func extractToolArgs(tmpl *gotmpl.Template) (name, arguments string, err error) {
 	placeholder := api.ToolCall{
 		Function: api.ToolCallFunction{
 			Name: "@@name@@",
 			Arguments: api.ToolCallFunctionArguments{
 				"@@argument@@": 1,
 			},
 		},
 	}
 	var rendered bytes.Buffer
 	if err = tmpl.Execute(&rendered, map[string][]api.ToolCall{"ToolCalls": {placeholder}}); err != nil {
 		return "", "", err
 	}
 	var decoded any
 	if err = jsonv2.Unmarshal(rendered.Bytes(), &decoded); err != nil {
 		return "", "", err
 	}
 	// The rendered output may be a single object or an array; arrays are flattened so
 	// that the first object encountered can be inspected.
 	var candidates []map[string]any
 	switch value := decoded.(type) {
 	case map[string]any:
 		candidates = []map[string]any{value}
 	case []map[string]any:
 		candidates = value
 	case []any:
 		candidates = collect(value)
 	}
 	if len(candidates) == 0 {
 		return "", "", errors.New("no template objects found")
 	}
 	// find the keys that correspond to the name and arguments fields
 	for key, value := range candidates[0] {
 		if _, isString := value.(string); isString {
 			name = key
 		} else if _, isObject := value.(map[string]any); isObject {
 			arguments = key
 		}
 	}
 	if name == "" || arguments == "" {
 		slog.Debug("missing required fields in tool call template", "name", name, "arguments", arguments)
 		return "", "", errors.New("missing required fields in tool call template")
 	}
 	return name, arguments, nil
 }
 // collect gathers every map[string]any reachable from obj by descending through nested
 // map[string]any values and []any elements. A map is listed before the maps nested
 // inside it; slice elements are visited in order.
 //
 // Returns:
 //   - []map[string]any: All maps found, or nil when obj is neither a map nor a slice
 func collect(obj any) []map[string]any {
 	if m, isMap := obj.(map[string]any); isMap {
 		found := []map[string]any{m}
 		for _, child := range m {
 			found = append(found, collect(child)...)
 		}
 		return found
 	}
 	items, isSlice := obj.([]any)
 	if !isSlice {
 		return nil
 	}
 	var found []map[string]any
 	for _, item := range items {
 		found = append(found, collect(item)...)
 	}
 	return found
 }
--- a/tools/utils_test.go
+++ b/tools/utils_test.go
@ -0,0 +1,464 @@
 package tools
 import (
 	"testing"
 	gotmpl "text/template"
 	"github.com/ollama/ollama/template"
 )
 // TestExtractToolCallsFormat checks the text and found flag returned by
 // extractToolCallsFormat, first for templates that have no tree to walk and then for a
 // table of parsed templates.
 func TestExtractToolCallsFormat(t *testing.T) {
 	// Templates without a parse tree must be reported as "not found" rather than panic.
 	treeless := []struct {
 		name string
 		tmpl *gotmpl.Template
 	}{
 		{name: "nil template", tmpl: nil},
 		{name: "unparsed template", tmpl: gotmpl.New("test")},
 	}
 	for _, tc := range treeless {
 		t.Run(tc.name, func(t *testing.T) {
 			got, found := extractToolCallsFormat(tc.tmpl)
 			if got != "" {
 				t.Errorf("got text %q, want %q", got, "")
 			}
 			if found {
 				t.Errorf("got found %v, want %v", found, false)
 			}
 		})
 	}
 	cases := []struct {
 		name     string
 		template string
 		want     string
 		found    bool
 	}{
 		{
 			name:     "empty template",
 			template: "",
 			want:     "",
 			found:    false,
 		},
 		{
 			name:     "basic tool call with text",
 			template: "{{if .ToolCalls}}Hello world{{end}}",
 			want:     "Hello world",
 			found:    true,
 		},
 		{
 			name:     "tool call with json format",
 			template: "{{if .ToolCalls}}```json\n{{end}}",
 			want:     "```json\n",
 			found:    true,
 		},
 		{
 			name:     "tool call in range",
 			template: "{{range .ToolCalls}}tool: {{.}}{{end}}",
 			want:     "",
 			found:    false,
 		},
 		{
 			name:     "tool call with multiple text nodes",
 			template: "{{if .ToolCalls}}First text{{if .Something}}inner{{end}}Second text{{end}}",
 			want:     "First text",
 			found:    true,
 		},
 		{
 			name:     "nested if without tool calls",
 			template: "{{if .Something}}{{if .OtherThing}}text{{end}}{{end}}",
 			want:     "",
 			found:    false,
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			tmpl, err := gotmpl.New("test").Parse(tc.template)
 			if err != nil {
 				t.Fatalf("failed to parse template: %v", err)
 			}
 			got, found := extractToolCallsFormat(tmpl)
 			if got != tc.want {
 				t.Errorf("got text %q, want %q", got, tc.want)
 			}
 			if found != tc.found {
 				t.Errorf("got found %v, want %v", found, tc.found)
 			}
 		})
 	}
 }
 // TestToolPrefix checks the prefix toolPrefix derives from the text at the start of a
 // template's ".ToolCalls" block: complete tags, unclosed tags, plain words and empty bodies.
 func TestToolPrefix(t *testing.T) {
 	cases := []struct {
 		name     string
 		template string
 		want     string
 	}{
 		{
 			name:     "basic tool call with action prefix",
 			template: "{{if .ToolCalls}}Action: ```json{{end}}",
 			want:     "Action:",
 		},
 		{
 			name:     "incomplete functools bracket",
 			template: "{{if .ToolCalls}}functools[{{end}}",
 			want:     "functools",
 		},
 		{
 			name:     "tool call with angle brackets",
 			template: "{{if .ToolCalls}}Hello, world! <tool_call>{{end}}",
 			want:     "<tool_call>",
 		},
 		{
 			name:     "multiple tool call formats",
 			template: "{{if .ToolCalls}}[tool_call] <tool_call>{{end}}",
 			want:     "[tool_call]",
 		},
 		{
 			name:     "single angle bracket tool call",
 			template: "{{if .ToolCalls}}<tool_call>{{end}}",
 			want:     "<tool_call>",
 		},
 		{
 			name:     "incomplete angle bracket after tool call",
 			template: "{{if .ToolCalls}}[tool_call] <{{end}}",
 			want:     "[tool_call]",
 		},
 		{
 			name:     "angle bracket prefix with tool call",
 			template: "{{if .ToolCalls}}> <tool_call>{{end}}",
 			want:     "<tool_call>",
 		},
 		{
 			name:     "uppercase tool call with incomplete bracket",
 			template: "{{if .ToolCalls}}[TOOL_CALL] [{{end}}",
 			want:     "[TOOL_CALL]",
 		},
 		{
 			name:     "uppercase tool call with adjacent bracket",
 			template: "{{if .ToolCalls}}[TOOL_CALL][{{end}}",
 			want:     "[TOOL_CALL]",
 		},
 		{
 			name:     "tool call with pipe delimiters",
 			template: "{{if .ToolCalls}}<|tool_call|>{{end}}",
 			want:     "<|tool_call|>",
 		},
 		{
 			name:     "tool with no prefix",
 			template: "{{if .ToolCalls}}{{end}}",
 			want:     "",
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			tmpl, err := gotmpl.New("test").Parse(tc.template)
 			if err != nil {
 				t.Fatalf("failed to parse template: %v", err)
 			}
 			// Report failures under the name of the function actually under test.
 			if got := toolPrefix(tmpl); got != tc.want {
 				t.Errorf("toolPrefix(%q) = %q; want %q", tc.template, got, tc.want)
 			}
 		})
 	}
 }
 // TestToolTemplate checks whether toolTemplate reports a subtree for templates that do
 // and do not range over .ToolCalls. Each input is first parsed with text/template and its
 // rendered root is then re-parsed with the project's template package.
 func TestToolTemplate(t *testing.T) {
 	type testCase struct {
 		name     string
 		template string
 		want     bool
 	}
 	cases := []testCase{
 		{name: "basic tool call range", template: "{{range .ToolCalls}}test{{end}}", want: true},
 		{name: "no tool calls", template: "{{range .Other}}test{{end}}", want: false},
 		{name: "nested tool calls", template: "{{range .Outer}}{{range .ToolCalls}}test{{end}}{{end}}", want: true},
 		{name: "empty template", template: "", want: false},
 		{name: "tool calls in if statement", template: "{{if .ToolCalls}}test{{end}}", want: false},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			raw, err := gotmpl.New("test").Parse(tc.template)
 			if err != nil {
 				t.Fatalf("failed to parse template: %v", err)
 			}
 			source := raw.Root.String()
 			parsed, err := template.Parse(source)
 			if err != nil {
 				t.Fatalf("failed to parse template: %v", err)
 			}
 			if _, got := toolTemplate(parsed); got != tc.want {
 				t.Errorf("toolTemplate() = %v; want %v", got, tc.want)
 			}
 		})
 	}
 }
 // TestSuffixOverlap checks that suffixOverlap returns the length of the longest prefix of
 // the delimiter d that the string s ends with.
 func TestSuffixOverlap(t *testing.T) {
 	cases := []struct {
 		name string
 		s    string
 		d    string
 		want int
 	}{
 		{
 			// Both strings are non-empty and share nothing, which is distinct from the
 			// "empty delimiter" case below.
 			name: "no overlap",
 			s:    "hello world",
 			d:    "<tool_call>",
 			want: 0,
 		},
 		{
 			name: "full overlap",
 			s:    "<tool_call>",
 			d:    "<tool_call>",
 			want: 11,
 		},
 		{
 			name: "partial overlap",
 			s:    "text <tool_call>",
 			d:    "<tool_call>",
 			want: 11,
 		},
 		{
 			name: "delimiter longer than string",
 			s:    "<tool>",
 			d:    "<tool_call>",
 			want: 0,
 		},
 		{
 			name: "empty string",
 			s:    "",
 			d:    "<tool_call>",
 			want: 0,
 		},
 		{
 			name: "empty delimiter",
 			s:    "<tool_call>",
 			d:    "",
 			want: 0,
 		},
 		{
 			name: "single char overlap",
 			s:    "test<",
 			d:    "<tool_call>",
 			want: 1,
 		},
 		{
 			name: "partial tool call",
 			s:    "hello <tool_",
 			d:    "<tool_call>",
 			want: 6,
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			if got := suffixOverlap(tc.s, tc.d); got != tc.want {
 				t.Errorf("suffixOverlap(%q, %q) = %d; want %d", tc.s, tc.d, got, tc.want)
 			}
 		})
 	}
 }
 // TestExtractToolArgs checks the leading text and found flag that extractToolCallsFormat
 // returns for a range of ".ToolCalls" if-blocks.
 //
 // NOTE(review): despite its name, this test calls extractToolCallsFormat and never calls
 // extractToolArgs; the name is kept so existing test selectors keep working.
 func TestExtractToolArgs(t *testing.T) {
 	cases := []struct {
 		name     string
 		template string
 		want     string
 		ok       bool
 	}{
 		{
 			name:     "basic tool call with text after",
 			template: `{{if .ToolCalls}}tool response{{end}}`,
 			want:     "tool response",
 			ok:       true,
 		},
 		{
 			name:     "tool call with mixed content after",
 			template: `{{if .ToolCalls}}<tool_call>{{.Something}}{{end}}`,
 			want:     "<tool_call>",
 			ok:       true,
 		},
 		{
 			name:     "tool call with no text after",
 			template: `{{if .ToolCalls}}{{.Something}}{{end}}`,
 			want:     "",
 			ok:       true,
 		},
 		{
 			name:     "nested tool call",
 			template: `{{if .Something}}{{if .ToolCalls}}[TOOL_CALL]{{end}}{{end}}`,
 			want:     "[TOOL_CALL]",
 			ok:       true,
 		},
 		{
 			name:     "no tool calls",
 			template: `{{if .Something}}no tools here{{end}}`,
 			want:     "",
 			ok:       false,
 		},
 		{
 			name:     "empty template",
 			template: ``,
 			want:     "",
 			ok:       false,
 		},
 		{
 			name:     "multiple tool calls sections",
 			template: `{{if .ToolCalls}}first{{end}}{{if .ToolCalls}}second{{end}}`,
 			want:     "first",
 			ok:       true,
 		},
 		{
 			name:     "range over tool calls",
 			template: `{{if .ToolCalls}}{{range .ToolCalls}}tool{{end}}{{end}}`,
 			want:     "",
 			ok:       true,
 		},
 		{
 			name:     "tool calls with pipe delimiters",
 			template: `{{if .ToolCalls}}<|tool|>{{end}}`,
 			want:     "<|tool|>",
 			ok:       true,
 		},
 		{
 			name:     "tool calls with nested template",
 			template: `{{if .ToolCalls}}{{template "tool" .}}{{end}}`,
 			want:     "",
 			ok:       true,
 		},
 		{
 			name:     "tool calls with whitespace variations",
 			template: `{{if .ToolCalls}}  tool  {{end}}`,
 			want:     "  tool  ",
 			ok:       true,
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			tmpl, err := gotmpl.New("test").Parse(tc.template)
 			if err != nil {
 				t.Fatalf("failed to parse template: %v", err)
 			}
 			got, ok := extractToolCallsFormat(tmpl)
 			// Failure messages name the function that is actually called.
 			if got != tc.want {
 				t.Errorf("extractToolCallsFormat() got = %q, want %q", got, tc.want)
 			}
 			if ok != tc.ok {
 				t.Errorf("extractToolCallsFormat() ok = %v, want %v", ok, tc.ok)
 			}
 		})
 	}
 }
 // TestCollect checks that collect returns every nested map of its input, with each map
 // listed ahead of the maps it contains, and nil for a value that is not a map or slice.
 func TestCollect(t *testing.T) {
 	type testCase struct {
 		name string
 		obj  any
 		want []map[string]any
 	}
 	cases := []testCase{
 		{
 			name: "simple map",
 			obj:  map[string]any{"key": "value"},
 			want: []map[string]any{
 				{"key": "value"},
 			},
 		},
 		{
 			name: "nested map",
 			obj: map[string]any{
 				"outer": map[string]any{"inner": "value"},
 			},
 			want: []map[string]any{
 				{"outer": map[string]any{"inner": "value"}},
 				{"inner": "value"},
 			},
 		},
 		{
 			name: "array of maps",
 			obj: []any{
 				map[string]any{"key1": "val1"},
 				map[string]any{"key2": "val2"},
 			},
 			want: []map[string]any{
 				{"key1": "val1"},
 				{"key2": "val2"},
 			},
 		},
 		{
 			name: "deeply nested",
 			obj: map[string]any{
 				"l1": map[string]any{
 					"l2": map[string]any{"l3": "value"},
 				},
 			},
 			want: []map[string]any{
 				{"l1": map[string]any{"l2": map[string]any{"l3": "value"}}},
 				{"l2": map[string]any{"l3": "value"}},
 				{"l3": "value"},
 			},
 		},
 		{
 			name: "non-map value",
 			obj:  "string",
 			want: nil,
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			got := collect(tc.obj)
 			// A length mismatch makes the element-wise comparison meaningless, so stop here.
 			if len(got) != len(tc.want) {
 				t.Fatalf("collect() got %d maps, want %d", len(got), len(tc.want))
 			}
 			for i, want := range tc.want {
 				if mapsEqual(got[i], want) {
 					continue
 				}
 				t.Errorf("collect() map[%d] = %v, want %v", i, got[i], want)
 			}
 		})
 	}
 }
 // mapsEqual reports whether two maps hold the same keys with deeply equal values.
 // Nested map[string]any and []any values (the shapes produced by decoding JSON into any)
 // are compared recursively; every other value is compared with ==. Comparing slices
 // recursively matters because == on two interface values that both hold a slice panics
 // at run time.
 func mapsEqual(m1, m2 map[string]any) bool {
 	if len(m1) != len(m2) {
 		return false
 	}
 	var equal func(a, b any) bool
 	equal = func(a, b any) bool {
 		switch av := a.(type) {
 		case map[string]any:
 			bv, ok := b.(map[string]any)
 			return ok && mapsEqual(av, bv)
 		case []any:
 			bv, ok := b.([]any)
 			if !ok || len(av) != len(bv) {
 				return false
 			}
 			for i := range av {
 				if !equal(av[i], bv[i]) {
 					return false
 				}
 			}
 			return true
 		default:
 			// Scalars (strings, numbers, bools, nil) are comparable with ==.
 			return a == b
 		}
 	}
 	for k, v1 := range m1 {
 		v2, ok := m2[k]
 		if !ok || !equal(v1, v2) {
 			return false
 		}
 	}
 	return true
 }
Author	SHA1	Message	Date
ParthSareen	8ed95a4e96	add tests, organize, comments	2025-05-13 17:44:47 -07:00
ParthSareen	bc83789be9	tools package and utils	2025-05-13 17:44:45 -07:00
ParthSareen	4059b8db01	renaming and splitting stuff up	2025-05-13 17:43:15 -07:00
ParthSareen	b8b9c0c7cf	checkpoint	2025-05-13 17:43:15 -07:00
ParthSareen	779547fcde	checkpoint - cleanup still left, functionality setup	2025-05-13 17:43:15 -07:00
ParthSareen	6cb7494061	checkpoint for new parser TODO: - cleanup routes interface - internal/external states	2025-05-13 17:43:15 -07:00
ParthSareen	a44734b030	add new parser, tests, and templates	2025-05-13 17:43:15 -07:00
ParthSareen	b5a982ecb0	wip	2025-05-13 17:43:15 -07:00
ParthSareen	516a540df7	jsonv2 decoder	2025-05-13 17:43:15 -07:00
ParthSareen	7f2f996cd6	server/routes: catch when JSON tool was used	2025-05-13 17:43:15 -07:00
ParthSareen	610054a234	model: support tools streaming and improve parsing	2025-05-13 17:43:15 -07:00