From 7f69031491dec04ab57b30eace26186b6004388d Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 17 Dec 2024 11:22:32 -0800
Subject: [PATCH] llama: update vendored code to 081b29bd

---
 api/types.go                                  |   2 -
 cmd/interactive_test.go                       |   9 +-
 docs/api.md                                   |   4 -
 llama/amx.cpp                                 |   2 +-
 llama/amx.h                                   |   2 +-
 llama/clip.cpp                                |   2 +-
 llama/clip.h                                  |   2 +-
 llama/common.cpp                              |  21 +-
 llama/common.h                                |  17 +-
 llama/ggml-alloc.c                            |   3 +-
 llama/ggml-alloc.h                            |   2 +-
 llama/ggml-backend-impl.h                     |   2 +-
 llama/ggml-backend-reg.cpp                    |   2 +-
 llama/ggml-backend.cpp                        |   2 +-
 llama/ggml-backend.h                          |   2 +-
 llama/ggml-blas.cpp                           |   2 +-
 llama/ggml-blas.h                             |   2 +-
 llama/ggml-common.h                           |   2 +-
 llama/ggml-cpp.h                              |   2 +-
 llama/ggml-cpu-aarch64.cpp                    |   2 +-
 llama/ggml-cpu-aarch64.h                      |   2 +-
 llama/ggml-cpu-impl.h                         |   2 +-
 llama/ggml-cpu-quants.c                       |   2 +-
 llama/ggml-cpu-quants.h                       |   2 +-
 llama/ggml-cpu-traits.cpp                     |   2 +-
 llama/ggml-cpu-traits.h                       |   2 +-
 llama/ggml-cpu.c                              |   2 +-
 llama/ggml-cpu.cpp                            |   5 +-
 llama/ggml-cpu.h                              |   2 +-
 llama/ggml-cuda.h                             |   2 +-
 llama/ggml-cuda/acc.cu                        |   2 +-
 llama/ggml-cuda/acc.cuh                       |   2 +-
 llama/ggml-cuda/arange.cu                     |   2 +-
 llama/ggml-cuda/arange.cuh                    |   2 +-
 llama/ggml-cuda/argmax.cu                     |   2 +-
 llama/ggml-cuda/argmax.cuh                    |   2 +-
 llama/ggml-cuda/argsort.cu                    |   2 +-
 llama/ggml-cuda/argsort.cuh                   |   2 +-
 llama/ggml-cuda/binbcast.cu                   |   2 +-
 llama/ggml-cuda/binbcast.cuh                  |   2 +-
 llama/ggml-cuda/clamp.cu                      |   2 +-
 llama/ggml-cuda/clamp.cuh                     |   2 +-
 llama/ggml-cuda/common.cuh                    |   2 +-
 llama/ggml-cuda/concat.cu                     |   2 +-
 llama/ggml-cuda/concat.cuh                    |   2 +-
 llama/ggml-cuda/conv-transpose-1d.cu          |   2 +-
 llama/ggml-cuda/conv-transpose-1d.cuh         |   2 +-
 llama/ggml-cuda/convert.cu                    |   2 +-
 llama/ggml-cuda/convert.cuh                   |   2 +-
 llama/ggml-cuda/count-equal.cu                |   2 +-
 llama/ggml-cuda/count-equal.cuh               |   2 +-
 llama/ggml-cuda/cpy.cu                        |   2 +-
 llama/ggml-cuda/cpy.cuh                       |   2 +-
 llama/ggml-cuda/cross-entropy-loss.cu         |   2 +-
 llama/ggml-cuda/cross-entropy-loss.cuh        |   2 +-
 llama/ggml-cuda/dequantize.cuh                |   2 +-
 llama/ggml-cuda/diagmask.cu                   |   2 +-
 llama/ggml-cuda/diagmask.cuh                  |   2 +-
 llama/ggml-cuda/fattn-common.cuh              |   2 +-
 llama/ggml-cuda/fattn-tile-f16.cu             |   2 +-
 llama/ggml-cuda/fattn-tile-f16.cuh            |   2 +-
 llama/ggml-cuda/fattn-tile-f32.cu             |   2 +-
 llama/ggml-cuda/fattn-tile-f32.cuh            |   2 +-
 llama/ggml-cuda/fattn-vec-f16.cuh             |   2 +-
 llama/ggml-cuda/fattn-vec-f32.cuh             |   2 +-
 llama/ggml-cuda/fattn-wmma-f16.cuh            |   2 +-
 llama/ggml-cuda/fattn.cu                      |   2 +-
 llama/ggml-cuda/fattn.cuh                     |   2 +-
 llama/ggml-cuda/getrows.cu                    |   2 +-
 llama/ggml-cuda/getrows.cuh                   |   2 +-
 llama/ggml-cuda/ggml-cuda.cu                  |   2 +-
 llama/ggml-cuda/im2col.cu                     |   2 +-
 llama/ggml-cuda/im2col.cuh                    |   2 +-
 llama/ggml-cuda/mma.cuh                       |   2 +-
 llama/ggml-cuda/mmq.cu                        |   2 +-
 llama/ggml-cuda/mmq.cuh                       |   2 +-
 llama/ggml-cuda/mmv.cu                        |   2 +-
 llama/ggml-cuda/mmv.cuh                       |   2 +-
 llama/ggml-cuda/mmvq.cu                       |   2 +-
 llama/ggml-cuda/mmvq.cuh                      |   2 +-
 llama/ggml-cuda/norm.cu                       |   2 +-
 llama/ggml-cuda/norm.cuh                      |   2 +-
 llama/ggml-cuda/opt-step-adamw.cu             |   2 +-
 llama/ggml-cuda/opt-step-adamw.cuh            |   2 +-
 llama/ggml-cuda/out-prod.cu                   |   2 +-
 llama/ggml-cuda/out-prod.cuh                  |   2 +-
 llama/ggml-cuda/pad.cu                        |   2 +-
 llama/ggml-cuda/pad.cuh                       |   2 +-
 llama/ggml-cuda/pool2d.cu                     |   2 +-
 llama/ggml-cuda/pool2d.cuh                    |   2 +-
 llama/ggml-cuda/quantize.cu                   |   2 +-
 llama/ggml-cuda/quantize.cuh                  |   2 +-
 llama/ggml-cuda/rope.cu                       |   2 +-
 llama/ggml-cuda/rope.cuh                      |   2 +-
 llama/ggml-cuda/scale.cu                      |   2 +-
 llama/ggml-cuda/scale.cuh                     |   2 +-
 llama/ggml-cuda/softmax.cu                    |   2 +-
 llama/ggml-cuda/softmax.cuh                   |   2 +-
 llama/ggml-cuda/sum.cu                        |   2 +-
 llama/ggml-cuda/sum.cuh                       |   2 +-
 llama/ggml-cuda/sumrows.cu                    |   2 +-
 llama/ggml-cuda/sumrows.cuh                   |   2 +-
 .../fattn-vec-f16-instance-hs128-f16-f16.cu   |   2 +-
 .../fattn-vec-f16-instance-hs128-f16-q4_0.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-f16-q4_1.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-f16-q5_0.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-f16-q5_1.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-f16-q8_0.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_0-f16.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_0-q4_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_0-q4_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_0-q5_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_0-q5_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_0-q8_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_1-f16.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_1-q4_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_1-q4_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_1-q5_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_1-q5_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q4_1-q8_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_0-f16.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_0-q4_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_0-q4_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_0-q5_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_0-q5_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_0-q8_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_1-f16.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_1-q4_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_1-q4_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_1-q5_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_1-q5_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q5_1-q8_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q8_0-f16.cu  |   2 +-
 .../fattn-vec-f16-instance-hs128-q8_0-q4_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q8_0-q4_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q8_0-q5_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q8_0-q5_1.cu |   2 +-
 .../fattn-vec-f16-instance-hs128-q8_0-q8_0.cu |   2 +-
 .../fattn-vec-f16-instance-hs256-f16-f16.cu   |   2 +-
 .../fattn-vec-f16-instance-hs64-f16-f16.cu    |   2 +-
 .../fattn-vec-f16-instance-hs64-f16-q4_0.cu   |   2 +-
 .../fattn-vec-f16-instance-hs64-f16-q4_1.cu   |   2 +-
 .../fattn-vec-f16-instance-hs64-f16-q5_0.cu   |   2 +-
 .../fattn-vec-f16-instance-hs64-f16-q5_1.cu   |   2 +-
 .../fattn-vec-f16-instance-hs64-f16-q8_0.cu   |   2 +-
 .../fattn-vec-f32-instance-hs128-f16-f16.cu   |   2 +-
 .../fattn-vec-f32-instance-hs128-f16-q4_0.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-f16-q4_1.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-f16-q5_0.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-f16-q5_1.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-f16-q8_0.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_0-f16.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_0-q4_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_0-q4_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_0-q5_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_0-q5_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_0-q8_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_1-f16.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_1-q4_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_1-q4_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_1-q5_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_1-q5_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q4_1-q8_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_0-f16.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_0-q4_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_0-q4_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_0-q5_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_0-q5_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_0-q8_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_1-f16.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_1-q4_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_1-q4_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_1-q5_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_1-q5_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q5_1-q8_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q8_0-f16.cu  |   2 +-
 .../fattn-vec-f32-instance-hs128-q8_0-q4_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q8_0-q4_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q8_0-q5_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q8_0-q5_1.cu |   2 +-
 .../fattn-vec-f32-instance-hs128-q8_0-q8_0.cu |   2 +-
 .../fattn-vec-f32-instance-hs256-f16-f16.cu   |   2 +-
 .../fattn-vec-f32-instance-hs64-f16-f16.cu    |   2 +-
 .../fattn-vec-f32-instance-hs64-f16-q4_0.cu   |   2 +-
 .../fattn-vec-f32-instance-hs64-f16-q4_1.cu   |   2 +-
 .../fattn-vec-f32-instance-hs64-f16-q5_0.cu   |   2 +-
 .../fattn-vec-f32-instance-hs64-f16-q5_1.cu   |   2 +-
 .../fattn-vec-f32-instance-hs64-f16-q8_0.cu   |   2 +-
 .../fattn-wmma-f16-instance-kqfloat-cpb16.cu  |   2 +-
 .../fattn-wmma-f16-instance-kqfloat-cpb32.cu  |   2 +-
 .../fattn-wmma-f16-instance-kqhalf-cpb16.cu   |   2 +-
 .../fattn-wmma-f16-instance-kqhalf-cpb32.cu   |   2 +-
 .../fattn-wmma-f16-instance-kqhalf-cpb8.cu    |   2 +-
 .../template-instances/mmq-instance-iq1_s.cu  |   2 +-
 .../template-instances/mmq-instance-iq2_s.cu  |   2 +-
 .../template-instances/mmq-instance-iq2_xs.cu |   2 +-
 .../mmq-instance-iq2_xxs.cu                   |   2 +-
 .../template-instances/mmq-instance-iq3_s.cu  |   2 +-
 .../mmq-instance-iq3_xxs.cu                   |   2 +-
 .../template-instances/mmq-instance-iq4_nl.cu |   2 +-
 .../template-instances/mmq-instance-iq4_xs.cu |   2 +-
 .../template-instances/mmq-instance-q2_k.cu   |   2 +-
 .../template-instances/mmq-instance-q3_k.cu   |   2 +-
 .../template-instances/mmq-instance-q4_0.cu   |   2 +-
 .../template-instances/mmq-instance-q4_1.cu   |   2 +-
 .../template-instances/mmq-instance-q4_k.cu   |   2 +-
 .../template-instances/mmq-instance-q5_0.cu   |   2 +-
 .../template-instances/mmq-instance-q5_1.cu   |   2 +-
 .../template-instances/mmq-instance-q5_k.cu   |   2 +-
 .../template-instances/mmq-instance-q6_k.cu   |   2 +-
 .../template-instances/mmq-instance-q8_0.cu   |   2 +-
 llama/ggml-cuda/tsembd.cu                     |   2 +-
 llama/ggml-cuda/tsembd.cuh                    |   2 +-
 llama/ggml-cuda/unary.cu                      |   2 +-
 llama/ggml-cuda/unary.cuh                     |   2 +-
 llama/ggml-cuda/upscale.cu                    |   2 +-
 llama/ggml-cuda/upscale.cuh                   |   2 +-
 llama/ggml-cuda/vecdotq.cuh                   |   2 +-
 llama/ggml-cuda/vendors/cuda.h                |   2 +-
 llama/ggml-cuda/vendors/hip.h                 |   2 +-
 llama/ggml-cuda/vendors/musa.h                |   2 +-
 llama/ggml-cuda/wkv6.cu                       |   2 +-
 llama/ggml-cuda/wkv6.cuh                      |   2 +-
 llama/ggml-impl.h                             |  18 +-
 llama/ggml-metal-embed.metal                  |   6 +-
 llama/ggml-metal-impl.h                       |   2 +-
 llama/ggml-metal.h                            |   2 +-
 llama/ggml-metal.metal                        |   2 +-
 llama/ggml-metal_darwin_arm64.m               |   2 +-
 llama/ggml-quants.c                           |   2 +-
 llama/ggml-quants.h                           |   2 +-
 llama/ggml-threading.cpp                      |   2 +-
 llama/ggml-threading.h                        |   2 +-
 llama/ggml.c                                  |  71 ++--
 llama/ggml.h                                  |   2 +-
 llama/json-schema-to-grammar.cpp              |   2 +-
 llama/json-schema-to-grammar.h                |   2 +-
 llama/llama-grammar.cpp                       |   2 +-
 llama/llama-grammar.h                         |   2 +-
 llama/llama-impl.h                            |   2 +-
 llama/llama-sampling.cpp                      | 127 ++-----
 llama/llama-sampling.h                        |   2 +-
 llama/llama-vocab.cpp                         |   4 +-
 llama/llama-vocab.h                           |   2 +-
 llama/llama.cpp                               | 309 +++++++++++++++++-
 llama/llama.go                                |   2 -
 llama/llama.h                                 |  16 +-
 llama/llava.cpp                               |   2 +-
 llama/llava.h                                 |   2 +-
 llama/log.cpp                                 |   2 +-
 llama/log.h                                   |   2 +-
 llama/mmq.cpp                                 |   2 +-
 llama/mmq.h                                   |   2 +-
 llama/runner/runner.go                        |   2 -
 llama/sampling.cpp                            |  29 +-
 llama/sampling.h                              |   2 +-
 llama/sampling_ext.cpp                        |   1 -
 llama/sampling_ext.h                          |   1 -
 llama/unicode-data.cpp                        |   2 +-
 llama/unicode-data.h                          |   2 +-
 llama/unicode.cpp                             | 104 +++---
 llama/unicode.h                               |  21 +-
 llama/vendoring                               |   2 +-
 llm/server.go                                 |   1 -
 parser/parser_test.go                         |   1 -
 server/images.go                              |   7 +
 266 files changed, 760 insertions(+), 505 deletions(-)

diff --git a/api/types.go b/api/types.go
index 0ea0b9bf0..e56eb1684 100644
--- a/api/types.go
+++ b/api/types.go
@@ -225,7 +225,6 @@ type Options struct {
 	Mirostat         int      `json:"mirostat,omitempty"`
 	MirostatTau      float32  `json:"mirostat_tau,omitempty"`
 	MirostatEta      float32  `json:"mirostat_eta,omitempty"`
-	PenalizeNewline  bool     `json:"penalize_newline,omitempty"`
 	Stop             []string `json:"stop,omitempty"`
 }
 
@@ -602,7 +601,6 @@ func DefaultOptions() Options {
 		Mirostat:         0,
 		MirostatTau:      5.0,
 		MirostatEta:      0.1,
-		PenalizeNewline:  true,
 		Seed:             -1,
 
 		Runner: Runner{
diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go
index 118f42640..a0cfa5439 100644
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -63,17 +63,15 @@ func TestModelfileBuilder(t *testing.T) {
 			{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
 		},
 		Options: map[string]any{
-			"temperature":      0.9,
-			"seed":             42,
-			"penalize_newline": false,
-			"stop":             []string{"hi", "there"},
+			"temperature": 0.9,
+			"seed":        42,
+			"stop":        []string{"hi", "there"},
 		},
 	}
 
 	t.Run("model", func(t *testing.T) {
 		expect := `FROM hork
 SYSTEM You are part horse and part shark, but all hork. Do horklike things
-PARAMETER penalize_newline false
 PARAMETER seed 42
 PARAMETER stop hi
 PARAMETER stop there
@@ -92,7 +90,6 @@ MESSAGE assistant Yes it is true, I am half horse, half shark.
 		opts.ParentModel = "horseshark"
 		expect := `FROM horseshark
 SYSTEM You are part horse and part shark, but all hork. Do horklike things
-PARAMETER penalize_newline false
 PARAMETER seed 42
 PARAMETER stop hi
 PARAMETER stop there
diff --git a/docs/api.md b/docs/api.md
index 41605fc70..692ac4f5a 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -396,17 +396,13 @@ curl http://localhost:11434/api/generate -d '{
     "mirostat": 1,
     "mirostat_tau": 0.8,
     "mirostat_eta": 0.6,
-    "penalize_newline": true,
     "stop": ["\n", "user:"],
     "numa": false,
     "num_ctx": 1024,
     "num_batch": 2,
     "num_gpu": 1,
     "main_gpu": 0,
-    "low_vram": false,
-    "vocab_only": false,
     "use_mmap": true,
-    "use_mlock": false,
     "num_thread": 8
   }
 }'
diff --git a/llama/amx.cpp b/llama/amx.cpp
index 7e375ced1..b63fcda6f 100644
--- a/llama/amx.cpp
+++ b/llama/amx.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/amx.h b/llama/amx.h
index 384d7ecee..021ab73ca 100644
--- a/llama/amx.h
+++ b/llama/amx.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/clip.cpp b/llama/clip.cpp
index dafbc3236..1d7e126c5 100644
--- a/llama/clip.cpp
+++ b/llama/clip.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/clip.h b/llama/clip.h
index 4c64880e2..b44f7517c 100644
--- a/llama/clip.h
+++ b/llama/clip.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/common.cpp b/llama/common.cpp
index 0bf26ce0f..05b598864 100644
--- a/llama/common.cpp
+++ b/llama/common.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -966,6 +966,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
diff --git a/llama/common.h b/llama/common.h
index b5b0168b1..4c605f757 100644
--- a/llama/common.h
+++ b/llama/common.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -121,6 +121,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -156,7 +157,6 @@ struct common_params_sampling {
     int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau       = 5.00f; // target entropy
     float   mirostat_eta       = 0.10f; // learning rate
-    bool    penalize_nl        = false; // consider newlines as a repeatable token
     bool    ignore_eos         = false;
     bool    no_perf            = false; // disable performance metrics
     bool    timing_per_token   = false;
@@ -165,6 +165,7 @@ struct common_params_sampling {
 
 
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -219,11 +220,13 @@ struct common_params {
     float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
 
     // offload params
-    std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
-    int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
-    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
diff --git a/llama/ggml-alloc.c b/llama/ggml-alloc.c
index f5fd1fc21..ffdfb8977 100644
--- a/llama/ggml-alloc.c
+++ b/llama/ggml-alloc.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -560,7 +560,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
         hn->offset = offset;
-        return;
     }
 }
 
diff --git a/llama/ggml-alloc.h b/llama/ggml-alloc.h
index d17cd4f63..3dc1f5a1f 100644
--- a/llama/ggml-alloc.h
+++ b/llama/ggml-alloc.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-backend-impl.h b/llama/ggml-backend-impl.h
index f39d669bd..2e04f91be 100644
--- a/llama/ggml-backend-impl.h
+++ b/llama/ggml-backend-impl.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-backend-reg.cpp b/llama/ggml-backend-reg.cpp
index 31b4df87c..0baa8422f 100644
--- a/llama/ggml-backend-reg.cpp
+++ b/llama/ggml-backend-reg.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-backend.cpp b/llama/ggml-backend.cpp
index fbb697e51..956611a2a 100644
--- a/llama/ggml-backend.cpp
+++ b/llama/ggml-backend.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-backend.h b/llama/ggml-backend.h
index 9ce526889..83413fa9e 100644
--- a/llama/ggml-backend.h
+++ b/llama/ggml-backend.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-blas.cpp b/llama/ggml-blas.cpp
index 382909fee..97e269807 100644
--- a/llama/ggml-blas.cpp
+++ b/llama/ggml-blas.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-blas.h b/llama/ggml-blas.h
index b1f1d8a66..62278ada5 100644
--- a/llama/ggml-blas.h
+++ b/llama/ggml-blas.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-common.h b/llama/ggml-common.h
index f4b6189ba..01ad72d7a 100644
--- a/llama/ggml-common.h
+++ b/llama/ggml-common.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpp.h b/llama/ggml-cpp.h
index c23921a04..1066459ae 100644
--- a/llama/ggml-cpp.h
+++ b/llama/ggml-cpp.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu-aarch64.cpp b/llama/ggml-cpu-aarch64.cpp
index 3677698a7..39ac7a331 100644
--- a/llama/ggml-cpu-aarch64.cpp
+++ b/llama/ggml-cpu-aarch64.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu-aarch64.h b/llama/ggml-cpu-aarch64.h
index 86ac1142c..dc6376bbc 100644
--- a/llama/ggml-cpu-aarch64.h
+++ b/llama/ggml-cpu-aarch64.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu-impl.h b/llama/ggml-cpu-impl.h
index abdfb73a7..7bd64bb1e 100644
--- a/llama/ggml-cpu-impl.h
+++ b/llama/ggml-cpu-impl.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu-quants.c b/llama/ggml-cpu-quants.c
index b516f8fe2..08df46ff2 100644
--- a/llama/ggml-cpu-quants.c
+++ b/llama/ggml-cpu-quants.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu-quants.h b/llama/ggml-cpu-quants.h
index ca4d246ea..d02d9cf13 100644
--- a/llama/ggml-cpu-quants.h
+++ b/llama/ggml-cpu-quants.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu-traits.cpp b/llama/ggml-cpu-traits.cpp
index 00fce8813..c7d518ea9 100644
--- a/llama/ggml-cpu-traits.cpp
+++ b/llama/ggml-cpu-traits.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu-traits.h b/llama/ggml-cpu-traits.h
index 36aa251b5..51af5290b 100644
--- a/llama/ggml-cpu-traits.h
+++ b/llama/ggml-cpu-traits.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu.c b/llama/ggml-cpu.c
index b6797e3ab..b697fac09 100644
--- a/llama/ggml-cpu.c
+++ b/llama/ggml-cpu.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cpu.cpp b/llama/ggml-cpu.cpp
index eb21a55aa..104bb73f5 100644
--- a/llama/ggml-cpu.cpp
+++ b/llama/ggml-cpu.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -419,8 +419,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     switch (op->op) {
         case GGML_OP_CPY:
             return
+                op->type != GGML_TYPE_IQ3_XXS &&
+                op->type != GGML_TYPE_IQ3_S   &&
                 op->type != GGML_TYPE_IQ2_XXS &&
                 op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ2_S   &&
                 op->type != GGML_TYPE_IQ1_S   &&
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
diff --git a/llama/ggml-cpu.h b/llama/ggml-cpu.h
index fa135856a..f9d5c1b44 100644
--- a/llama/ggml-cpu.h
+++ b/llama/ggml-cpu.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda.h b/llama/ggml-cuda.h
index 5388c3c30..d8a9a74d7 100644
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/acc.cu b/llama/ggml-cuda/acc.cu
index a49aafc81..6db611b7b 100644
--- a/llama/ggml-cuda/acc.cu
+++ b/llama/ggml-cuda/acc.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/acc.cuh b/llama/ggml-cuda/acc.cuh
index e9b4c54e5..e0048b904 100644
--- a/llama/ggml-cuda/acc.cuh
+++ b/llama/ggml-cuda/acc.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/arange.cu b/llama/ggml-cuda/arange.cu
index e9d41ec4c..b26728c22 100644
--- a/llama/ggml-cuda/arange.cu
+++ b/llama/ggml-cuda/arange.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/arange.cuh b/llama/ggml-cuda/arange.cuh
index 600f4c4d3..039525ad0 100644
--- a/llama/ggml-cuda/arange.cuh
+++ b/llama/ggml-cuda/arange.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/argmax.cu b/llama/ggml-cuda/argmax.cu
index b84f2d467..6f916c432 100644
--- a/llama/ggml-cuda/argmax.cu
+++ b/llama/ggml-cuda/argmax.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/argmax.cuh b/llama/ggml-cuda/argmax.cuh
index 8fca051f7..940724cc1 100644
--- a/llama/ggml-cuda/argmax.cuh
+++ b/llama/ggml-cuda/argmax.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/argsort.cu b/llama/ggml-cuda/argsort.cu
index 90a1ecf91..2099b5de4 100644
--- a/llama/ggml-cuda/argsort.cu
+++ b/llama/ggml-cuda/argsort.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/argsort.cuh b/llama/ggml-cuda/argsort.cuh
index 17ffc03f4..b0069c0f0 100644
--- a/llama/ggml-cuda/argsort.cuh
+++ b/llama/ggml-cuda/argsort.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/binbcast.cu b/llama/ggml-cuda/binbcast.cu
index 89176cb8e..a891de6bc 100644
--- a/llama/ggml-cuda/binbcast.cu
+++ b/llama/ggml-cuda/binbcast.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/binbcast.cuh b/llama/ggml-cuda/binbcast.cuh
index f71cd10c3..78af15b92 100644
--- a/llama/ggml-cuda/binbcast.cuh
+++ b/llama/ggml-cuda/binbcast.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/clamp.cu b/llama/ggml-cuda/clamp.cu
index ae828ac9b..f6cd97ab0 100644
--- a/llama/ggml-cuda/clamp.cu
+++ b/llama/ggml-cuda/clamp.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/clamp.cuh b/llama/ggml-cuda/clamp.cuh
index 9ea28b9db..4a10db70e 100644
--- a/llama/ggml-cuda/clamp.cuh
+++ b/llama/ggml-cuda/clamp.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/common.cuh b/llama/ggml-cuda/common.cuh
index f46137c9e..7603ee3f3 100644
--- a/llama/ggml-cuda/common.cuh
+++ b/llama/ggml-cuda/common.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/concat.cu b/llama/ggml-cuda/concat.cu
index a2d4dbb95..13e25851a 100644
--- a/llama/ggml-cuda/concat.cu
+++ b/llama/ggml-cuda/concat.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/concat.cuh b/llama/ggml-cuda/concat.cuh
index 5fb80402f..9ab0b933c 100644
--- a/llama/ggml-cuda/concat.cuh
+++ b/llama/ggml-cuda/concat.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/conv-transpose-1d.cu b/llama/ggml-cuda/conv-transpose-1d.cu
index 7f4d76f18..73910fba0 100644
--- a/llama/ggml-cuda/conv-transpose-1d.cu
+++ b/llama/ggml-cuda/conv-transpose-1d.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/conv-transpose-1d.cuh b/llama/ggml-cuda/conv-transpose-1d.cuh
index 96f719515..a545dddb1 100644
--- a/llama/ggml-cuda/conv-transpose-1d.cuh
+++ b/llama/ggml-cuda/conv-transpose-1d.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/convert.cu b/llama/ggml-cuda/convert.cu
index b101e5e6e..24e234188 100644
--- a/llama/ggml-cuda/convert.cu
+++ b/llama/ggml-cuda/convert.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/convert.cuh b/llama/ggml-cuda/convert.cuh
index 6ea121967..5a2ada20d 100644
--- a/llama/ggml-cuda/convert.cuh
+++ b/llama/ggml-cuda/convert.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/count-equal.cu b/llama/ggml-cuda/count-equal.cu
index 0ae127151..4d29fd77c 100644
--- a/llama/ggml-cuda/count-equal.cu
+++ b/llama/ggml-cuda/count-equal.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/count-equal.cuh b/llama/ggml-cuda/count-equal.cuh
index abf20d980..e5b768526 100644
--- a/llama/ggml-cuda/count-equal.cuh
+++ b/llama/ggml-cuda/count-equal.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/cpy.cu b/llama/ggml-cuda/cpy.cu
index 47103d518..2e077dc04 100644
--- a/llama/ggml-cuda/cpy.cu
+++ b/llama/ggml-cuda/cpy.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/cpy.cuh b/llama/ggml-cuda/cpy.cuh
index 6c1860c22..139e499cd 100644
--- a/llama/ggml-cuda/cpy.cuh
+++ b/llama/ggml-cuda/cpy.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/cross-entropy-loss.cu b/llama/ggml-cuda/cross-entropy-loss.cu
index 5ab09f10d..1082ede47 100644
--- a/llama/ggml-cuda/cross-entropy-loss.cu
+++ b/llama/ggml-cuda/cross-entropy-loss.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/cross-entropy-loss.cuh b/llama/ggml-cuda/cross-entropy-loss.cuh
index 1f1e4c828..7643904a4 100644
--- a/llama/ggml-cuda/cross-entropy-loss.cuh
+++ b/llama/ggml-cuda/cross-entropy-loss.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/dequantize.cuh b/llama/ggml-cuda/dequantize.cuh
index 31ec4a261..e7b2f9667 100644
--- a/llama/ggml-cuda/dequantize.cuh
+++ b/llama/ggml-cuda/dequantize.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/diagmask.cu b/llama/ggml-cuda/diagmask.cu
index 89dc3b119..51bf18b2c 100644
--- a/llama/ggml-cuda/diagmask.cu
+++ b/llama/ggml-cuda/diagmask.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/diagmask.cuh b/llama/ggml-cuda/diagmask.cuh
index 54bdb98c0..ef2a4fd76 100644
--- a/llama/ggml-cuda/diagmask.cuh
+++ b/llama/ggml-cuda/diagmask.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-common.cuh b/llama/ggml-cuda/fattn-common.cuh
index 46a58b58f..338795d31 100644
--- a/llama/ggml-cuda/fattn-common.cuh
+++ b/llama/ggml-cuda/fattn-common.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-tile-f16.cu b/llama/ggml-cuda/fattn-tile-f16.cu
index 92ada9ecb..9ede4f690 100644
--- a/llama/ggml-cuda/fattn-tile-f16.cu
+++ b/llama/ggml-cuda/fattn-tile-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-tile-f16.cuh b/llama/ggml-cuda/fattn-tile-f16.cuh
index 8d79eb863..d5dfca977 100644
--- a/llama/ggml-cuda/fattn-tile-f16.cuh
+++ b/llama/ggml-cuda/fattn-tile-f16.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-tile-f32.cu b/llama/ggml-cuda/fattn-tile-f32.cu
index 1e0c0b71b..e7e13eba7 100644
--- a/llama/ggml-cuda/fattn-tile-f32.cu
+++ b/llama/ggml-cuda/fattn-tile-f32.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-tile-f32.cuh b/llama/ggml-cuda/fattn-tile-f32.cuh
index 7c3944b29..7272b04da 100644
--- a/llama/ggml-cuda/fattn-tile-f32.cuh
+++ b/llama/ggml-cuda/fattn-tile-f32.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-vec-f16.cuh b/llama/ggml-cuda/fattn-vec-f16.cuh
index 51485b1f5..d9d2b1a8d 100644
--- a/llama/ggml-cuda/fattn-vec-f16.cuh
+++ b/llama/ggml-cuda/fattn-vec-f16.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-vec-f32.cuh b/llama/ggml-cuda/fattn-vec-f32.cuh
index b317368e7..b3ab75780 100644
--- a/llama/ggml-cuda/fattn-vec-f32.cuh
+++ b/llama/ggml-cuda/fattn-vec-f32.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn-wmma-f16.cuh b/llama/ggml-cuda/fattn-wmma-f16.cuh
index babedef8a..745457059 100644
--- a/llama/ggml-cuda/fattn-wmma-f16.cuh
+++ b/llama/ggml-cuda/fattn-wmma-f16.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn.cu b/llama/ggml-cuda/fattn.cu
index a9c07bbf6..dbeb5344f 100644
--- a/llama/ggml-cuda/fattn.cu
+++ b/llama/ggml-cuda/fattn.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/fattn.cuh b/llama/ggml-cuda/fattn.cuh
index efe7e1c18..b48852350 100644
--- a/llama/ggml-cuda/fattn.cuh
+++ b/llama/ggml-cuda/fattn.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/getrows.cu b/llama/ggml-cuda/getrows.cu
index 74172cbd0..55b7311b4 100644
--- a/llama/ggml-cuda/getrows.cu
+++ b/llama/ggml-cuda/getrows.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/getrows.cuh b/llama/ggml-cuda/getrows.cuh
index 503e5a6df..9370b4f77 100644
--- a/llama/ggml-cuda/getrows.cuh
+++ b/llama/ggml-cuda/getrows.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/ggml-cuda.cu b/llama/ggml-cuda/ggml-cuda.cu
index dc71ded53..2449f1b03 100644
--- a/llama/ggml-cuda/ggml-cuda.cu
+++ b/llama/ggml-cuda/ggml-cuda.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/im2col.cu b/llama/ggml-cuda/im2col.cu
index 7ee597304..93d1113f0 100644
--- a/llama/ggml-cuda/im2col.cu
+++ b/llama/ggml-cuda/im2col.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/im2col.cuh b/llama/ggml-cuda/im2col.cuh
index 728a78916..d694d8023 100644
--- a/llama/ggml-cuda/im2col.cuh
+++ b/llama/ggml-cuda/im2col.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mma.cuh b/llama/ggml-cuda/mma.cuh
index 0cb75d795..f1c4d8532 100644
--- a/llama/ggml-cuda/mma.cuh
+++ b/llama/ggml-cuda/mma.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmq.cu b/llama/ggml-cuda/mmq.cu
index 965f0499f..021b762c8 100644
--- a/llama/ggml-cuda/mmq.cu
+++ b/llama/ggml-cuda/mmq.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmq.cuh b/llama/ggml-cuda/mmq.cuh
index 2498a6d09..3a8835e7e 100644
--- a/llama/ggml-cuda/mmq.cuh
+++ b/llama/ggml-cuda/mmq.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmv.cu b/llama/ggml-cuda/mmv.cu
index 932709b68..6bfe30a7f 100644
--- a/llama/ggml-cuda/mmv.cu
+++ b/llama/ggml-cuda/mmv.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmv.cuh b/llama/ggml-cuda/mmv.cuh
index 86575ccfa..5cd8922f7 100644
--- a/llama/ggml-cuda/mmv.cuh
+++ b/llama/ggml-cuda/mmv.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmvq.cu b/llama/ggml-cuda/mmvq.cu
index cdf7c7778..276d80873 100644
--- a/llama/ggml-cuda/mmvq.cu
+++ b/llama/ggml-cuda/mmvq.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/mmvq.cuh b/llama/ggml-cuda/mmvq.cuh
index 07aa2c4ef..c9a200cdc 100644
--- a/llama/ggml-cuda/mmvq.cuh
+++ b/llama/ggml-cuda/mmvq.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/norm.cu b/llama/ggml-cuda/norm.cu
index b12468f40..5c207af5b 100644
--- a/llama/ggml-cuda/norm.cu
+++ b/llama/ggml-cuda/norm.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/norm.cuh b/llama/ggml-cuda/norm.cuh
index 36a6a03c4..0f4a2951b 100644
--- a/llama/ggml-cuda/norm.cuh
+++ b/llama/ggml-cuda/norm.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/opt-step-adamw.cu b/llama/ggml-cuda/opt-step-adamw.cu
index 17ddb60df..42f32be08 100644
--- a/llama/ggml-cuda/opt-step-adamw.cu
+++ b/llama/ggml-cuda/opt-step-adamw.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/opt-step-adamw.cuh b/llama/ggml-cuda/opt-step-adamw.cuh
index 99f3da8cf..f7c9b7681 100644
--- a/llama/ggml-cuda/opt-step-adamw.cuh
+++ b/llama/ggml-cuda/opt-step-adamw.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/out-prod.cu b/llama/ggml-cuda/out-prod.cu
index cfcec7636..77ac57dfd 100644
--- a/llama/ggml-cuda/out-prod.cu
+++ b/llama/ggml-cuda/out-prod.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/out-prod.cuh b/llama/ggml-cuda/out-prod.cuh
index 3c7e747f8..bf22d0cb6 100644
--- a/llama/ggml-cuda/out-prod.cuh
+++ b/llama/ggml-cuda/out-prod.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/pad.cu b/llama/ggml-cuda/pad.cu
index 429c7132d..3506f55e9 100644
--- a/llama/ggml-cuda/pad.cu
+++ b/llama/ggml-cuda/pad.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/pad.cuh b/llama/ggml-cuda/pad.cuh
index c70f78875..e4030f7e7 100644
--- a/llama/ggml-cuda/pad.cuh
+++ b/llama/ggml-cuda/pad.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/pool2d.cu b/llama/ggml-cuda/pool2d.cu
index 8bb8fbd39..845c302df 100644
--- a/llama/ggml-cuda/pool2d.cu
+++ b/llama/ggml-cuda/pool2d.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/pool2d.cuh b/llama/ggml-cuda/pool2d.cuh
index d079a5a11..60a1029b8 100644
--- a/llama/ggml-cuda/pool2d.cuh
+++ b/llama/ggml-cuda/pool2d.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/quantize.cu b/llama/ggml-cuda/quantize.cu
index dd4eb9324..f4f18fdf2 100644
--- a/llama/ggml-cuda/quantize.cu
+++ b/llama/ggml-cuda/quantize.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/quantize.cuh b/llama/ggml-cuda/quantize.cuh
index c3672dfae..c69ea9881 100644
--- a/llama/ggml-cuda/quantize.cuh
+++ b/llama/ggml-cuda/quantize.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/rope.cu b/llama/ggml-cuda/rope.cu
index 9c61e8faf..ff351db47 100644
--- a/llama/ggml-cuda/rope.cu
+++ b/llama/ggml-cuda/rope.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/rope.cuh b/llama/ggml-cuda/rope.cuh
index f22911d24..c6420e245 100644
--- a/llama/ggml-cuda/rope.cuh
+++ b/llama/ggml-cuda/rope.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/scale.cu b/llama/ggml-cuda/scale.cu
index 76eb6f35d..feff5a12b 100644
--- a/llama/ggml-cuda/scale.cu
+++ b/llama/ggml-cuda/scale.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/scale.cuh b/llama/ggml-cuda/scale.cuh
index 8acab2e2b..f807772b7 100644
--- a/llama/ggml-cuda/scale.cuh
+++ b/llama/ggml-cuda/scale.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/softmax.cu b/llama/ggml-cuda/softmax.cu
index dc74bf605..9a664dd7a 100644
--- a/llama/ggml-cuda/softmax.cu
+++ b/llama/ggml-cuda/softmax.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/softmax.cuh b/llama/ggml-cuda/softmax.cuh
index ae179064b..178d5835b 100644
--- a/llama/ggml-cuda/softmax.cuh
+++ b/llama/ggml-cuda/softmax.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/sum.cu b/llama/ggml-cuda/sum.cu
index a7f475df9..6abb22d29 100644
--- a/llama/ggml-cuda/sum.cu
+++ b/llama/ggml-cuda/sum.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/sum.cuh b/llama/ggml-cuda/sum.cuh
index ebbc5f3cc..134514634 100644
--- a/llama/ggml-cuda/sum.cuh
+++ b/llama/ggml-cuda/sum.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/sumrows.cu b/llama/ggml-cuda/sumrows.cu
index 7cebb3d79..4cfa98595 100644
--- a/llama/ggml-cuda/sumrows.cu
+++ b/llama/ggml-cuda/sumrows.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/sumrows.cuh b/llama/ggml-cuda/sumrows.cuh
index 8c45f8309..890d149f3 100644
--- a/llama/ggml-cuda/sumrows.cuh
+++ b/llama/ggml-cuda/sumrows.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
index 6df43898a..fb8a6413f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
index 075ead721..d51ab412a 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
index 604cf7fa8..c68c98169 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
index 89abad5f1..1eb161955 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
index 9dc9a883a..96f14e1cf 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
index ef40f0dba..1d289ce78 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
index 20dfc61a6..4274d14b0 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
index 514b7731c..4f6a49e48 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
index ae2a66989..e73dc503b 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
index ea2ec19b8..33d21bce6 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
index 3298b1e41..277bb7d1f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
index 0243ee5c4..2bea2d234 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
index c2fd7666f..f9dcbe888 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
index 014f978c9..3c1c11479 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
index 23acfacf5..054f51c5a 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
index 10d4f84d2..3989dd786 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
index bbaa83367..0f51e825e 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
index 02e0dd320..07364b030 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
index f69195221..00a5e4db6 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
index 8131f6b14..d7110f1c6 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
index e3f9bdd4c..6be93270b 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
index c1c13c7fa..fb21ad841 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
index 6860e9555..cb2853875 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
index 5d5ebb7a8..b9323e9c2 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
index e4203928d..83d10682a 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
index 53daa03cb..18d3fe017 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
index 49489a958..f448877ed 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
index 936132ac0..0f7135742 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
index dccdd0343..5a9280230 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
index aa9606280..278e1778a 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
index 93f56461b..432af8f2f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
index 3c9db7a78..4245e1ed6 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
index f1e287875..c2582463a 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
index ecf18ad2b..c9f9119e3 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
index 4c74eebeb..1a832dca2 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
index fea31c915..5889ab1f8 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
index d4d464522..3e5ab335f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
index 5c8d298f5..c7b29fa96 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
index 76a17f036..3aacd0b6c 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
index eb692c818..1713d9ba0 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
index 85f6bede1..797cdbf50 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
index cfa78304e..264ccdb33 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
index 52c9eebaa..e525b7f68 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
index cfa4c2a50..c5717ce68 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
index 02aef31d5..7bae7853f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
index c7dce6a6e..e20053de7 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
index 3d0198668..9923d4ee0 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
index 283d91716..049a4139f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
index e33e64e1e..b7c334839 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
index 0f63d587f..834dd185e 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
index 4a9a2e951..ab1fa04a8 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
index b27ee133f..5b22898d6 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
index 7c55961df..5ca2a90c8 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
index 147bd03e1..d4ab31472 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
index b7a8e5246..34d5a4e6f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
index 52a97e00c..7e997ceb4 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
index 3ca391e32..220b966a8 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
index a3da427cf..8059a5967 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
index 2a975f5d6..8aaae726d 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
index 8f9f54d55..af82c8c5e 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
index 6bba7acc1..f4868aa85 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
index 92a7971a4..8a9218feb 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
index 23596d172..d4b2a48ce 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
index 42113656d..7810035ff 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
index 88e07e336..cf1d76be9 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
index 92e022510..8af0b435f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
index fdd0ffee0..a471c6f1a 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
index 0a44bece2..f84082876 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
index f6cd122f1..ae9781961 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
index 1d81b9500..8bdf58e7f 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
index 5b26a0813..a014fd4e0 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
index 003d7d68e..83b977d91 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
index 392b18fcf..0f0a91150 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
index 2e78b0ee3..7f004d2e2 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
index b425254b3..aa5442231 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
index 7d6344d87..0a838ec98 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
index 64daf5d35..a8b4b3fd3 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
index ac6db018e..1a704f5c6 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
index 001087c62..2a51a222c 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
index 5c68f760d..2372995d5 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
index d66d8c4fe..ad29aaf64 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
index 0d8b6b178..6ffa959a3 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
index aaafe33cf..1ee14e28d 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
index bbed377eb..7d097471d 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
index d047f0947..15d5e3c22 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
index cb61979c6..97efc77af 100644
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
index 59731a34c..a6b5fcf19 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
index 4c0d8b2c0..cce80fc55 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
index 2eca1711c..4d86450cc 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
index 3dd7ab0bb..1629af503 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
index 77e464244..3fba3533f 100644
--- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
+++ b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
index 75fc71443..1ec23c3fc 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
index f6618e14b..524b78f5b 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
index 1f7bf8feb..f1be2630f 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
index d801a2526..2f01d7848 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
index 1b7541edf..eabfd6e51 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
index 73372686f..f3d479283 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
index e0b7aa416..47376f651 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
index 56be2d974..e8feefee2 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu
index 60fa51aa9..270a0994f 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu
index f65cfd209..6e5686a42 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu
index da02c911a..113df2d2d 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu
index 34e8f679f..0102022c4 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu
index e9033e75f..15adf4814 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu
index 41b33713f..de9f47beb 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu
index 815654d13..608180f13 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu
index 93b2d0e06..665275aee 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu
index 72042bf24..42e4c687f 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu
index 2bc5b2ccf..e6050a49a 100644
--- a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/tsembd.cu b/llama/ggml-cuda/tsembd.cu
index 467c4dfc0..8dcd03036 100644
--- a/llama/ggml-cuda/tsembd.cu
+++ b/llama/ggml-cuda/tsembd.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/tsembd.cuh b/llama/ggml-cuda/tsembd.cuh
index 75137a59d..23dfa3ed3 100644
--- a/llama/ggml-cuda/tsembd.cuh
+++ b/llama/ggml-cuda/tsembd.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/unary.cu b/llama/ggml-cuda/unary.cu
index b86253caf..9e3f5b259 100644
--- a/llama/ggml-cuda/unary.cu
+++ b/llama/ggml-cuda/unary.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/unary.cuh b/llama/ggml-cuda/unary.cuh
index 7845f2ae7..3543a85c3 100644
--- a/llama/ggml-cuda/unary.cuh
+++ b/llama/ggml-cuda/unary.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/upscale.cu b/llama/ggml-cuda/upscale.cu
index 1a45c5748..8d733d175 100644
--- a/llama/ggml-cuda/upscale.cu
+++ b/llama/ggml-cuda/upscale.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/upscale.cuh b/llama/ggml-cuda/upscale.cuh
index 93116d183..52636d526 100644
--- a/llama/ggml-cuda/upscale.cuh
+++ b/llama/ggml-cuda/upscale.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/vecdotq.cuh b/llama/ggml-cuda/vecdotq.cuh
index 1f9606492..cee27cab7 100644
--- a/llama/ggml-cuda/vecdotq.cuh
+++ b/llama/ggml-cuda/vecdotq.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/vendors/cuda.h b/llama/ggml-cuda/vendors/cuda.h
index 07a2e6446..9914a0a1f 100644
--- a/llama/ggml-cuda/vendors/cuda.h
+++ b/llama/ggml-cuda/vendors/cuda.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/vendors/hip.h b/llama/ggml-cuda/vendors/hip.h
index 9e88e723c..bd7534bb5 100644
--- a/llama/ggml-cuda/vendors/hip.h
+++ b/llama/ggml-cuda/vendors/hip.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/vendors/musa.h b/llama/ggml-cuda/vendors/musa.h
index 8902cd967..d6747bd26 100644
--- a/llama/ggml-cuda/vendors/musa.h
+++ b/llama/ggml-cuda/vendors/musa.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/wkv6.cu b/llama/ggml-cuda/wkv6.cu
index d458e1afb..fb76f1008 100644
--- a/llama/ggml-cuda/wkv6.cu
+++ b/llama/ggml-cuda/wkv6.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-cuda/wkv6.cuh b/llama/ggml-cuda/wkv6.cuh
index 4d3df9feb..6a2e44ac0 100644
--- a/llama/ggml-cuda/wkv6.cuh
+++ b/llama/ggml-cuda/wkv6.cuh
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-impl.h b/llama/ggml-impl.h
index f5f0c7649..e43a549be 100644
--- a/llama/ggml-impl.h
+++ b/llama/ggml-impl.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -577,6 +577,22 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 
+// expose GGUF internals for test code
+
+GGML_API size_t gguf_type_size(enum gguf_type type);
+
+GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+GGML_API struct gguf_buf gguf_buf_init(size_t size);
+GGML_API void gguf_buf_free(struct gguf_buf buf);
+
+GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/llama/ggml-metal-embed.metal b/llama/ggml-metal-embed.metal
index f45d869e9..f560e5c9c 100644
--- a/llama/ggml-metal-embed.metal
+++ b/llama/ggml-metal-embed.metal
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -28,7 +28,7 @@
 #define GGML_COMMON_IMPL_METAL
 #if defined(GGML_METAL_EMBED_LIBRARY)
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -1911,7 +1911,7 @@ GGML_TABLE_END()
 #include "../ggml-common.h"
 #endif
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-metal-impl.h b/llama/ggml-metal-impl.h
index 982b6f9dc..d361e7728 100644
--- a/llama/ggml-metal-impl.h
+++ b/llama/ggml-metal-impl.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-metal.h b/llama/ggml-metal.h
index f8e84bf23..7d6a7e981 100644
--- a/llama/ggml-metal.h
+++ b/llama/ggml-metal.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-metal.metal b/llama/ggml-metal.metal
index 8552f726b..615ec0e3a 100644
--- a/llama/ggml-metal.metal
+++ b/llama/ggml-metal.metal
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-metal_darwin_arm64.m b/llama/ggml-metal_darwin_arm64.m
index 56d8a7549..c0b83c7d7 100644
--- a/llama/ggml-metal_darwin_arm64.m
+++ b/llama/ggml-metal_darwin_arm64.m
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-quants.c b/llama/ggml-quants.c
index 7cf946749..6d5432865 100644
--- a/llama/ggml-quants.c
+++ b/llama/ggml-quants.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-quants.h b/llama/ggml-quants.h
index 2edd3d878..a698f73ac 100644
--- a/llama/ggml-quants.h
+++ b/llama/ggml-quants.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-threading.cpp b/llama/ggml-threading.cpp
index 4d2c10f0f..741737968 100644
--- a/llama/ggml-threading.cpp
+++ b/llama/ggml-threading.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml-threading.h b/llama/ggml-threading.h
index baa20979c..bf5085f5d 100644
--- a/llama/ggml-threading.h
+++ b/llama/ggml-threading.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/ggml.c b/llama/ggml.c
index f836cba14..c1b47f191 100644
--- a/llama/ggml.c
+++ b/llama/ggml.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -6084,12 +6084,12 @@ struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, co
 
 struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
-    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
 }
 
 struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
-    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
 }
 
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
@@ -6536,7 +6536,7 @@ struct gguf_context {
     void * data;
 };
 
-static size_t gguf_type_size(enum gguf_type type) {
+size_t gguf_type_size(enum gguf_type type) {
     GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
     return GGUF_TYPE_SIZE[type];
 }
@@ -6664,13 +6664,7 @@ struct gguf_context * gguf_init_empty(void) {
     return ctx;
 }
 
-struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-    FILE * file = ggml_fopen(fname, "rb");
-    if (!file) {
-        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
-        return NULL;
-    }
-
+struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
     // offset from start of file
     size_t offset = 0;
 
@@ -6683,7 +6677,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         for (uint32_t i = 0; i < sizeof(magic); i++) {
             if (magic[i] != GGUF_MAGIC[i]) {
                 fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
-                fclose(file);
                 return NULL;
             }
         }
@@ -6694,7 +6687,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
     if (!ctx) {
         fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
-        fclose(file);
         return NULL;
     }
 
@@ -6712,7 +6704,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         if (ctx->header.version == 1) {
             fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6725,7 +6716,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         if (!ok) {
             fprintf(stderr, "%s: failed to read header\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6735,12 +6725,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         const uint64_t n_kv = ctx->header.n_kv;
 
-        ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
-        if (!ctx->kv) {
-            fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
+        if (n_kv > 0) {
+            ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
+            if (!ctx->kv) {
+                fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
+                gguf_free(ctx);
+                return NULL;
+            }
         }
 
         for (uint64_t i = 0; i < n_kv; ++i) {
@@ -6787,7 +6778,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     // prevent from integer overflow in the malloc below
                                     if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6795,7 +6785,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6807,7 +6796,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     // prevent from integer overflow in the malloc below
                                     if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6815,7 +6803,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6846,7 +6833,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         if (!ok) {
             fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6857,7 +6843,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
         if (!ctx->infos) {
             fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6893,7 +6878,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
-                fclose(file);
                 gguf_free(ctx);
                 return NULL;
             }
@@ -6936,7 +6920,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 // this tensor type support have been removed:
                 fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
                         __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
-                fclose(file);
                 gguf_free(ctx);
                 return NULL;
             }
@@ -6944,7 +6927,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             if (ne % ggml_blck_size(info->type) != 0) {
                 fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                         __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
-                fclose(file);
                 gguf_free(ctx);
                 return NULL;
             }
@@ -6976,7 +6958,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         *params.ctx = ggml_init(pdata);
         if (*params.ctx == NULL) {
             fprintf(stderr, "%s: failed to initialize context\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6995,7 +6976,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor data\n", __func__);
-                fclose(file);
                 ggml_free(ctx_data);
                 gguf_free(ctx);
                 return NULL;
@@ -7034,7 +7014,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         if (!ok) {
             fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
-            fclose(file);
             ggml_free(ctx_data);
             gguf_free(ctx);
             return NULL;
@@ -7043,11 +7022,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ggml_set_no_alloc(ctx_data, params.no_alloc);
     }
 
-    fclose(file);
-
     return ctx;
 }
 
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = ggml_fopen(fname, "rb");
+    if (!file) {
+        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
+        return NULL;
+    }
+
+    struct gguf_context * result = gguf_init_from_file_impl(file, params);
+    fclose(file);
+    return result;
+}
+
 void gguf_free(struct gguf_context * ctx) {
     if (ctx == NULL) {
         return;
@@ -7507,13 +7496,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
 //    fwrite(val, sizeof(char), size, file);
 //}
 
-struct gguf_buf {
-    void * data;
-    size_t size;
-    size_t offset;
-};
-
-static struct gguf_buf gguf_buf_init(size_t size) {
+struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
         /*buf.data   =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
         /*buf.size   =*/ size,
@@ -7523,7 +7506,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
     return buf;
 }
 
-static void gguf_buf_free(struct gguf_buf buf) {
+void gguf_buf_free(struct gguf_buf buf) {
     if (buf.data) {
         GGML_FREE(buf.data);
     }
@@ -7561,7 +7544,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
     buf->offset += el_size;
 }
 
-static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
     gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
     gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
diff --git a/llama/ggml.h b/llama/ggml.h
index b3be4485d..758afeabc 100644
--- a/llama/ggml.h
+++ b/llama/ggml.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/json-schema-to-grammar.cpp b/llama/json-schema-to-grammar.cpp
index 8ae99aafc..6b7a6d229 100644
--- a/llama/json-schema-to-grammar.cpp
+++ b/llama/json-schema-to-grammar.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/json-schema-to-grammar.h b/llama/json-schema-to-grammar.h
index b8a31467e..dfe21235b 100644
--- a/llama/json-schema-to-grammar.h
+++ b/llama/json-schema-to-grammar.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-grammar.cpp b/llama/llama-grammar.cpp
index a56f198a8..f44451e54 100644
--- a/llama/llama-grammar.cpp
+++ b/llama/llama-grammar.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-grammar.h b/llama/llama-grammar.h
index e6b92d7de..67c45199f 100644
--- a/llama/llama-grammar.h
+++ b/llama/llama-grammar.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-impl.h b/llama/llama-impl.h
index 99a71baea..7ecdf3bad 100644
--- a/llama/llama-impl.h
+++ b/llama/llama-impl.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-sampling.cpp b/llama/llama-sampling.cpp
index d9bce9e9b..a94a2bbdf 100644
--- a/llama/llama-sampling.cpp
+++ b/llama/llama-sampling.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -1422,19 +1422,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool    penalize_nl;
-    const bool    ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1447,76 +1443,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
         return;
     }
 
+    ctx->token_count[token]++;
+
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
+
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
+        }
+    }
+
     ctx->prev.push_back(token);
+
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
+    }
+
+    assert(ctx->token_count == tmp);
+#endif
 }
 
 static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
 
-    if (ctx->ignore_eos) {
-        assert(ctx->special_eos_id >= 0);
-
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
-        }
-    }
-
     if ((ctx->penalty_last_n == 0) ||
         (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
         return;
     }
 
-    bool nl_found = false;
-    size_t nl_idx = 0;
-    float nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
-
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-            nl_found = true;
-            nl_idx = ctx->linefeed_id;
-            nl_logit = cur_p->data[ctx->linefeed_id].logit;
-        } else {
-            // else, search for the linefeed token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->linefeed_id) {
-                    nl_found = true;
-                    nl_idx = i;
-                    nl_logit = cur_p->data[i].logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    // Create a frequency map to count occurrences of each token in last_tokens
-    // TODO: optimize this by maintaining the token count in the sampler context
-    using llama_token_cnt = std::unordered_map<llama_token, int>;
-    llama_token_cnt token_count;
-
-    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-        token_count[ctx->prev.rat(i)]++;
-    }
-
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
             continue;
         }
 
         const int count = token_iter->second;
 
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
         if (cur_p->data[i].logit <= 0) {
@@ -1529,30 +1499,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-            ctx->n_vocab,
-            ctx->special_eos_id,
-            ctx->linefeed_id,
             ctx->penalty_last_n,
             ctx->penalty_repeat,
             ctx->penalty_freq,
-            ctx->penalty_present,
-            ctx->penalize_nl,
-            ctx->ignore_eos);
+            ctx->penalty_present);
 
     // copy the state
     {
@@ -1578,38 +1539,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
-            /* .n_vocab         = */ n_vocab,
-            /* .special_eos_id  = */ special_eos_id,
-            /* .linefeed_id     = */ linefeed_id,
             /* .penalty_last_n  = */ penalty_last_n,
             /* .penalty_repeat  = */ penalty_repeat,
             /* .penalty_freq    = */ penalty_freq,
             /* .penalty_present = */ penalty_present,
-            /* .penalize_nl     = */ penalize_nl,
-            /* .ignore_eos      = */ ignore_eos,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
         },
     };
 }
@@ -1637,7 +1581,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
-            size_t word_len = word.size(), str_len = str.size();
+            size_t word_len = word.size();
+            size_t str_len = str.size();
             size_t pos = -1;
             while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                 bool match = true;
diff --git a/llama/llama-sampling.h b/llama/llama-sampling.h
index e6b2d0800..ede8f14f3 100644
--- a/llama/llama-sampling.h
+++ b/llama/llama-sampling.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama-vocab.cpp b/llama/llama-vocab.cpp
index 6d16e2a9f..ab810489a 100644
--- a/llama/llama-vocab.cpp
+++ b/llama/llama-vocab.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -764,7 +764,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) {  // finish previous word if any
diff --git a/llama/llama-vocab.h b/llama/llama-vocab.h
index c9e940a5d..888068dff 100644
--- a/llama/llama-vocab.h
+++ b/llama/llama-vocab.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llama.cpp b/llama/llama.cpp
index 938368687..1a861eac2 100644
--- a/llama/llama.cpp
+++ b/llama/llama.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -211,6 +211,7 @@ enum llm_arch {
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
+    LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
@@ -268,6 +269,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_OLMOE,           "olmoe"        },
     { LLM_ARCH_OPENELM,         "openelm"      },
     { LLM_ARCH_ARCTIC,          "arctic"       },
+    { LLM_ARCH_DEEPSEEK,        "deepseek"     },
     { LLM_ARCH_DEEPSEEK2,       "deepseek2"    },
     { LLM_ARCH_CHATGLM,         "chatglm"      },
     { LLM_ARCH_BITNET,          "bitnet"       },
@@ -1386,6 +1388,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DEEPSEEK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FREQS,         "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,      "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_DEEPSEEK2,
         {
@@ -1678,6 +1707,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
     LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_FALCON_3,
     LLM_CHAT_TEMPLATE_ZEPHYR,
     LLM_CHAT_TEMPLATE_MONARCH,
     LLM_CHAT_TEMPLATE_GEMMA,
@@ -1695,6 +1725,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_EXAONE_3,
     LLM_CHAT_TEMPLATE_RWKV_WORLD,
     LLM_CHAT_TEMPLATE_GRANITE,
+    LLM_CHAT_TEMPLATE_GIGACHAT,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
@@ -1709,6 +1740,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
     { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
+    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3          },
     { "zephyr",            LLM_CHAT_TEMPLATE_ZEPHYR            },
     { "monarch",           LLM_CHAT_TEMPLATE_MONARCH           },
     { "gemma",             LLM_CHAT_TEMPLATE_GEMMA             },
@@ -1726,6 +1758,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
     { "rwkv-world",        LLM_CHAT_TEMPLATE_RWKV_WORLD        },
     { "granite",           LLM_CHAT_TEMPLATE_GRANITE           },
+    { "gigachat",          LLM_CHAT_TEMPLATE_GIGACHAT          },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -6270,6 +6303,19 @@ static void llm_load_hparams(
                     model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+                switch (hparams.n_layer) {
+                    case 28: model.type = e_model::MODEL_20B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 bool is_lite = (hparams.n_layer == 27);
@@ -6611,6 +6657,11 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "falcon") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "falcon3") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
             } else if (
                     tokenizer_pre == "mpt") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
@@ -6622,6 +6673,7 @@ static void llm_load_vocab(
                     tokenizer_pre == "phi-2"   ||
                     tokenizer_pre == "jina-es" ||
                     tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "gigachat"   ||
                     tokenizer_pre == "jina-v1-en" ||
                     tokenizer_pre == "jina-v2-es" ||
                     tokenizer_pre == "jina-v2-de" ||
@@ -7274,6 +7326,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
 
+    if (model.arch == LLM_ARCH_DEEPSEEK) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
+    }
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q             = %d\n",     __func__, hparams.n_lora_q);
@@ -9104,6 +9163,55 @@ static bool llm_load_tensors(
                         layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
                     }
                 } break;
+            case LLM_ARCH_DEEPSEEK:
+                {
+
+                    const int64_t n_ff_exp        = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        } else {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        }
+                    }
+                } break;
             case LLM_ARCH_DEEPSEEK2:
                 {
                     const bool is_lite = (hparams.n_layer == 27);
@@ -15737,6 +15845,161 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_deepseek() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                ggml_tensor * moe_out =
+                        llm_build_moe_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, false,
+                            false, hparams.expert_weights_scale,
+                            cb, il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // FFN shared expert
+                {
+                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_deepseek2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
@@ -17580,6 +17843,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_arctic();
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                result = llm.build_deepseek();
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 result = llm.build_deepseek2();
@@ -20830,6 +21097,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
         case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
@@ -22659,6 +22927,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+        return LLM_CHAT_TEMPLATE_FALCON_3;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -22703,6 +22973,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_RWKV_WORLD;
     } else if (tmpl_contains("<|start_of_role|>")) {
         return LLM_CHAT_TEMPLATE_GRANITE;
+    } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
+        return LLM_CHAT_TEMPLATE_GIGACHAT;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -22809,6 +23081,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+        // Falcon 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
@@ -23026,6 +23307,32 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_of_role|>assistant<|end_of_role|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+        // GigaChat template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        // Handle system message if present
+        if (has_system) {
+            ss << "<s>" << chat[0]->content << "<|message_sep|>";
+        } else {
+            ss << "<s>";
+        }
+
+        // Process remaining messages
+        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+                << "available functions<|role_sep|>[]<|message_sep|>";
+            } else if (role == "assistant") {
+                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << "assistant<|role_sep|>";
+        }
     } else {
         // template not supported
         return -1;
diff --git a/llama/llama.go b/llama/llama.go
index c11d53411..46bed885f 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -671,7 +671,6 @@ type SamplingParams struct {
 	Mirostat       int
 	MirostatTau    float32
 	MirostatEta    float32
-	PenalizeNl     bool
 	Seed           uint32
 	Grammar        string
 }
@@ -690,7 +689,6 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
 	cparams.mirostat = C.int32_t(params.Mirostat)
 	cparams.mirostat_tau = C.float(params.MirostatTau)
 	cparams.mirostat_eta = C.float(params.MirostatEta)
-	cparams.penalize_nl = C.bool(params.PenalizeNl)
 	cparams.seed = C.uint32_t(params.Seed)
 
 	grammar := C.CString(params.Grammar)
diff --git a/llama/llama.h b/llama/llama.h
index a73aea997..e00dfa2ac 100644
--- a/llama/llama.h
+++ b/llama/llama.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -1170,16 +1170,12 @@ extern "C" {
                           const char * grammar_str,
                           const char * grammar_root);
 
+    /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
-                             int32_t   n_vocab,         // llama_n_vocab()
-                         llama_token   special_eos_id,  // llama_token_eos()
-                         llama_token   linefeed_id,     // llama_token_nl()
-                             int32_t   penalty_last_n,  // last n tokens to penalize (0 = disable penalty, -1 = context size)
-                               float   penalty_repeat,  // 1.0 = disabled
-                               float   penalty_freq,    // 0.0 = disabled
-                               float   penalty_present, // 0.0 = disabled
-                                bool   penalize_nl,     // consider newlines as a repeatable token
-                                bool   ignore_eos);     // ignore the end-of-sequence token
+                             int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
+                               float   penalty_repeat,   // 1.0 = disabled
+                               float   penalty_freq,     // 0.0 = disabled
+                               float   penalty_present); // 0.0 = disabled
 
     ///  @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
     LLAMA_API struct llama_sampler *    llama_sampler_init_dry(
diff --git a/llama/llava.cpp b/llama/llava.cpp
index 8e35e7c61..b7d72c23b 100644
--- a/llama/llava.cpp
+++ b/llama/llava.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/llava.h b/llama/llava.h
index 8f26901f9..787a2b2b2 100644
--- a/llama/llava.h
+++ b/llama/llava.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/log.cpp b/llama/log.cpp
index 9815dd68d..7854b4840 100644
--- a/llama/log.cpp
+++ b/llama/log.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/log.h b/llama/log.h
index 4fc59d608..24e27fd2d 100644
--- a/llama/log.h
+++ b/llama/log.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/mmq.cpp b/llama/mmq.cpp
index 3e2ce6295..1506c6da5 100644
--- a/llama/mmq.cpp
+++ b/llama/mmq.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/mmq.h b/llama/mmq.h
index 63773678c..7df3f3267 100644
--- a/llama/mmq.h
+++ b/llama/mmq.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 9771420ef..4dd2ea2b4 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -568,7 +568,6 @@ type Options struct {
 	Mirostat         int      `json:"mirostat"`
 	MirostatTau      float32  `json:"mirostat_tau"`
 	MirostatEta      float32  `json:"mirostat_eta"`
-	PenalizeNewline  bool     `json:"penalize_nl"`
 	Stop             []string `json:"stop"`
 }
 
@@ -640,7 +639,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 	samplingParams.Mirostat = req.Mirostat
 	samplingParams.MirostatTau = req.MirostatTau
 	samplingParams.MirostatEta = req.MirostatEta
-	samplingParams.PenalizeNl = req.PenalizeNewline
 	samplingParams.Seed = uint32(req.Seed)
 	samplingParams.Grammar = req.Grammar
 
diff --git a/llama/sampling.cpp b/llama/sampling.cpp
index 3d0345e02..361c07447 100644
--- a/llama/sampling.cpp
+++ b/llama/sampling.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -187,32 +187,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 params.logit_bias.size(),
                 params.logit_bias.data()));
 
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_penalties(
-                llama_n_vocab  (model),
-                llama_token_eos(model),
-                llama_token_nl (model),
-                params.penalty_last_n,
-                params.penalty_repeat,
-                params.penalty_freq,
-                params.penalty_present,
-                params.penalize_nl,
-                params.ignore_eos));
-
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
+                case COMMON_SAMPLER_TYPE_DRY:
                     {
-                        std::vector<const char*> c_breakers;
+                        std::vector<const char *> c_breakers;
                         c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto& str : params.dry_sequence_breakers) {
+                        for (const auto & str : params.dry_sequence_breakers) {
                             c_breakers.push_back(str.c_str());
                         }
 
                         llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                     }
-                        break;
+                    break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                     break;
@@ -234,6 +222,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 case COMMON_SAMPLER_TYPE_INFILL:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
                     break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
                 default:
                     GGML_ASSERT(false && "unknown sampler type");
             }
@@ -441,6 +432,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
         case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
         default : return '?';
     }
 }
@@ -455,6 +447,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
         case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
         default : return "";
     }
 }
@@ -469,6 +462,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
         { "xtc",         COMMON_SAMPLER_TYPE_XTC },
         { "infill",      COMMON_SAMPLER_TYPE_INFILL },
+        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
     };
 
     // since samplers names are written multiple ways
@@ -515,6 +509,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
     };
 
     std::vector<common_sampler_type> samplers;
diff --git a/llama/sampling.h b/llama/sampling.h
index 01c955e88..a7693f2ce 100644
--- a/llama/sampling.h
+++ b/llama/sampling.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/sampling_ext.cpp b/llama/sampling_ext.cpp
index 030864a13..0f137dc8d 100644
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@@ -18,7 +18,6 @@ struct common_sampler *common_sampler_cinit(const struct llama_model *model, str
         sparams.mirostat = params->mirostat;
         sparams.mirostat_tau = params->mirostat_tau;
         sparams.mirostat_eta = params->mirostat_eta;
-        sparams.penalize_nl = params->penalize_nl;
         sparams.seed = params->seed;
         sparams.grammar = params->grammar;
         sparams.xtc_probability = 0.0;
diff --git a/llama/sampling_ext.h b/llama/sampling_ext.h
index 1bd355f8f..39f499f19 100644
--- a/llama/sampling_ext.h
+++ b/llama/sampling_ext.h
@@ -23,7 +23,6 @@ extern "C"
         int32_t mirostat;
         float mirostat_tau;
         float mirostat_eta;
-        bool penalize_nl;
         uint32_t seed;
         char *grammar;
     };
diff --git a/llama/unicode-data.cpp b/llama/unicode-data.cpp
index b22fad9b1..e903ea926 100644
--- a/llama/unicode-data.cpp
+++ b/llama/unicode-data.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/unicode-data.h b/llama/unicode-data.h
index f61b4744d..eb5743940 100644
--- a/llama/unicode-data.h
+++ b/llama/unicode-data.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
diff --git a/llama/unicode.cpp b/llama/unicode.cpp
index 4bfa4cdcc..a4033d4f6 100644
--- a/llama/unicode.cpp
+++ b/llama/unicode.cpp
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -102,15 +102,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
     throw std::invalid_argument("failed to convert utf8 to codepoint");
 }
 
-//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
 //    std::vector<uint16_t> result;
-//    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-//        result.emplace_back(cp);
+//    if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
+//        result.emplace_back(cpt);
 //        return result;
 //    }
-//    if (0x10000 <= cp && cp <= 0x10ffff) {
-//        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-//        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
+//    if (0x10000 <= cpt && cpt <= 0x10ffff) {
+//        result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
+//        result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
 //        return result;
 //    }
 //    throw std::invalid_argument("failed to convert codepoint to utf16");
@@ -151,8 +151,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 //    return result;
 //}
 
-static std::vector<codepoint_flags> unicode_cpt_flags_array() {
-    std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
+static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
+    std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
 
     assert (unicode_ranges_flags.begin()[0].first == 0);
     assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
@@ -301,8 +301,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
         };
 
         size_t _prev_end = offset_ini;
@@ -419,8 +419,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
         };
 
         size_t _prev_end = offset_ini;
@@ -620,29 +620,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
 // interface
 //
 
-std::string unicode_cpt_to_utf8(uint32_t cp) {
+std::string unicode_cpt_to_utf8(uint32_t cpt) {
     std::string result;
 
-    if (/* 0x00 <= cp && */ cp <= 0x7f) {
-        result.push_back(cp);
+    if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
+        result.push_back(cpt);
         return result;
     }
-    if (0x80 <= cp && cp <= 0x7ff) {
-        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x80 <= cpt && cpt <= 0x7ff) {
+        result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
+        result.push_back(0x80 | (cpt & 0x3f));
         return result;
     }
-    if (0x800 <= cp && cp <= 0xffff) {
-        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x800 <= cpt && cpt <= 0xffff) {
+        result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
+        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+        result.push_back(0x80 | (cpt & 0x3f));
         return result;
     }
-    if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.push_back(0xf0 | ((cp >> 18) & 0x07));
-        result.push_back(0x80 | ((cp >> 12) & 0x3f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x10000 <= cpt && cpt <= 0x10ffff) {
+        result.push_back(0xf0 | ((cpt >> 18) & 0x07));
+        result.push_back(0x80 | ((cpt >> 12) & 0x3f));
+        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+        result.push_back(0x80 | (cpt & 0x3f));
         return result;
     }
 
@@ -672,19 +672,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     return result;
 }
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
+    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
     static const auto cpt_flags = unicode_cpt_flags_array();
-    return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
+    return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
 }
 
-codepoint_flags unicode_cpt_flags(const std::string & utf8) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
+    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
     if (utf8.empty()) {
         return undef;  // undefined
     }
     size_t offset = 0;
-    return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
+    return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
 }
 
 std::string unicode_byte_to_utf8(uint8_t byte) {
@@ -697,41 +697,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
     return map.at(utf8);
 }
 
-uint32_t unicode_tolower(uint32_t cp) {
+uint32_t unicode_tolower(uint32_t cpt) {
     // binary search
-    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
         [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
             return pair.first < value;
         });
-    if (it != unicode_map_lowercase.end() && it->first == cp) {
+    if (it != unicode_map_lowercase.end() && it->first == cpt) {
         return it->second;
     }
-    return cp;  // Return the original code point if no lowercase mapping is found
+    return cpt;  // Return the original code point if no lowercase mapping is found
 }
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
-        { "\\p{N}", codepoint_flags::NUMBER },
-        { "\\p{L}", codepoint_flags::LETTER },
-        { "\\p{P}", codepoint_flags::PUNCTUATION },
+        { "\\p{N}", unicode_cpt_flags::NUMBER },
+        { "\\p{L}", unicode_cpt_flags::LETTER },
+        { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
     };
 
     static const std::map<int, int> k_ucat_cpt = {
-        { codepoint_flags::NUMBER,        0xD1 },
-        { codepoint_flags::LETTER,        0xD2 },
-        { codepoint_flags::PUNCTUATION,   0xD3 },
+        { unicode_cpt_flags::NUMBER,      0xD1 },
+        { unicode_cpt_flags::LETTER,      0xD2 },
+        { unicode_cpt_flags::PUNCTUATION, 0xD3 },
     };
 
     static const std::map<int, std::string> k_ucat_map = {
-        { codepoint_flags::NUMBER,        "\x30-\x39" }, // 0-9
-        { codepoint_flags::LETTER,        "\x41-\x5A\x61-\x7A" }, // A-Za-z
-        { codepoint_flags::PUNCTUATION,   "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+        { unicode_cpt_flags::NUMBER,      "\x30-\x39" }, // 0-9
+        { unicode_cpt_flags::LETTER,      "\x41-\x5A\x61-\x7A" }, // A-Za-z
+        { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
     };
 
     // compute collapsed codepoints only if needed by at least one regex
     bool need_collapse = false;
-    for (auto & regex_expr : regex_exprs) {
+    for (const auto & regex_expr : regex_exprs) {
         // search for unicode categories
         for (const auto & ucat : k_ucat_enum) {
             if (std::string::npos != regex_expr.find(ucat.first)) {
@@ -757,7 +757,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 continue;
             }
 
-            const auto flags = unicode_cpt_flags(cpts[i]);
+            const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
 
             if (flags.is_whitespace) {
                 //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
@@ -773,7 +773,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
 
     std::vector<size_t> bpe_offsets = { cpts.size() };
 
-    for (auto & regex_expr : regex_exprs) {
+    for (const auto & regex_expr : regex_exprs) {
         // first, see if we have an efficient custom regex implementation
         auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
 
@@ -787,7 +787,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
             // with the corresponding collapsed representation
             bool use_collapsed = false;
-            for (auto & ucat : k_ucat_enum) {
+            for (const auto & ucat : k_ucat_enum) {
                 if (std::string::npos != regex_expr.find(ucat.first)) {
                     use_collapsed = true;
                     break;
@@ -853,7 +853,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
                 std::wstring wtext(cpts.begin(), cpts.end());
                 for (size_t i = 0; i < wtext.size(); ++i) {
-                    if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
+                    if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
                         wtext[i] = 0x0B;
                     }
                 }
diff --git a/llama/unicode.h b/llama/unicode.h
index eca7da920..c28ba9cba 100644
--- a/llama/unicode.h
+++ b/llama/unicode.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 - do not edit this file
  *
  * MIT License
  *
@@ -30,9 +30,7 @@
 #include <string>
 #include <vector>
 
-// TODO: prefix all symbols with "llama_"
-
-struct codepoint_flags {
+struct unicode_cpt_flags {
     enum {
         UNDEFINED       = 0x0001,
         NUMBER          = 0x0002,  // regex: \p{N}
@@ -61,7 +59,7 @@ struct codepoint_flags {
     uint16_t is_nfd         : 1;
 
     // decode from uint16
-    inline codepoint_flags(const uint16_t flags=0) {
+    inline unicode_cpt_flags(const uint16_t flags = 0) {
         *reinterpret_cast<uint16_t*>(this) = flags;
     }
 
@@ -76,18 +74,19 @@ struct codepoint_flags {
 
 size_t unicode_len_utf8(char src);
 
-std::string unicode_cpt_to_utf8(uint32_t cp);
-uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+std::string unicode_cpt_to_utf8  (uint32_t cpt);
+uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp);
-codepoint_flags unicode_cpt_flags(const std::string & utf8);
+unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
 
 std::string unicode_byte_to_utf8(uint8_t byte);
-uint8_t unicode_utf8_to_byte(const std::string & utf8);
+uint8_t     unicode_utf8_to_byte(const std::string & utf8);
 
-uint32_t unicode_tolower(uint32_t cp);
+uint32_t unicode_tolower(uint32_t cpt);
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
diff --git a/llama/vendoring b/llama/vendoring
index f36fbcc6f..cb1ac85ea 100644
--- a/llama/vendoring
+++ b/llama/vendoring
@@ -1 +1 @@
-LLAMACPP_BASE_COMMIT=ba1cb19cdd0d92e012e0f6e009e0620f854b6afd
+LLAMACPP_BASE_COMMIT=081b29bd2a3d91e7772e3910ce223dd63b8d7d26
diff --git a/llm/server.go b/llm/server.go
index bb9062adc..89e5f54a6 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -692,7 +692,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"mirostat":          req.Options.Mirostat,
 		"mirostat_tau":      req.Options.MirostatTau,
 		"mirostat_eta":      req.Options.MirostatEta,
-		"penalize_nl":       req.Options.PenalizeNewline,
 		"seed":              req.Options.Seed,
 		"stop":              req.Options.Stop,
 		"image_data":        req.Images,
diff --git a/parser/parser_test.go b/parser/parser_test.go
index b5614c2ed..698de4368 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -494,7 +494,6 @@ func TestParseFileParameters(t *testing.T) {
 		"mirostat 1":                   {"mirostat", "1"},
 		"mirostat_tau 1.0":             {"mirostat_tau", "1.0"},
 		"mirostat_eta 1.0":             {"mirostat_eta", "1.0"},
-		"penalize_newline true":        {"penalize_newline", "true"},
 		"stop ### User:":               {"stop", "### User:"},
 		"stop ### User: ":              {"stop", "### User:"},
 		"stop \"### User:\"":           {"stop", "### User:"},
diff --git a/server/images.go b/server/images.go
index 4006584fa..5b3504852 100644
--- a/server/images.go
+++ b/server/images.go
@@ -355,6 +355,8 @@ func realpath(rel, from string) string {
 	return abspath
 }
 
+var deprecatedParameters = []string{"penalize_newline"}
+
 func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantization string, modelfile *parser.File, fn func(resp api.ProgressResponse)) (err error) {
 	config := ConfigV2{
 		OS:           "linux",
@@ -526,6 +528,11 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 
 			messages = append(messages, &api.Message{Role: role, Content: content})
 		default:
+			if slices.Contains(deprecatedParameters, c.Name) {
+				fn(api.ProgressResponse{Status: fmt.Sprintf("warning: parameter %s is deprecated", c.Name)})
+				break
+			}
+
 			ps, err := api.FormatParams(map[string][]string{c.Name: {c.Args}})
 			if err != nil {
 				return err