From ef378ad673a3f01382add316835957b1d4184177 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Fri, 14 Mar 2025 17:41:07 -0700 Subject: [PATCH] gemma3 quantization (#9776) --- llama/llama.cpp/src/llama-arch.cpp | 19 ++++ llama/llama.cpp/src/llama-arch.h | 1 + llama/llama.cpp/src/llama-model.cpp | 7 ++ llama/llama.cpp/src/llama-quant.cpp | 9 ++ llama/patches/0021-gemma3-quantization.patch | 113 +++++++++++++++++++ 5 files changed, 149 insertions(+) create mode 100644 llama/patches/0021-gemma3-quantization.patch diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp index b6f20286b..b443fcd3f 100644 --- a/llama/llama.cpp/src/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -37,6 +37,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_MINICPM3, "minicpm3" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA2, "gemma2" }, + { LLM_ARCH_GEMMA3, "gemma3" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_XVERSE, "xverse" }, @@ -804,6 +805,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, }, }, + { + LLM_ARCH_GEMMA3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, { LLM_ARCH_STARCODER2, { diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h index ec7422244..aad92a5d2 100644 --- a/llama/llama.cpp/src/llama-arch.h +++ b/llama/llama.cpp/src/llama-arch.h @@ -41,6 +41,7 @@ enum llm_arch { LLM_ARCH_MINICPM3, LLM_ARCH_GEMMA, LLM_ARCH_GEMMA2, + LLM_ARCH_GEMMA3, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, LLM_ARCH_XVERSE, diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index ab1a07d10..701830418 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GEMMA3: + { + } break; case LLM_ARCH_STARCODER2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -2537,6 +2540,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); } } break; + case LLM_ARCH_GEMMA3: + { + } break; case LLM_ARCH_STARCODER2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4029,6 +4035,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) { case LLM_ARCH_PHIMOE: case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: + case LLM_ARCH_GEMMA3: case LLM_ARCH_STARCODER2: case LLM_ARCH_OPENELM: case LLM_ARCH_GPTNEOX: diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp index 6eb1da08e..d2f3a5108 100644 --- a/llama/llama.cpp/src/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -737,6 +737,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // This used to be a regex, but has an extreme cost to compile times. bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + // don't quantize vision stuff + quantize &= name.find("v.blk.") == std::string::npos; + + quantize &= name.find("mm.mm_input_projection.weight") == std::string::npos; + quantize &= name.find("mm.mm_soft_emb_norm.weight") == std::string::npos; + quantize &= name.find("v.patch_embedding.weight") == std::string::npos; + quantize &= name.find("v.position_embedding.weight") == std::string::npos; + quantize &= name.find("v.post_layernorm.weight") == std::string::npos; + // quantize only 2D and 3D tensors (experts) quantize &= (ggml_n_dims(tensor) >= 2); diff --git a/llama/patches/0021-gemma3-quantization.patch b/llama/patches/0021-gemma3-quantization.patch new file mode 100644 index 000000000..4f6dbc11b --- /dev/null +++ b/llama/patches/0021-gemma3-quantization.patch @@ -0,0 +1,113 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Patrick Devine +Date: Fri, 14 Mar 2025 16:33:23 -0700 +Subject: [PATCH] gemma3 quantization + +--- + src/llama-arch.cpp | 19 +++++++++++++++++++ + src/llama-arch.h | 1 + + src/llama-model.cpp | 7 +++++++ + src/llama-quant.cpp | 9 +++++++++ + 4 files changed, 36 insertions(+) + +diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp +index b6f20286..b443fcd3 100644 +--- a/src/llama-arch.cpp ++++ b/src/llama-arch.cpp +@@ -37,6 +37,7 @@ static const std::map LLM_ARCH_NAMES = { + { LLM_ARCH_MINICPM3, "minicpm3" }, + { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_GEMMA2, "gemma2" }, ++ { LLM_ARCH_GEMMA3, "gemma3" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, +@@ -804,6 +805,24 @@ static const std::map> LLM_TENSOR_N + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, ++ { ++ LLM_ARCH_GEMMA3, ++ { ++ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, ++ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, ++ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, ++ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, ++ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, ++ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, ++ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, ++ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, ++ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, ++ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, ++ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, ++ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, ++ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, ++ }, ++ }, + { + LLM_ARCH_STARCODER2, + { +diff --git a/src/llama-arch.h b/src/llama-arch.h +index ec742224..aad92a5d 100644 +--- a/src/llama-arch.h ++++ b/src/llama-arch.h +@@ -41,6 +41,7 @@ enum llm_arch { + LLM_ARCH_MINICPM3, + LLM_ARCH_GEMMA, + LLM_ARCH_GEMMA2, ++ LLM_ARCH_GEMMA3, + LLM_ARCH_STARCODER2, + LLM_ARCH_MAMBA, + LLM_ARCH_XVERSE, +diff --git a/src/llama-model.cpp b/src/llama-model.cpp +index ab1a07d1..70183041 100644 +--- a/src/llama-model.cpp ++++ b/src/llama-model.cpp +@@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { + default: type = LLM_TYPE_UNKNOWN; + } + } break; ++ case LLM_ARCH_GEMMA3: ++ { ++ } break; + case LLM_ARCH_STARCODER2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); +@@ -2537,6 +2540,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + } + } break; ++ case LLM_ARCH_GEMMA3: ++ { ++ } break; + case LLM_ARCH_STARCODER2: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); +@@ -4029,6 +4035,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) { + case LLM_ARCH_PHIMOE: + case LLM_ARCH_GEMMA: + case LLM_ARCH_GEMMA2: ++ case LLM_ARCH_GEMMA3: + case LLM_ARCH_STARCODER2: + case LLM_ARCH_OPENELM: + case LLM_ARCH_GPTNEOX: +diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp +index 6eb1da08..d2f3a510 100644 +--- a/src/llama-quant.cpp ++++ b/src/llama-quant.cpp +@@ -737,6 +737,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + ++ // don't quantize vision stuff ++ quantize &= name.find("v.blk.") == std::string::npos; ++ ++ quantize &= name.find("mm.mm_input_projection.weight") == std::string::npos; ++ quantize &= name.find("mm.mm_soft_emb_norm.weight") == std::string::npos; ++ quantize &= name.find("v.patch_embedding.weight") == std::string::npos; ++ quantize &= name.find("v.position_embedding.weight") == std::string::npos; ++ quantize &= name.find("v.post_layernorm.weight") == std::string::npos; ++ + // quantize only 2D and 3D tensors (experts) + quantize &= (ggml_n_dims(tensor) >= 2); +