diff --git a/Makefile.sync b/Makefile.sync index bceae7f59..711667c98 100644 --- a/Makefile.sync +++ b/Makefile.sync @@ -15,11 +15,13 @@ help: @echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync" .PHONY: sync -sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml +sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal -.PHONY: llama/build-info.cpp -llama/build-info.cpp: llama/build-info.cpp.in - sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@ +llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp + sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@ + +ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml + go generate ./$(@D) .PHONY: llama/llama.cpp llama/llama.cpp: llama/vendor/ @@ -30,12 +32,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/ rsync -arvzc -f "merge $@/.rsync-filter" $< $@ PATCHES=$(wildcard llama/patches/*.patch) +PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES))))) .PHONY: apply-patches .NOTPARALLEL: -apply-patches: $(addsuffix ed, $(PATCHES)) +apply-patches: $(PATCHED) -%.patched: %.patch +llama/patches/.%.patched: llama/patches/%.patch @if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi .PHONY: checkout @@ -57,4 +60,4 @@ format-patches: llama/patches .PHONE: clean clean: checkout - $(RM) $(addsuffix ed, $(PATCHES)) + $(RM) llama/patches/.*.patched diff --git a/convert/convert.go b/convert/convert.go index 249ec8077..48804d7f3 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -1,6 +1,7 @@ package convert import ( + "cmp" "encoding/json" "errors" "fmt" @@ -14,13 +15,12 @@ import ( ) type ModelParameters struct { - Architectures []string `json:"architectures"` - VocabSize uint32 `json:"vocab_size"` - TextModel TextParameters `json:"text_config"` -} + Architectures []string `json:"architectures"` + VocabSize uint32 `json:"vocab_size"` -type TextParameters struct { - VocabSize uint32 `json:"vocab_size"` + TextModel struct { + VocabSize uint32 `json:"vocab_size"` + } `json:"text_config"` } type AdapterParameters struct { @@ -173,6 +173,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error { switch p.Architectures[0] { case "LlamaForCausalLM": conv = &llamaModel{} + case "MllamaForConditionalGeneration": + conv = &mllamaModel{} case "Llama4ForConditionalGeneration": conv = &llama4Model{} case "Mistral3ForConditionalGeneration": @@ -212,24 +214,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error { return err } - vocabSize := int(p.VocabSize) - if vocabSize == 0 { - tVocabSize := int(p.TextModel.VocabSize) - vocabSize = tVocabSize - } + vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize)) switch { case vocabSize == 0: - slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens)) + slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens)) case vocabSize > len(t.Vocabulary.Tokens): - slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens)) + slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens)) for i := range vocabSize - len(t.Vocabulary.Tokens) { t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i)) t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1) t.Vocabulary.Types = 
append(t.Vocabulary.Types, tokenTypeUserDefined) } case vocabSize < len(t.Vocabulary.Tokens): - return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize) + slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens)) + p.VocabSize = uint32(len(t.Vocabulary.Tokens)) + p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens)) default: slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } diff --git a/convert/convert_mllama.go b/convert/convert_mllama.go new file mode 100644 index 000000000..12478be71 --- /dev/null +++ b/convert/convert_mllama.go @@ -0,0 +1,160 @@ +package convert + +import ( + "strings" + + "github.com/ollama/ollama/fs/ggml" + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" +) + +type mllamaModel struct { + ModelParameters + TextModel struct { + llamaModel + + CrossAttentionLayers []int32 `json:"cross_attention_layers"` + } `json:"text_config"` + VisionModel struct { + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NumGlobalLayers uint32 `json:"num_global_layers"` + IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"` + + HiddenSize uint32 `json:"hidden_size"` + IntermediateSize uint32 `json:"intermediate_size"` + + AttentionHeads uint32 `json:"attention_heads"` + + ImageSize uint32 `json:"image_size"` + PatchSize uint32 `json:"patch_size"` + NumChannels uint32 `json:"num_channels"` + MaxNumTiles uint32 `json:"max_num_tiles"` + NormEpsilon float32 `json:"norm_eps"` + RopeTheta float32 `json:"rope.freq_base"` + } `json:"vision_config"` +} + +func (m *mllamaModel) KV(t *Tokenizer) ggml.KV { + kv := m.ModelParameters.KV(t) + kv["general.architecture"] = "mllama" + + for k, v := range m.TextModel.KV(t) { + if strings.HasPrefix(k, "llama.") { + kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v + } + } + + kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers + + kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers + kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers + kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices + + kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize + kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize + + kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads + kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon + + kv["mllama.vision.image_size"] = m.VisionModel.ImageSize + kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize + kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles + kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels + + return kv +} + +func (m *mllamaModel) Replacements() []string { + return append( + m.TextModel.Replacements(), + "language_model.", "", + "gate_attn", "attn_gate", + "gate_ffn", "ffn_gate", + "cross_attn.", "cross_attn_", + "vision_model", "v", + "class_embedding", "class_embd", + "patch_embedding", "patch_embd", + "gated_positional_embedding.tile_embedding", "tile_position_embd", + "gated_positional_embedding.embedding", "position_embd.weight", + "gated_positional_embedding", "position_embd", + "embedding.weight", "weight", + "pre_tile_positional_embedding", "pre_tile_position_embd", + "post_tile_positional_embedding", "post_tile_position_embd", + "layernorm_pre", "pre_ln", + "layernorm_post", "post_ln", + "global_transformer.layers", "global.blk", + "transformer.layers", "blk", + "mlp.fc1", 
"ffn_up", + "mlp.fc2", "ffn_down", + "multi_modal_projector", "mm.0", + ) +} + +func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor + var text []Tensor + for _, t := range ts { + if t.Name() == "v.position_embd.gate" { + for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} { + tt := t.Clone() + tt.SetRepacker(m.repack(name)) + out = append(out, &ggml.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: tt, + }) + } + } else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" { + t.SetRepacker(m.repack(t.Name())) + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") { + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } else { + text = append(text, t) + } + } + + return append(out, m.TextModel.Tensors(text)...) +} + +func (m *mllamaModel) repack(name string) Repacker { + return func(_ string, data []float32, shape []uint64) (_ []float32, err error) { + dims := make([]int, len(shape)) + for i, dim := range shape { + dims[i] = int(dim) + } + + var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + t, err = tensor.Tanh(t) + if err != nil { + return nil, err + } + + if name == "v.position_embd.gate" { + t, err = tensor.Sub(float32(1), t) + if err != nil { + return nil, err + } + } + + t = tensor.Materialize(t) + // flatten tensor so it can be return as a vector + if err := t.Reshape(t.Shape().TotalSize()); err != nil { + return nil, err + } + + return native.VectorF32(t.(*tensor.Dense)) + } +} diff --git a/convert/reader.go b/convert/reader.go index ab81d5c0b..07d12f0dd 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -38,7 +38,10 @@ const ( func (t tensorBase) Kind() uint32 { if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || t.name == "token_types.weight" || - t.name == "v.positional_embedding_vlm" { + t.name == "v.positional_embedding_vlm" || + t.name == "v.tile_position_embd.weight" || + t.name == "v.pre_tile_position_embd.weight" || + t.name == "v.post_tile_position_embd.weight" { // these tensors are always F32 return 0 } diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 735d41fa5..c29d715bd 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -125,6 +125,7 @@ func (kv KV) OllamaEngineRequired() bool { "gemma3", "mistral3", "llama4", + "mllama", }, kv.Architecture()) } diff --git a/llama/llama.cpp/include/llama.h b/llama/llama.cpp/include/llama.h index 41beef219..abedebdb7 100644 --- a/llama/llama.cpp/include/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -258,7 +258,6 @@ extern "C" { llama_token * token; float * embd; - int32_t n_embd; llama_pos * pos; int32_t * n_seq_id; llama_seq_id ** seq_id; @@ -366,7 +365,6 @@ extern "C" { bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool no_perf; // whether to measure performance timings bool op_offload; // whether to offload host tensor operations to device - bool cross_attn; // whether to use cross attention }; // model quantization parameters @@ -466,10 +464,6 @@ extern "C" { struct llama_context_params params), "use llama_init_from_model instead"); - // TODO (jmorganca): this should most likely be passed in as part of a batch - // and not set on the context for all batches. 
- LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state); - // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp index eb7b5325e..5ab3f5722 100644 --- a/llama/llama.cpp/src/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -6,7 +6,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_MLLAMA, "mllama" }, { LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_DECI, "deci" }, { LLM_ARCH_FALCON, "falcon" }, @@ -145,7 +144,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, - { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, @@ -275,40 +273,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, - { - LLM_ARCH_MLLAMA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" }, - { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" }, - { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" }, - { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" }, - { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" }, - { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" }, - { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" }, - { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" }, - }, - }, { LLM_ARCH_DECI, { @@ -1737,14 +1701,6 @@ static const std::map LLM_TENSOR_INFOS = { // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, 
{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h index bc8a4f0bb..525c1b7d4 100644 --- a/llama/llama.cpp/src/llama-arch.h +++ b/llama/llama.cpp/src/llama-arch.h @@ -11,7 +11,6 @@ enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_LLAMA4, - LLM_ARCH_MLLAMA, LLM_ARCH_DECI, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, @@ -149,7 +148,6 @@ enum llm_kv { LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, - LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, @@ -351,14 +349,6 @@ enum llm_tensor { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, LLM_TENSOR_BSKCN_TV, - LLM_TENSOR_CROSS_ATTN_K_NORM, - LLM_TENSOR_CROSS_ATTN_K_PROJ, - LLM_TENSOR_CROSS_ATTN_O_PROJ, - LLM_TENSOR_CROSS_ATTN_Q_NORM, - LLM_TENSOR_CROSS_ATTN_Q_PROJ, - LLM_TENSOR_CROSS_ATTN_V_PROJ, - LLM_TENSOR_CROSS_ATTN_ATTN_GATE, - LLM_TENSOR_CROSS_ATTN_MLP_GATE, LLM_TENSOR_CONV1D, LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_NORM, diff --git a/llama/llama.cpp/src/llama-batch.cpp b/llama/llama.cpp/src/llama-batch.cpp index 241b316e8..a88b2fe30 100644 --- a/llama/llama.cpp/src/llama-batch.cpp +++ b/llama/llama.cpp/src/llama-batch.cpp @@ -320,7 +320,6 @@ struct llama_batch llama_batch_get_one( /*n_tokens =*/ n_tokens, /*tokens =*/ tokens, /*embd =*/ nullptr, - /*n_embd =*/ 0, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, @@ -333,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ /*n_tokens =*/ 0, /*tokens =*/ nullptr, /*embd =*/ nullptr, - /*n_embd =*/ 0, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, @@ -342,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); - batch.n_embd = embd; } else { batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp index c5948e8fb..1f3a39564 100644 --- a/llama/llama.cpp/src/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); } - return logits + j*model.hparams.n_vocab; + return logits + j*model.vocab.n_tokens(); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -632,10 +632,6 @@ void llama_context::set_warmup(bool value) { cparams.warmup = value; } -void llama_context::set_cross_attn(bool value) { - cparams.cross_attn = value; -} - void llama_context::set_adapter_lora( llama_adapter_lora * adapter, float scale) { @@ -713,7 +709,7 @@ int llama_context::encode(llama_batch & inp_batch) { const int64_t n_embd = hparams.n_embd; - llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); + llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); @@ -867,9 +863,10 @@ int 
llama_context::decode(llama_batch & inp_batch) { const llama_batch & batch = batch_allocr.batch; + const auto & vocab = model.vocab; const auto & hparams = model.hparams; - const int32_t n_vocab = hparams.n_vocab; + const int32_t n_vocab = vocab.n_tokens(); const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; @@ -1093,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) { // make the outputs have the same order they had in the user-provided batch // note: this is mostly relevant for recurrent models atm if (!sorted_output) { - const uint32_t n_vocab = model.hparams.n_vocab; + const uint32_t n_vocab = model.vocab.n_tokens(); const uint32_t n_embd = model.hparams.n_embd; GGML_ASSERT((size_t) n_outputs == out_ids.size()); @@ -1148,11 +1145,12 @@ int llama_context::decode(llama_batch & inp_batch) { int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; + const auto & vocab = model.vocab; const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); const auto n_batch = cparams.n_batch; - const auto n_vocab = hparams.n_vocab; + const auto n_vocab = vocab.n_tokens(); const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead @@ -1687,7 +1685,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab); + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); io.write(&logits_size, sizeof(logits_size)); @@ -2099,7 +2097,6 @@ llama_context_params llama_context_default_params() { /*.flash_attn =*/ false, /*.no_perf =*/ true, /*.op_offload =*/ true, - /*.cross_attn =*/ false, }; return result; @@ -2225,10 +2222,6 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { ctx->set_warmup(warmup); } -void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { - ctx->set_cross_attn(cross_attention); -} - void llama_synchronize(llama_context * ctx) { ctx->synchronize(); } diff --git a/llama/llama.cpp/src/llama-context.h b/llama/llama.cpp/src/llama-context.h index 9970dfc6d..0264e9371 100644 --- a/llama/llama.cpp/src/llama-context.h +++ b/llama/llama.cpp/src/llama-context.h @@ -72,7 +72,6 @@ struct llama_context { void set_embeddings (bool value); void set_causal_attn(bool value); void set_warmup(bool value); - void set_cross_attn(bool value); void set_adapter_lora( llama_adapter_lora * adapter, diff --git a/llama/llama.cpp/src/llama-cparams.h b/llama/llama.cpp/src/llama-cparams.h index 7a6156ced..246fa5777 100644 --- a/llama/llama.cpp/src/llama-cparams.h +++ b/llama/llama.cpp/src/llama-cparams.h @@ -31,7 +31,6 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; - bool cross_attn; enum llama_pooling_type pooling_type; diff --git a/llama/llama.cpp/src/llama-graph.cpp b/llama/llama.cpp/src/llama-graph.cpp index f14869cf5..b0e3f6359 100644 --- a/llama/llama.cpp/src/llama-graph.cpp +++ b/llama/llama.cpp/src/llama-graph.cpp @@ -532,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { } } -void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) { - if (ubatch->embd) { - ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state)); - } -} - // // llm_graph_context // @@ -1520,25 +1514,6 @@ llm_graph_input_attn_cross * 
llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } -ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const { - const int64_t n_embd = hparams.n_embd; - - auto inp = std::make_unique(); - - ggml_tensor * cur = nullptr; - - inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4); - ggml_set_input(inp->cross_attn_state); - - cur = inp->cross_attn_state; - - cb(cur, "inp_cross_attn_state", -1); - - res->add_input(std::move(inp)); - - return cur; -} - ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_cross * inp, ggml_cgraph * gf, diff --git a/llama/llama.cpp/src/llama-graph.h b/llama/llama.cpp/src/llama-graph.h index 5a3227850..832a8c09f 100644 --- a/llama/llama.cpp/src/llama-graph.h +++ b/llama/llama.cpp/src/llama-graph.h @@ -87,7 +87,6 @@ public: ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] - ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] }; class llm_graph_input_pos : public llm_graph_input_i { @@ -285,16 +284,6 @@ public: const llama_cross * cross = nullptr; }; -class llm_graph_input_cross_attn_state : public llm_graph_input_i { -public: - llm_graph_input_cross_attn_state() = default; - virtual ~llm_graph_input_cross_attn_state() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] -}; - // // llm_graph_result // @@ -506,7 +495,6 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_mask() const; - ggml_tensor * build_inp_cross_attn_state() const; ggml_tensor * build_inp_cross_embd() const; ggml_tensor * build_inp_pos_bucket_enc() const; diff --git a/llama/llama.cpp/src/llama-hparams.cpp b/llama/llama.cpp/src/llama-hparams.cpp index 6a02de036..8a6679601 100644 --- a/llama/llama.cpp/src/llama-hparams.cpp +++ b/llama/llama.cpp/src/llama-hparams.cpp @@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } - -bool llama_hparams::cross_attention_layers(uint32_t il) const { - return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); -} diff --git a/llama/llama.cpp/src/llama-hparams.h b/llama/llama.cpp/src/llama-hparams.h index b6fc7e6df..48dce4071 100644 --- a/llama/llama.cpp/src/llama-hparams.h +++ b/llama/llama.cpp/src/llama-hparams.h @@ -2,8 +2,6 @@ #include "llama.h" -#include - #include // bump if necessary @@ -44,7 +42,6 @@ struct llama_hparams { uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; - uint32_t n_vocab = 0; // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA uint32_t n_embd_head_k_mla = 0; @@ -59,7 +56,6 @@ struct llama_hparams { std::array n_ff_arr; std::array, 4> n_bskcn_arr = {}; - std::array cross_attn_layers; uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; @@ -163,9 +159,6 @@ struct llama_hparams { // Block skip connection bool n_bskcn(uint32_t n, uint32_t il) const; - // cross attention layers - bool cross_attention_layers(uint32_t il) const; - bool is_swa(uint32_t il) const; }; diff --git a/llama/llama.cpp/src/llama-kv-cache.cpp b/llama/llama.cpp/src/llama-kv-cache.cpp index 1a50c0347..60e67b036 100644 --- a/llama/llama.cpp/src/llama-kv-cache.cpp +++ b/llama/llama.cpp/src/llama-kv-cache.cpp @@ -100,16 +100,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( throw 
std::runtime_error("failed to create ggml context for kv cache"); } - ggml_tensor * k, *v; - - // for cross attention layers - if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { - k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i)); - v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i)); - } else { - k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); - } + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); k_l.push_back(k); @@ -459,7 +451,7 @@ void llama_kv_cache_unified::set_full() { llama_sbatch llama_kv_cache_unified::sbatch_init( const llama_batch & batch, bool logits_all) { - return llama_sbatch(batch, batch.n_embd, true, logits_all); + return llama_sbatch(batch, hparams.n_embd, true, logits_all); } llama_ubatch llama_kv_cache_unified::ubatch_next( diff --git a/llama/llama.cpp/src/llama-model-loader.cpp b/llama/llama.cpp/src/llama-model-loader.cpp index 2acfd4a8f..7f6617fac 100644 --- a/llama/llama.cpp/src/llama-model-loader.cpp +++ b/llama/llama.cpp/src/llama-model-loader.cpp @@ -315,8 +315,6 @@ namespace GGUFMeta { return true; } - template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array& result, bool required); - template bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { const int kid = gguf_find_key(meta.get(), key.c_str()); diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index e8298f562..db62973fa 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -433,7 +433,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get general kv ml.get_key(LLM_KV_GENERAL_NAME, name, false); - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -445,7 +444,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false); if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); @@ -469,11 +467,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); - ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -526,7 +522,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch 
== LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -589,16 +585,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.use_kq_norm = false; } } break; - case LLM_ARCH_MLLAMA: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - switch (hparams.n_layer) { - case 40: type = LLM_TYPE_11B; break; - case 100: type = LLM_TYPE_90B; break; - default: type = LLM_TYPE_UNKNOWN; - } - } break; case LLM_ARCH_DECI: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -1595,7 +1581,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; - const int64_t n_vocab = hparams.n_vocab; + const int64_t n_vocab = vocab.n_tokens(); const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; @@ -1854,52 +1840,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; - case LLM_ARCH_MLLAMA: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); - - // output - { - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); - - // if output is NULL, init from the input tok embed - if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); - } - } - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - if (hparams.cross_attention_layers(i)) { - layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); - layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0); - layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0); - layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0); - layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0); - layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0); - layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0); - layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0); - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - } else { - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, 
"weight", i), {n_embd, n_embd_v_gqa}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - } - } - } break; case LLM_ARCH_DECI: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4816,246 +4756,6 @@ struct llm_build_llama : public llm_graph_context { } }; -struct llm_build_mllama: public llm_graph_context { - llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - ggml_tensor * cur; - ggml_tensor * inpL; - ggml_tensor * inpCAS; - - inpL = build_inp_embd(model.tok_embd); - inpCAS = build_inp_cross_attn_state(); - - // inp_pos - contains the positions - ggml_tensor * inp_pos = build_inp_pos(); - - auto * inp_attn = build_attn_inp_kv_unified(); - const llama_kv_cache_unified * kv_self = static_cast(memory); - - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - if (hparams.cross_attention_layers(il)) { - if (!ubatch.embd && !cparams.cross_attn) { - continue; - } - - // cross attention layer - ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); - cb(Qcur, "Qcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); - cb(Qcur, "Qcur", il); - - Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il); - cb(Qcur, "Qcur", il); - - ggml_tensor * Kcur, * Vcur; - if (ubatch.embd) { - Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); - cb(Kcur, "Kcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404); - cb(Kcur, "Kcur", il); - - Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - cb(Kcur, "Kcur", il); - - Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il); - cb(Kcur, "Kcur", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il])); - - Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); - cb(Vcur, "Vcur", il); - - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404); - cb(Vcur, "Vcur", il); - - Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); - cb(Vcur, "Vcur", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il])); - } else { - Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]); - cb(Kcur, "Kcur (view)", il); - - Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]); - cb(Vcur, "Vcur (view)", il); - } - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur); - 
cb(kq, "kq", il); - - // TODO: apply causal masks - struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - cb(kq_soft_max, "kq_soft_max", il); - - Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur)); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur); - cb(cur, "cur", il); - - // TODO: do this in place once? - cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate)); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - // TODO: do this inplace once? - cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); - cb(cur, "ffn_out", il); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } else { - // self attention layer - - // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); - - // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - cur = build_attn(inp_attn, gf, - model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); 
- cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - res->t_embd = cur; - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - res->t_logits = cur; - - ggml_build_forward_expand(gf, cur); - } -}; - struct llm_build_deci : public llm_graph_context { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -13428,10 +13128,6 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; - case LLM_ARCH_MLLAMA: - { - llm = std::make_unique(*this, params, gf); - } break; case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); @@ -13793,7 +13489,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA4: - case LLM_ARCH_MLLAMA: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: diff --git a/llama/llama.cpp/src/llama-model.h b/llama/llama.cpp/src/llama-model.h index 9281e6296..43746c7dd 100644 --- a/llama/llama.cpp/src/llama-model.h +++ b/llama/llama.cpp/src/llama-model.h @@ -11,7 +11,6 @@ #include #include #include -#include struct llama_cparams; struct llama_ubatch; @@ -75,7 +74,6 @@ enum llm_type { LLM_TYPE_40B, LLM_TYPE_65B, LLM_TYPE_70B, - LLM_TYPE_90B, LLM_TYPE_236B, LLM_TYPE_290B, LLM_TYPE_314B, @@ -320,16 +318,6 @@ struct llama_layer { struct ggml_tensor * bskcn_tv = nullptr; - // cross attention - struct ggml_tensor * cross_attn_k_norm = nullptr; - struct ggml_tensor * cross_attn_k_proj = nullptr; - struct ggml_tensor * cross_attn_o_proj = nullptr; - struct ggml_tensor * cross_attn_q_norm = nullptr; - struct ggml_tensor * cross_attn_q_proj = nullptr; - struct ggml_tensor * cross_attn_v_proj = nullptr; - struct ggml_tensor * cross_attn_attn_gate = nullptr; - struct ggml_tensor * cross_attn_mlp_gate = nullptr; - struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp index 56531980c..820d5128e 100644 --- a/llama/llama.cpp/src/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -639,9 +639,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (llama_model_has_encoder(&model)) { n_attn_layer *= 3; } - if (qs.n_attention_wv != n_attn_layer) { - LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv); - } + GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); } size_t total_size_org = 0; diff --git a/llama/llama.cpp/tools/mtmd/llava.cpp b/llama/llama.cpp/tools/mtmd/llava.cpp index b0eb79bb3..ebef8b3c1 100644 --- 
a/llama/llama.cpp/tools/mtmd/llava.cpp +++ b/llama/llama.cpp/tools/mtmd/llava.cpp @@ -462,7 +462,7 @@ struct llava_embd_batch { std::vector seq_ids; std::vector logits; llama_batch batch; - llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { pos .resize(n_tokens); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); @@ -474,7 +474,6 @@ struct llava_embd_batch { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, /*embd =*/ embd, - /*n_embd =*/ n_embd, /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), @@ -498,7 +497,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ n_eval = n_batch; } float * embd = image_embed->embed+i*n_embd; - llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); + llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); if (llama_decode(ctx_llama, llava_batch.batch)) { LOG_ERR("%s : failed to eval\n", __func__); return false; diff --git a/llama/llama.go b/llama/llama.go index f0f2af828..1251be3a5 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -17,7 +17,6 @@ package llama #include "llava.h" #include "gguf.h" -#include "mllama.h" #include "sampling_ext.h" extern bool llamaProgressCallback(float progress, void *user_data); @@ -510,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, return embed, nil } -type MllamaContext struct { - c *C.struct_mllama_ctx -} - -func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) { - mp := C.CString(modelPath) - defer C.free(unsafe.Pointer(mp)) - c := C.mllama_model_load(mp, 1) - if c == nil { - return nil, fmt.Errorf("unable to load mllama model: %v", modelPath) - } - - projEmbedSize := int(C.mllama_n_embd(c)) - modelEmbedSize := llamaContext.Model().NEmbd() - if projEmbedSize != modelEmbedSize { - return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize) - } - - return &MllamaContext{c: c}, nil -} - -func (m *MllamaContext) Free() { - C.mllama_free(m.c) -} - -func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) { - img := C.mllama_image_init() - defer C.mllama_image_free(img) - - ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)) - if !ok { - return nil, errors.New("unable to load mllama image data") - } - - rows := make([]float32, m.EmbedSize(llamaContext)) - ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))) - if !ok { - return nil, errors.New("unable to make mllama embedding from image") - } - - embed := make([][]float32, 1) - embed[0] = rows - - return embed, nil -} - -func (m *MllamaContext) EmbedSize(llamaContext *Context) int { - numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c)) - numEmbed := llamaContext.Model().NEmbd() - - return numTokens * numEmbed -} - -func (c *Context) SetCrossAttention(state bool) { - C.llama_set_cross_attention(c.c, C.bool(state)) -} - func (c *Context) Synchronize() { C.llama_synchronize(c.c) } diff --git a/llama/mllama.cpp b/llama/mllama.cpp deleted file mode 100644 index 1ba8f5bef..000000000 --- a/llama/mllama.cpp +++ /dev/null @@ -1,887 +0,0 @@ -// NOTE: This is modified from clip.cpp for Mllama only 
-#include "mllama.h" - -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "ggml-cpu.h" -#include "ggml.h" -#include "gguf.h" - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define REQUIRE(x) \ - do { \ - if (!(x)) { \ - throw std::runtime_error("REQUIRE failed: " #x); \ - } \ - } while (0) - -#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__) - -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX - #define NOMINMAX -#endif -#include -#if __GLIBCXX__ -#include -#include -#include -#endif -#endif - -struct mllama_image { - int width; - int height; - - int num_channels = 3; - int num_tiles = 4; - - int aspect_ratio_id; - - std::vector data; -}; - -static std::string format(const char *fmt, ...) { - va_list args; - va_start(args, fmt); - std::vector b(128); - int n = vsnprintf(b.data(), b.size(), fmt, args); - REQUIRE(n >= 0 && n < b.size()); - va_end(args); - return std::string(b.data(), b.size()); -} - -// -// utilities to get data from a gguf file -// - -static int get_key_index(const gguf_context *ctx, const char *key) { - int key_index = gguf_find_key(ctx, key); - REQUIRE(key_index != -1); - return key_index; -} - -static std::vector get_u32_array(const gguf_context *ctx, const std::string &key) { - const int i = get_key_index(ctx, key.c_str()); - const int n = gguf_get_arr_n(ctx, i); - const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i); - - std::vector s(n); - for (size_t j = 0; j < s.size(); j++) { - s[j] = data[j]; - } - - return s; -} - -static uint32_t get_u32(const gguf_context *ctx, const std::string &key) { - return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str())); -} - -static float get_f32(const gguf_context *ctx, const std::string &key) { - return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str())); -} - -static std::string get_ftype(int ftype) { - return ggml_type_name(static_cast(ftype)); -} - -// -// mllama layers -// - -struct mllama_hparams { - uint32_t image_size; - uint32_t patch_size; - uint32_t hidden_size; - uint32_t n_intermediate; - uint32_t projection_dim; - uint32_t n_head; - uint32_t n_layer; - uint32_t n_global_layer; - uint32_t n_tiles; - - float eps; - - std::vector intermediate_layers; -}; - -struct mllama_layer { - // attention - struct ggml_tensor *k_w; - struct ggml_tensor *k_b; - struct ggml_tensor *q_w; - struct ggml_tensor *q_b; - struct ggml_tensor *v_w; - struct ggml_tensor *v_b; - - struct ggml_tensor *o_w; - struct ggml_tensor *o_b; - - struct ggml_tensor *attn_gate; - - // layernorm 1 - struct ggml_tensor *ln_1_w; - struct ggml_tensor *ln_1_b; - - // ff - struct ggml_tensor *ff_i_w; - struct ggml_tensor *ff_i_b; - - struct ggml_tensor *ff_o_w; - struct ggml_tensor *ff_o_b; - - struct ggml_tensor *ff_gate; - - // layernorm 2 - struct ggml_tensor *ln_2_w; - struct ggml_tensor *ln_2_b; -}; - -struct mllama_vision_model { - struct mllama_hparams hparams; - - // embeddings - struct ggml_tensor *class_embedding; - struct ggml_tensor *patch_embeddings; - struct ggml_tensor *position_embeddings; - struct ggml_tensor *position_embeddings_gate; - struct ggml_tensor *tile_position_embeddings; - struct ggml_tensor *tile_position_embeddings_gate; - struct ggml_tensor *pre_tile_position_embeddings; - struct ggml_tensor 
*pre_tile_position_embeddings_gate; - struct ggml_tensor *post_tile_position_embeddings; - struct ggml_tensor *post_tile_position_embeddings_gate; - - struct ggml_tensor *pre_ln_w; - struct ggml_tensor *pre_ln_b; - - std::vector layers; - std::vector global_layers; - - struct ggml_tensor *post_ln_w; - struct ggml_tensor *post_ln_b; - - struct ggml_tensor *mm_0_w; - struct ggml_tensor *mm_0_b; -}; - -struct mllama_ctx { - struct mllama_vision_model vision_model; - - uint32_t ftype = 1; - - struct gguf_context *ctx_gguf; - struct ggml_context *ctx_data; - - std::vector buf_compute_meta; - - // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = nullptr; - - ggml_backend_t backend = nullptr; - ggml_gallocr_t compute_alloc = nullptr; -}; - -static ggml_tensor *mllama_image_build_encoder_layer( - struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings, - const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) { - struct ggml_tensor *cur = embeddings; - - { - // layernorm1 - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b); - ggml_set_name(cur, format("%d pre layernorm", il).c_str()); - } - - { - // self-attention - struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur); - if (layer.q_b != nullptr) { - Q = ggml_add(ctx0, Q, layer.q_b); - } - - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - ggml_set_name(Q, format("%d query", il).c_str()); - - struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur); - if (layer.k_b != nullptr) { - K = ggml_add(ctx0, K, layer.k_b); - } - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - ggml_set_name(K, format("%d key", il).c_str()); - - struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur); - if (layer.v_b != nullptr) { - V = ggml_add(ctx0, V, layer.v_b); - } - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - ggml_set_name(V, format("%d value", il).c_str()); - - struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head)); - KQ = ggml_soft_max_inplace(ctx0, KQ); - ggml_set_name(KQ, format("%d KQ", il).c_str()); - - struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size); - ggml_set_name(KQV, format("%d KQV", il).c_str()); - - cur = ggml_mul_mat(ctx0, layer.o_w, KQV); - if (layer.o_b != nullptr) { - cur = ggml_add(ctx0, cur, layer.o_b); - } - ggml_set_name(cur, format("%d self attention", il).c_str()); - - if (layer.attn_gate != nullptr) { - cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate); - ggml_set_name(cur, format("%d self attention gate", il).c_str()); - } - } - - cur = ggml_add(ctx0, cur, embeddings); - ggml_set_name(cur, format("%d residual", il).c_str()); - - embeddings = cur; - - { - // layernorm2 - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b); - ggml_set_name(cur, format("%d post layernorm", il).c_str()); - } - - { - // feed forward - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b); - cur = 
ggml_gelu_inplace(ctx0, cur); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b); - ggml_set_name(cur, format("%d feed forward", il).c_str()); - - if (layer.ff_gate != nullptr) { - cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate); - ggml_set_name(cur, format("%d feed forward gate", il).c_str()); - } - } - - // residual 2 - cur = ggml_add(ctx0, cur, embeddings); - ggml_set_name(cur, format("%d residual", il).c_str()); - - embeddings = cur; - - return embeddings; -} - -static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) { - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - const int image_size_width = image_size; - const int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - - const int batch_size = imgs->size; - REQUIRE(batch_size == 1); - - int num_tiles = 4; - int num_channels = 3; - if (imgs->data != nullptr) { - num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles; - num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels; - } - - struct ggml_init_params params = { - ctx->buf_compute_meta.size(), // mem_size - ctx->buf_compute_meta.data(), // mem_buffer - true, // no_alloc - }; - - struct ggml_context *ctx0 = ggml_init(params); - struct ggml_cgraph *gf = ggml_new_graph(ctx0); - - struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size); - ggml_set_name(aspect_ratios, "aspect_ratios"); - ggml_set_input(aspect_ratios); - - if (model.pre_tile_position_embeddings != nullptr) { - struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios); - ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings"); - - pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles); - if (model.pre_tile_position_embeddings_gate != nullptr) { - pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate); - } - - inp = ggml_add(ctx0, inp, pre_tile_position_embeddings); - } - - struct ggml_tensor *embeddings = inp; - - if (model.class_embedding != nullptr) { - // concat class_embeddings and patch_embeddings - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - for (int i = 0; i < num_tiles; ++i) { - // repeat class embeddings for each tile - embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]); - } - - 
embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } - - struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); - if (model.position_embeddings_gate != nullptr) { - position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, position_embd); - - if (model.tile_position_embeddings != nullptr) { - struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios); - ggml_set_name(tile_position_embeddings, "tile_position_embeddings"); - - tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles); - if (model.tile_position_embeddings_gate != nullptr) { - tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings); - } - - // pre-layernorm - if (model.pre_ln_w != nullptr) { - embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w); - if (model.pre_ln_b != nullptr) { - embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b); - } - - ggml_set_name(embeddings, "pre layernorm"); - } - - const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8; - - embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0); - embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0); - - std::vector intermediate_embeddings; - - // encoder - for (size_t il = 0; il < model.layers.size(); il++) { - if (hparams.intermediate_layers[il]) { - intermediate_embeddings.push_back(embeddings); - } - - embeddings = mllama_image_build_encoder_layer( - ctx0, il, model.layers[il], embeddings, - hparams.eps, hidden_size, batch_size, n_head, d_head); - } - - // post-layernorm - if (model.post_ln_w != nullptr) { - embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w); - if (model.post_ln_b != nullptr) { - embeddings = ggml_add(ctx0, embeddings, model.post_ln_b); - } - - ggml_set_name(embeddings, "post layernorm"); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles); - - if (model.post_tile_position_embeddings != nullptr) { - struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios); - ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings"); - - post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles); - if (model.post_tile_position_embeddings_gate != nullptr) { - post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1); - - // global encoder - for (size_t il = 0; il < model.global_layers.size(); il++) { - embeddings = mllama_image_build_encoder_layer( - ctx0, il, model.global_layers[il], 
embeddings, - hparams.eps, hidden_size, batch_size, n_head, d_head); - } - - struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles); - for (size_t i = 0; i < intermediate_embeddings.size(); ++i) { - stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0); - } - - stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size); - stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0); - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles); - embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0); - embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0); - - // mllama projector - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b); - ggml_set_name(embeddings, "multi modal projector"); - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) { - struct ggml_tensor *cur = ggml_get_tensor(ctx, name); - REQUIRE(cur != nullptr || optional); - return cur; -} - -static std::vector mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) { - std::vector layers(n); - for (size_t i = 0; i < layers.size(); i++) { - auto &layer = layers[i]; - layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false); - layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false); - layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false); - layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false); - - layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false); - layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true); - layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false); - layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true); - layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false); - layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true); - layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false); - layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true); - - layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false); - layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false); - layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false); - layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false); - - layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true); - layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true); - } - - return layers; -} - -// read 
and create ggml_context containing the tensors and their data -struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) { - struct ggml_context *meta = nullptr; - - struct gguf_init_params params = { - true, // no_alloc - &meta, // ctx - }; - - struct gguf_context *ctx = gguf_init_from_file(fname, params); - REQUIRE(ctx != nullptr); - - if (verbosity >= 1) { - const int n_tensors = gguf_get_n_tensors(ctx); - const int n_kv = gguf_get_n_kv(ctx); - const std::string ftype = get_ftype(get_u32(ctx, "general.file_type")); - const int idx_desc = get_key_index(ctx, "general.description"); - const std::string description = gguf_get_val_str(ctx, idx_desc); - const int idx_name = gguf_find_key(ctx, "general.name"); - if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug - const std::string name = gguf_get_val_str(ctx, idx_name); - LOG("model name: %s", name.c_str()); - } - LOG("description: %s", description.c_str()); - LOG("GGUF version: %d", gguf_get_version(ctx)); - LOG("alignment: %zu", gguf_get_alignment(ctx)); - LOG("n_tensors: %d", n_tensors); - LOG("n_kv: %d", n_kv); - LOG("ftype: %s", ftype.c_str()); - LOG(""); - } - const int n_tensors = gguf_get_n_tensors(ctx); - - mllama_ctx *new_mllama = new mllama_ctx{}; - - ggml_backend_t backend = ggml_backend_init_best(); - if (backend == nullptr) { - LOG("%s: failed to initialize backend\n", __func__); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend)); - new_mllama->backend = backend; - - // load tensors - { - std::vector read_buf; - struct ggml_init_params params = { - (n_tensors + 1) * ggml_tensor_overhead(), // mem_size - nullptr, // mem_buffer - true, // no_alloc - }; - - new_mllama->ctx_data = ggml_init(params); - if (!new_mllama->ctx_data) { - LOG("ggml_init() failed"); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - -#ifdef _WIN32 - int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); - if (!wlen) { - return NULL; - } - wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); - wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen); - if (!wlen) { - free(wbuf); - return NULL; - } -#if __GLIBCXX__ - int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY); - __gnu_cxx::stdio_filebuf buffer(fd, std::ios_base::in); - std::istream fin(&buffer); -#else // MSVC - // unused in our current build - auto fin = std::ifstream(wbuf, std::ios::binary); -#endif - free(wbuf); -#else - auto fin = std::ifstream(fname, std::ios::binary); -#endif - if (!fin) { - LOG("cannot open model file for loading tensors\n"); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - - // add tensors to context - for (int i = 0; i < n_tensors; ++i) { - const char *name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor *t = ggml_get_tensor(meta, name); - struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t); - ggml_set_name(cur, name); - } - - // alloc memory and offload data - new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend); - for (int i = 0; i < n_tensors; ++i) { - const char *name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name); - const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); - fin.seekg(offset, std::ios::beg); - if (!fin) { - LOG("failed to seek for tensor %s\n", name); - mllama_free(new_mllama); - 
gguf_free(ctx); - return nullptr; - } - int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - } - -#if defined(_WIN32) && defined(__GLIBCXX__) - close(fd); -#else - fin.close(); -#endif - } - - // vision model - // load vision model - auto &vision_model = new_mllama->vision_model; - auto &hparams = vision_model.hparams; - hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length"); - hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count"); - hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length"); - hparams.n_layer = get_u32(ctx, "mllama.vision.block_count"); - hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count"); - hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles"); - hparams.image_size = get_u32(ctx, "mllama.vision.image_size"); - hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size"); - hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim"); - hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon"); - - std::vector intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices"); - hparams.intermediate_layers.resize(hparams.n_layer); - for (size_t i = 0; i < intermediate_layers_indices.size(); i++) { - hparams.intermediate_layers[intermediate_layers_indices[i]] = true; - } - - if (verbosity >= 2) { - LOG(""); - LOG("vision model hparams"); - LOG("image_size %d", hparams.image_size); - LOG("patch_size %d", hparams.patch_size); - LOG("v_hidden_size %d", hparams.hidden_size); - LOG("v_n_intermediate %d", hparams.n_intermediate); - LOG("v_projection_dim %d", hparams.projection_dim); - LOG("v_n_head %d", hparams.n_head); - LOG("v_n_layer %d", hparams.n_layer); - LOG("v_n_global_layer %d", hparams.n_global_layer); - LOG("v_eps %f", hparams.eps); - } - - vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true); - vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true); - - vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true); - vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true); - - vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true); - vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true); - vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true); - vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true); - - vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true); - vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true); - - vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true); - vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true); - - 
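The hyperparameters read above come straight from GGUF key/value metadata. Outside of this loader, the same values can be fetched with the plain gguf API; a minimal standalone sketch, where "model.gguf" is a placeholder path and only two of the keys are shown:

    #include "ggml.h"
    #include "gguf.h"   // in older ggml trees the gguf API is declared in ggml.h
    #include <cstdio>

    int main() {
        struct ggml_context * meta = nullptr;
        struct gguf_init_params params = {
            /*.no_alloc =*/ true,   // metadata only, no tensor data
            /*.ctx      =*/ &meta,
        };

        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (ctx == nullptr) {
            return 1;
        }

        const int64_t i_embd   = gguf_find_key(ctx, "mllama.vision.embedding_length");
        const int64_t i_layers = gguf_find_key(ctx, "mllama.vision.block_count");
        if (i_embd >= 0 && i_layers >= 0) {
            std::printf("hidden_size=%u n_layer=%u\n",
                        gguf_get_val_u32(ctx, i_embd),
                        gguf_get_val_u32(ctx, i_layers));
        }

        gguf_free(ctx);
        ggml_free(meta);
        return 0;
    }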
vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true); - vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true); - - vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false); - vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false); - - vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer); - vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer); - - ggml_free(meta); - - new_mllama->ctx_gguf = ctx; - - { - // measure mem requirement and allocate - new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend)); - struct mllama_image_batch batch; - batch.size = 1; - ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch); - ggml_gallocr_reserve(new_mllama->compute_alloc, gf); - size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0); - LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0); - } - - return new_mllama; -} - -struct mllama_image *mllama_image_init() { - return new mllama_image(); -} - -void mllama_image_free(struct mllama_image *img) { delete img; } -void mllama_image_batch_free(struct mllama_image_batch *batch) { - if (batch->size > 0) { - delete[] batch->data; - batch->size = 0; - } -} - -bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) { - img->width = width; - img->height = height; - img->num_channels = num_channels; - img->num_tiles = num_tiles; - img->aspect_ratio_id = aspect_ratio_id; - img->data.resize(n); - - memcpy(img->data.data(), data, n); - return true; -} - -inline int mllama(int x, int lower, int upper) { - return std::max(lower, std::min(x, upper)); -} - -void mllama_free(mllama_ctx *ctx) { - ggml_free(ctx->ctx_data); - gguf_free(ctx->ctx_gguf); - - ggml_backend_buffer_free(ctx->params_buffer); - ggml_backend_free(ctx->backend); - ggml_gallocr_free(ctx->compute_alloc); - delete ctx; -} - -bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) { - mllama_image_batch imgs{}; - imgs.size = 1; - imgs.data = img; - return mllama_image_batch_encode(ctx, n_threads, &imgs, vec); -} - -bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) { - int batch_size = imgs->size; - REQUIRE(batch_size == 1); - - // build the inference graph - ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs); - ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); - - // set inputs - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding == nullptr ? 
0 : 1); - - { - struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw)); - } - - { - struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings"); - if (embeddings != nullptr) { - void *zeros = malloc(ggml_nbytes(embeddings)); - memset(zeros, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings)); - free(zeros); - } - } - - { - struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions"); - if (positions != nullptr) { - int *positions_data = (int *)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - } - - { - struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios"); - if (aspect_ratios != nullptr) { - int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios)); - aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id; - ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios)); - free(aspect_ratios_data); - } - } - - if (ggml_backend_is_cpu(ctx->backend)) { - ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); - } - - ggml_backend_graph_compute(ctx->backend, gf); - - // the last node is the embedding tensor - struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1); - - // copy the embeddings to the location passed by the user - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - - return true; -} - -int32_t mllama_image_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.image_size; -} - -int32_t mllama_patch_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.patch_size; -} - -int32_t mllama_hidden_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.hidden_size; -} - -int mllama_n_patches(const struct mllama_ctx *ctx) { - const auto &hparams = ctx->vision_model.hparams; - return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size); -} - -int mllama_n_positions(const struct mllama_ctx *ctx) { - return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 
0 : 1); -} - -int mllama_n_tiles(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.n_tiles; -} - -int mllama_n_embd(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.projection_dim; -} - -size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) { - return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float); -} diff --git a/llama/mllama.h b/llama/mllama.h deleted file mode 100644 index 446dbb9ec..000000000 --- a/llama/mllama.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef MLLAMA_H -#define MLLAMA_H - -#include -#include - -#ifdef LLAMA_SHARED -#if defined(_WIN32) && !defined(__MINGW32__) -#ifdef LLAMA_BUILD -#define MLLAMA_API __declspec(dllexport) -#else -#define MLLAMA_API __declspec(dllimport) -#endif -#else -#define MLLAMA_API __attribute__((visibility("default"))) -#endif -#else -#define MLLAMA_API -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -struct mllama_ctx; - -struct mllama_image_batch { - struct mllama_image *data; - size_t size; -}; - -MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity); -MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity); - -MLLAMA_API void mllama_free(struct mllama_ctx *ctx); - -MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx); -MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx); -MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx); - -MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx); -MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx); - -MLLAMA_API struct mllama_image *mllama_image_init(); - -MLLAMA_API void mllama_image_free(struct mllama_image *img); -MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch); - -MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img); - -MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec); -MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec); - -#ifdef __cplusplus -} -#endif - -#endif // MLLAMA_H diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index c630f2439..deb53c225 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644 + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); ++ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); diff --git a/llama/patches/0006-add-mllama-support.patch b/llama/patches/0006-add-mllama-support.patch deleted file mode 100644 index 05f85ec34..000000000 --- a/llama/patches/0006-add-mllama-support.patch +++ /dev/null @@ -1,1027 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Sun, 20 Apr 2025 16:12:36 -0700 -Subject: [PATCH] add mllama support - -adds support for the llama 3.2 vision 
architecture ---- - ggml/src/ggml-backend-reg.cpp | 6 +- - include/llama.h | 6 + - src/llama-arch.cpp | 44 +++++ - src/llama-arch.h | 10 ++ - src/llama-batch.cpp | 3 + - src/llama-context.cpp | 23 ++- - src/llama-context.h | 1 + - src/llama-cparams.h | 1 + - src/llama-graph.cpp | 25 +++ - src/llama-graph.h | 12 ++ - src/llama-hparams.cpp | 4 + - src/llama-hparams.h | 7 + - src/llama-kv-cache.cpp | 14 +- - src/llama-model-loader.cpp | 2 + - src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++- - src/llama-model.h | 12 ++ - src/llama-quant.cpp | 4 +- - tools/mtmd/llava.cpp | 5 +- - tools/mtmd/mtmd-helper.cpp | 7 +- - 19 files changed, 475 insertions(+), 22 deletions(-) - -diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 405d8e31..82ae1b5b 100644 ---- a/ggml/src/ggml-backend-reg.cpp -+++ b/ggml/src/ggml-backend-reg.cpp -@@ -178,9 +178,9 @@ struct ggml_backend_registry { - #ifdef GGML_USE_CANN - register_backend(ggml_backend_cann_reg()); - #endif --#ifdef GGML_USE_BLAS -- register_backend(ggml_backend_blas_reg()); --#endif -+// #ifdef GGML_USE_BLAS -+// register_backend(ggml_backend_blas_reg()); -+// #endif - #ifdef GGML_USE_RPC - register_backend(ggml_backend_rpc_reg()); - #endif -diff --git a/include/llama.h b/include/llama.h -index abedebdb..41beef21 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -258,6 +258,7 @@ extern "C" { - - llama_token * token; - float * embd; -+ int32_t n_embd; - llama_pos * pos; - int32_t * n_seq_id; - llama_seq_id ** seq_id; -@@ -365,6 +366,7 @@ extern "C" { - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings - bool op_offload; // whether to offload host tensor operations to device -+ bool cross_attn; // whether to use cross attention - }; - - // model quantization parameters -@@ -464,6 +466,10 @@ extern "C" { - struct llama_context_params params), - "use llama_init_from_model instead"); - -+ // TODO (jmorganca): this should most likely be passed in as part of a batch -+ // and not set on the context for all batches. 
-+ LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state); -+ - // Frees all allocated memory - LLAMA_API void llama_free(struct llama_context * ctx); - -diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index 5ab3f572..eb7b5325 100644 ---- a/src/llama-arch.cpp -+++ b/src/llama-arch.cpp -@@ -6,6 +6,7 @@ - - static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, -+ { LLM_ARCH_MLLAMA, "mllama" }, - { LLM_ARCH_LLAMA4, "llama4" }, - { LLM_ARCH_DECI, "deci" }, - { LLM_ARCH_FALCON, "falcon" }, -@@ -144,6 +145,7 @@ static const std::map LLM_KV_NAMES = { - { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, - { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, - { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, -+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, - { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, - { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, - -@@ -273,6 +275,40 @@ static const std::map> LLM_TENSOR_N - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, -+ { -+ LLM_ARCH_MLLAMA, -+ { -+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, -+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, -+ { LLM_TENSOR_OUTPUT, "output" }, -+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, -+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, -+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, -+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, -+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, -+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, -+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, -+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, -+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, -+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, -+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, -+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, -+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, -+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, -+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, -+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, -+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, -+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, -+ { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" }, -+ { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" }, -+ { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" }, -+ { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" }, -+ }, -+ }, - { - LLM_ARCH_DECI, - { -@@ -1701,6 +1737,14 @@ static const std::map LLM_TENSOR_INFOS = { - // this tensor is loaded for T5, but never used - {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, - {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ 
{LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, - {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -diff --git a/src/llama-arch.h b/src/llama-arch.h -index 525c1b7d..bc8a4f0b 100644 ---- a/src/llama-arch.h -+++ b/src/llama-arch.h -@@ -11,6 +11,7 @@ - enum llm_arch { - LLM_ARCH_LLAMA, - LLM_ARCH_LLAMA4, -+ LLM_ARCH_MLLAMA, - LLM_ARCH_DECI, - LLM_ARCH_FALCON, - LLM_ARCH_BAICHUAN, -@@ -148,6 +149,7 @@ enum llm_kv { - LLM_KV_ATTENTION_SLIDING_WINDOW, - LLM_KV_ATTENTION_SCALE, - LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, -+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, - LLM_KV_ATTENTION_KEY_LENGTH_MLA, - LLM_KV_ATTENTION_VALUE_LENGTH_MLA, - -@@ -349,6 +351,14 @@ enum llm_tensor { - LLM_TENSOR_CLS, - LLM_TENSOR_CLS_OUT, - LLM_TENSOR_BSKCN_TV, -+ LLM_TENSOR_CROSS_ATTN_K_NORM, -+ LLM_TENSOR_CROSS_ATTN_K_PROJ, -+ LLM_TENSOR_CROSS_ATTN_O_PROJ, -+ LLM_TENSOR_CROSS_ATTN_Q_NORM, -+ LLM_TENSOR_CROSS_ATTN_Q_PROJ, -+ LLM_TENSOR_CROSS_ATTN_V_PROJ, -+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE, -+ LLM_TENSOR_CROSS_ATTN_MLP_GATE, - LLM_TENSOR_CONV1D, - LLM_TENSOR_CONVNEXT_DW, - LLM_TENSOR_CONVNEXT_NORM, -diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp -index a88b2fe3..241b316e 100644 ---- a/src/llama-batch.cpp -+++ b/src/llama-batch.cpp -@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one( - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens, - /*embd =*/ nullptr, -+ /*n_embd =*/ 0, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, -@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ - /*n_tokens =*/ 0, - /*tokens =*/ nullptr, - /*embd =*/ nullptr, -+ /*n_embd =*/ 0, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, -@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ - - if (embd) { - batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); -+ batch.n_embd = embd; - } else { - batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); - } -diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index dca22d8b..c22687e4 100644 ---- a/src/llama-context.cpp -+++ b/src/llama-context.cpp -@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) { - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } - -- return logits + j*model.vocab.n_tokens(); -+ return logits + j*model.hparams.n_vocab; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); - #ifndef NDEBUG -@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) { - cparams.warmup = value; - } - -+void llama_context::set_cross_attn(bool value) { -+ cparams.cross_attn = value; -+} -+ - void llama_context::set_adapter_lora( - llama_adapter_lora * adapter, - float scale) { -@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) { - - const int64_t n_embd = hparams.n_embd; - -- llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); -+ llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); - -@@ -863,10 +867,9 @@ int 
llama_context::decode(llama_batch & inp_batch) { - - const llama_batch & batch = batch_allocr.batch; - -- const auto & vocab = model.vocab; - const auto & hparams = model.hparams; - -- const int32_t n_vocab = vocab.n_tokens(); -+ const int32_t n_vocab = hparams.n_vocab; - - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; -@@ -1087,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) { - // make the outputs have the same order they had in the user-provided batch - // note: this is mostly relevant for recurrent models atm - if (!sorted_output) { -- const uint32_t n_vocab = model.vocab.n_tokens(); -+ const uint32_t n_vocab = model.hparams.n_vocab; - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); -@@ -1142,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) { - - int32_t llama_context::output_reserve(int32_t n_outputs) { - const auto & hparams = model.hparams; -- const auto & vocab = model.vocab; - - const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); - - const auto n_batch = cparams.n_batch; -- const auto n_vocab = vocab.n_tokens(); -+ const auto n_vocab = hparams.n_vocab; - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead -@@ -1682,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { - { - LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - -- const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); -+ const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab); - - io.write(&logits_size, sizeof(logits_size)); - -@@ -2091,6 +2093,7 @@ llama_context_params llama_context_default_params() { - /*.flash_attn =*/ false, - /*.no_perf =*/ true, - /*.op_offload =*/ true, -+ /*.cross_attn =*/ false, - }; - - return result; -@@ -2216,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { - ctx->set_warmup(warmup); - } - -+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { -+ ctx->set_cross_attn(cross_attention); -+} -+ - void llama_synchronize(llama_context * ctx) { - ctx->synchronize(); - } -diff --git a/src/llama-context.h b/src/llama-context.h -index c0ceacb1..c4ab242a 100644 ---- a/src/llama-context.h -+++ b/src/llama-context.h -@@ -71,6 +71,7 @@ struct llama_context { - void set_embeddings (bool value); - void set_causal_attn(bool value); - void set_warmup(bool value); -+ void set_cross_attn(bool value); - - void set_adapter_lora( - llama_adapter_lora * adapter, -diff --git a/src/llama-cparams.h b/src/llama-cparams.h -index 246fa577..7a6156ce 100644 ---- a/src/llama-cparams.h -+++ b/src/llama-cparams.h -@@ -31,6 +31,7 @@ struct llama_cparams { - bool no_perf; - bool warmup; - bool op_offload; -+ bool cross_attn; - - enum llama_pooling_type pooling_type; - -diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp -index b0e3f635..f14869cf 100644 ---- a/src/llama-graph.cpp -+++ b/src/llama-graph.cpp -@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { - } - } - -+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) { -+ if (ubatch->embd) { -+ ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state)); -+ } -+} -+ - // - // llm_graph_context - // -@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const 
{ - return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); - } - -+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const { -+ const int64_t n_embd = hparams.n_embd; -+ -+ auto inp = std::make_unique(); -+ -+ ggml_tensor * cur = nullptr; -+ -+ inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4); -+ ggml_set_input(inp->cross_attn_state); -+ -+ cur = inp->cross_attn_state; -+ -+ cb(cur, "inp_cross_attn_state", -1); -+ -+ res->add_input(std::move(inp)); -+ -+ return cur; -+} -+ - ggml_tensor * llm_graph_context::build_attn( - llm_graph_input_attn_cross * inp, - ggml_cgraph * gf, -diff --git a/src/llama-graph.h b/src/llama-graph.h -index 832a8c09..5a322785 100644 ---- a/src/llama-graph.h -+++ b/src/llama-graph.h -@@ -87,6 +87,7 @@ public: - - ggml_tensor * tokens = nullptr; // I32 [n_batch] - ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] -+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] - }; - - class llm_graph_input_pos : public llm_graph_input_i { -@@ -284,6 +285,16 @@ public: - const llama_cross * cross = nullptr; - }; - -+class llm_graph_input_cross_attn_state : public llm_graph_input_i { -+public: -+ llm_graph_input_cross_attn_state() = default; -+ virtual ~llm_graph_input_cross_attn_state() = default; -+ -+ void set_input(const llama_ubatch * ubatch) override; -+ -+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] -+}; -+ - // - // llm_graph_result - // -@@ -495,6 +506,7 @@ struct llm_graph_context { - ggml_tensor * build_inp_cls() const; - ggml_tensor * build_inp_s_copy() const; - ggml_tensor * build_inp_s_mask() const; -+ ggml_tensor * build_inp_cross_attn_state() const; - - ggml_tensor * build_inp_cross_embd() const; - ggml_tensor * build_inp_pos_bucket_enc() const; -diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp -index 8a667960..6a02de03 100644 ---- a/src/llama-hparams.cpp -+++ b/src/llama-hparams.cpp -@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const { - - GGML_ABORT("fatal error"); - } -+ -+bool llama_hparams::cross_attention_layers(uint32_t il) const { -+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); -+} -diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 48dce407..b6fc7e6d 100644 ---- a/src/llama-hparams.h -+++ b/src/llama-hparams.h -@@ -2,6 +2,8 @@ - - #include "llama.h" - -+#include -+ - #include - - // bump if necessary -@@ -42,6 +44,7 @@ struct llama_hparams { - uint32_t n_expert = 0; - uint32_t n_expert_used = 0; - uint32_t n_rel_attn_bkts = 0; -+ uint32_t n_vocab = 0; - - // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - uint32_t n_embd_head_k_mla = 0; -@@ -56,6 +59,7 @@ struct llama_hparams { - std::array n_ff_arr; - - std::array, 4> n_bskcn_arr = {}; -+ std::array cross_attn_layers; - - uint32_t n_layer_dense_lead = 0; - uint32_t n_lora_q = 0; -@@ -159,6 +163,9 @@ struct llama_hparams { - // Block skip connection - bool n_bskcn(uint32_t n, uint32_t il) const; - -+ // cross attention layers -+ bool cross_attention_layers(uint32_t il) const; -+ - bool is_swa(uint32_t il) const; - }; - -diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index 3dcad65b..a7b0a7eb 100644 ---- a/src/llama-kv-cache.cpp -+++ b/src/llama-kv-cache.cpp -@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified( - throw std::runtime_error("failed to create ggml context for kv cache"); - } - -- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); 
-- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); -+ ggml_tensor * k, *v; -+ -+ // for cross attention layers -+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { -+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i)); -+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i)); -+ } else { -+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); -+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); -+ } - ggml_format_name(k, "cache_k_l%d", i); - ggml_format_name(v, "cache_v_l%d", i); - k_l.push_back(k); -@@ -446,7 +454,7 @@ void llama_kv_cache_unified::set_full() { - llama_sbatch llama_kv_cache_unified::sbatch_init( - const llama_batch & batch, - bool logits_all) { -- return llama_sbatch(batch, hparams.n_embd, true, logits_all); -+ return llama_sbatch(batch, batch.n_embd, true, logits_all); - } - - llama_ubatch llama_kv_cache_unified::ubatch_next( -diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp -index 7f6617fa..2acfd4a8 100644 ---- a/src/llama-model-loader.cpp -+++ b/src/llama-model-loader.cpp -@@ -315,6 +315,8 @@ namespace GGUFMeta { - return true; - } - -+ template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array& result, bool required); -+ - template - bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { - const int kid = gguf_find_key(meta.get(), key.c_str()); -diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 831b68c0..e8298f56 100644 ---- a/src/llama-model.cpp -+++ b/src/llama-model.cpp -@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - - // get general kv - ml.get_key(LLM_KV_GENERAL_NAME, name, false); -+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); - - // everything past this point is not vocab-related - if (hparams.vocab_only) { -@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); -+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false); - - if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { - ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); -@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { - std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); - std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); - std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); -+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); - - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); -+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); - - // n_head_kv is optional, default to n_head - hparams.n_head_kv_arr = hparams.n_head_arr; -@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - - ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - -- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { -+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { - if (hparams.n_rot != 
hparams.n_embd_head_k) { - throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); - } -@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { - hparams.use_kq_norm = false; - } - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -+ -+ switch (hparams.n_layer) { -+ case 40: type = LLM_TYPE_11B; break; -+ case 100: type = LLM_TYPE_90B; break; -+ default: type = LLM_TYPE_UNKNOWN; -+ } -+ } break; - case LLM_ARCH_DECI: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_ff = hparams.n_ff(); - const int64_t n_embd_gqa = n_embd_v_gqa; -- const int64_t n_vocab = vocab.n_tokens(); -+ const int64_t n_vocab = hparams.n_vocab; - const int64_t n_token_types = vocab.n_token_types(); - const int64_t n_rot = hparams.n_rot; - const int64_t n_expert = hparams.n_expert; -@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { - } - } - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); -+ -+ // output -+ { -+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); -+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); -+ -+ // if output is NULL, init from the input tok embed -+ if (output == NULL) { -+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); -+ } -+ } -+ -+ for (int i = 0; i < n_layer; ++i) { -+ auto & layer = layers[i]; -+ -+ if (hparams.cross_attention_layers(i)) { -+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); -+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0); -+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0); -+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0); -+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0); -+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0); -+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0); -+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0); -+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); -+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); -+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); -+ } else { -+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); -+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); -+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); -+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); -+ layer.wo = 
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); -+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); -+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); -+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); -+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -+ } -+ } -+ } break; - case LLM_ARCH_DECI: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); -@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context { - } - }; - -+struct llm_build_mllama: public llm_graph_context { -+ llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { -+ // mutable variable, needed during the last layer of the computation to skip unused tokens -+ int32_t n_tokens = this->n_tokens; -+ -+ const int64_t n_embd_head = hparams.n_embd_head_v; -+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); -+ GGML_ASSERT(n_embd_head == hparams.n_rot); -+ -+ ggml_tensor * cur; -+ ggml_tensor * inpL; -+ ggml_tensor * inpCAS; -+ -+ inpL = build_inp_embd(model.tok_embd); -+ inpCAS = build_inp_cross_attn_state(); -+ -+ // inp_pos - contains the positions -+ ggml_tensor * inp_pos = build_inp_pos(); -+ -+ auto * inp_attn = build_attn_inp_kv_unified(); -+ const llama_kv_cache_unified * kv_self = static_cast(memory); -+ -+ for (int il = 0; il < n_layer; ++il) { -+ ggml_tensor * inpSA = inpL; -+ -+ // norm -+ cur = build_norm(inpL, -+ model.layers[il].attn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "attn_norm", il); -+ -+ if (hparams.cross_attention_layers(il)) { -+ if (!ubatch.embd && !cparams.cross_attn) { -+ continue; -+ } -+ -+ // cross attention layer -+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il); -+ cb(Qcur, "Qcur", il); -+ -+ ggml_tensor * Kcur, * Vcur; -+ if (ubatch.embd) { -+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il); -+ cb(Kcur, "Kcur", il); -+ -+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il])); -+ -+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); -+ cb(Vcur, "Vcur", il); -+ -+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404); -+ cb(Vcur, "Vcur", il); -+ -+ Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); -+ cb(Vcur, "Vcur", il); -+ -+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il])); -+ } else { -+ Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]); -+ cb(Kcur, "Kcur (view)", il); -+ -+ Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]); -+ cb(Vcur, "Vcur (view)", il); -+ } -+ -+ struct 
ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur); -+ cb(kq, "kq", il); -+ -+ // TODO: apply causal masks -+ struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); -+ cb(kq_soft_max, "kq_soft_max", il); -+ -+ Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur)); -+ cb(Vcur, "Vcur", il); -+ -+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max); -+ cb(kqv, "kqv", il); -+ -+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); -+ cb(kqv_merged, "kqv_merged", il); -+ -+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); -+ cb(cur, "kqv_merged_cont", il); -+ -+ cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur); -+ cb(cur, "cur", il); -+ -+ // TODO: do this in place once? -+ cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate)); -+ -+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); -+ cb(ffn_inp, "ffn_inp", il); -+ -+ // feed-forward network -+ cur = build_norm(ffn_inp, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "ffn_norm", il); -+ -+ cur = build_ffn(cur, -+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, -+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, -+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, -+ NULL, -+ LLM_FFN_SILU, LLM_FFN_PAR, il); -+ cb(cur, "ffn_out", il); -+ -+ // TODO: do this inplace once? -+ cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); -+ cb(cur, "ffn_out", il); -+ -+ cur = build_cvec(cur, il); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } else { -+ // self attention layer -+ -+ // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); -+ -+ // compute Q and K and RoPE them -+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); -+ cb(Qcur, "Qcur", il); -+ if (model.layers[il].bq) { -+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); -+ cb(Qcur, "Qcur", il); -+ } -+ -+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); -+ cb(Kcur, "Kcur", il); -+ if (model.layers[il].bk) { -+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); -+ cb(Kcur, "Kcur", il); -+ } -+ -+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); -+ cb(Vcur, "Vcur", il); -+ if (model.layers[il].bv) { -+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); -+ cb(Vcur, "Vcur", il); -+ } -+ -+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); -+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); -+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); -+ -+ Qcur = ggml_rope_ext( -+ ctx0, Qcur, inp_pos, rope_factors, -+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow -+ ); -+ -+ Kcur = ggml_rope_ext( -+ ctx0, Kcur, inp_pos, rope_factors, -+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow -+ ); -+ -+ cb(Qcur, "Qcur", il); -+ cb(Kcur, "Kcur", il); -+ cb(Vcur, "Vcur", il); -+ -+ cur = build_attn(inp_attn, gf, -+ model.layers[il].wo, model.layers[il].bo, -+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); -+ -+ if (il == n_layer - 1) { -+ // skip computing output for unused tokens -+ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); -+ n_tokens = n_outputs; -+ cur 
= ggml_get_rows(ctx0, cur, inp_out_ids); -+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); -+ } -+ -+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); -+ cb(ffn_inp, "ffn_inp", il); -+ -+ // feed-forward network -+ cur = build_norm(ffn_inp, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "ffn_norm", il); -+ -+ cur = build_ffn(cur, -+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, -+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, -+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, -+ NULL, -+ LLM_FFN_SILU, LLM_FFN_PAR, il); -+ cb(cur, "ffn_out", il); -+ -+ cur = ggml_add(ctx0, cur, ffn_inp); -+ cb(cur, "ffn_out", il); -+ -+ cur = build_cvec(cur, il); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } -+ } -+ -+ cur = inpL; -+ -+ cur = build_norm(cur, -+ model.output_norm, NULL, -+ LLM_NORM_RMS, -1); -+ cb(cur, "result_norm", -1); -+ res->t_embd = cur; -+ -+ // lm_head -+ cur = build_lora_mm(model.output, cur); -+ -+ cb(cur, "result_output", -1); -+ res->t_logits = cur; -+ -+ ggml_build_forward_expand(gf, cur); -+ } -+}; -+ - struct llm_build_deci : public llm_graph_context { - llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; -@@ -12496,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context { - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models -- ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); -+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); - - // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); -@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph( - { - llm = std::make_unique(*this, params, gf); - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ llm = std::make_unique(*this, params, gf); -+ } break; - case LLM_ARCH_DECI: - { - llm = std::make_unique(*this, params, gf); -@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { - // use what we call a normal RoPE, operating on pairs of consecutive head values - case LLM_ARCH_LLAMA: - case LLM_ARCH_LLAMA4: -+ case LLM_ARCH_MLLAMA: - case LLM_ARCH_DECI: - case LLM_ARCH_BAICHUAN: - case LLM_ARCH_STARCODER: -diff --git a/src/llama-model.h b/src/llama-model.h -index 43746c7d..9281e629 100644 ---- a/src/llama-model.h -+++ b/src/llama-model.h -@@ -11,6 +11,7 @@ - #include - #include - #include -+#include - - struct llama_cparams; - struct llama_ubatch; -@@ -74,6 +75,7 @@ enum llm_type { - LLM_TYPE_40B, - LLM_TYPE_65B, - LLM_TYPE_70B, -+ LLM_TYPE_90B, - LLM_TYPE_236B, - LLM_TYPE_290B, - LLM_TYPE_314B, -@@ -318,6 +320,16 @@ struct llama_layer { - - struct ggml_tensor * bskcn_tv = nullptr; - -+ // cross attention -+ struct ggml_tensor * cross_attn_k_norm = nullptr; -+ struct ggml_tensor * cross_attn_k_proj = nullptr; -+ struct ggml_tensor * cross_attn_o_proj = nullptr; -+ struct ggml_tensor * cross_attn_q_norm = nullptr; -+ struct ggml_tensor * cross_attn_q_proj = nullptr; -+ struct ggml_tensor * cross_attn_v_proj = nullptr; -+ struct ggml_tensor * cross_attn_attn_gate = nullptr; -+ struct ggml_tensor * cross_attn_mlp_gate = nullptr; -+ - struct llama_layer_posnet posnet; - - struct llama_layer_convnext convnext; -diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp -index 820d5128..56531980 100644 ---- 
a/src/llama-quant.cpp -+++ b/src/llama-quant.cpp -@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: - if (llama_model_has_encoder(&model)) { - n_attn_layer *= 3; - } -- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); -+ if (qs.n_attention_wv != n_attn_layer) { -+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv); -+ } - } - - size_t total_size_org = 0; -diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp -index ebef8b3c..b0eb79bb 100644 ---- a/tools/mtmd/llava.cpp -+++ b/tools/mtmd/llava.cpp -@@ -462,7 +462,7 @@ struct llava_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -474,6 +474,7 @@ struct llava_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ - n_eval = n_batch; - } - float * embd = image_embed->embed+i*n_embd; -- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); -+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); - if (llama_decode(ctx_llama, llava_batch.batch)) { - LOG_ERR("%s : failed to eval\n", __func__); - return false; -diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp -index 7a328867..61ebdd43 100644 ---- a/tools/mtmd/mtmd-helper.cpp -+++ b/tools/mtmd/mtmd-helper.cpp -@@ -58,7 +58,7 @@ struct decode_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { - pos .resize(n_tokens * n_pos_per_embd); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -69,6 +69,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -131,6 +132,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ batch.embd + offset * n_mmproj_embd, -+ /*n_embd =*/ batch.n_embd, - /*pos =*/ pos_ptr, - /*n_seq_id =*/ batch.n_seq_id + offset, - /*seq_id =*/ batch.seq_id + offset, -@@ -166,7 +168,8 @@ int32_t mtmd_helper_decode_image_chunk( - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); - int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; -- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd); -+ int n_embd = llama_model_n_embd(llama_get_model(lctx)); -+ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id); - - const int nx = mtmd_image_tokens_get_nx(image_tokens); - const int ny = mtmd_image_tokens_get_ny(image_tokens); diff --git 
a/llama/patches/0008-fix-deepseek-deseret-regex.patch b/llama/patches/0006-fix-deepseek-deseret-regex.patch similarity index 100% rename from llama/patches/0008-fix-deepseek-deseret-regex.patch rename to llama/patches/0006-fix-deepseek-deseret-regex.patch diff --git a/llama/patches/0007-add-unpad-operator.patch b/llama/patches/0007-add-unpad-operator.patch deleted file mode 100644 index fc45aeff4..000000000 --- a/llama/patches/0007-add-unpad-operator.patch +++ /dev/null @@ -1,419 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Sun, 13 Apr 2025 22:10:06 -0400 -Subject: [PATCH] add unpad operator - -adds the unpad operator to GGML ---- - ggml/include/ggml.h | 10 +++++ - ggml/src/ggml-cpu/ggml-cpu.c | 5 +++ - ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++ - ggml/src/ggml-cpu/ops.h | 1 + - ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++ - ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++ - ggml/src/ggml-cuda/pad.cuh | 1 + - ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++ - ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++ - ggml/src/ggml.c | 25 ++++++++++++- - 10 files changed, 223 insertions(+), 2 deletions(-) - -diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h -index e91dedf1..8dc107ba 100644 ---- a/ggml/include/ggml.h -+++ b/ggml/include/ggml.h -@@ -489,6 +489,7 @@ extern "C" { - GGML_OP_UPSCALE, // nearest interpolate - GGML_OP_PAD, - GGML_OP_PAD_REFLECT_1D, -+ GGML_OP_UNPAD, - GGML_OP_ARANGE, - GGML_OP_TIMESTEP_EMBEDDING, - GGML_OP_ARGSORT, -@@ -1781,6 +1782,15 @@ extern "C" { - int p0, - int p1); - -+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] -+ GGML_API struct ggml_tensor * ggml_unpad( -+ struct ggml_context * ctx, -+ struct ggml_tensor * a, -+ int p0, -+ int p1, -+ int p2, -+ int p3); -+ - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 - // timesteps: [N,] - // return: [N, dim] -diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index a30e67f2..835e6495 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu.c -+++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm - { - ggml_compute_forward_pad_reflect_1d(params, tensor); - } break; -+ case GGML_OP_UNPAD: -+ { -+ ggml_compute_forward_unpad(params, tensor); -+ } break; - case GGML_OP_ARANGE: - { - ggml_compute_forward_arange(params, tensor); -@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { - case GGML_OP_UPSCALE: - case GGML_OP_PAD: - case GGML_OP_PAD_REFLECT_1D: -+ case GGML_OP_UNPAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: -diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index 955fec59..1868a10c 100644 ---- a/ggml/src/ggml-cpu/ops.cpp -+++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d( - } - } - -+// ggml_compute_forward_unpad -+ -+static void ggml_compute_forward_unpad_f32( -+ const struct ggml_compute_params *params, -+ struct ggml_tensor *dst) { -+ -+ const struct ggml_tensor * src0 = dst->src[0]; -+ -+ GGML_ASSERT(src0->nb[0] == sizeof(float)); -+ GGML_ASSERT( dst->nb[0] == sizeof(float)); -+ -+ const int ith = params->ith; -+ const int nth = params->nth; -+ -+ GGML_TENSOR_UNARY_OP_LOCALS -+ -+ float * dst_ptr = (float *) dst->data; -+ -+ // TODO: optimize -+ -+ for (int64_t i2 = 0; i2 < ne2; ++i2) { -+ for (int64_t 
i1 = ith; i1 < ne1; i1 += nth) { -+ for (int64_t i0 = 0; i0 < ne0; ++i0) { -+ for (int64_t i3 = 0; i3 < ne3; ++i3) { -+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; -+ -+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); -+ -+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { -+ dst_ptr[dst_idx] = *src_ptr; -+ } -+ } -+ } -+ } -+ } -+} -+ -+void ggml_compute_forward_unpad( -+ const struct ggml_compute_params * params, -+ struct ggml_tensor * dst) { -+ -+ const struct ggml_tensor * src0 = dst->src[0]; -+ -+ switch (src0->type) { -+ case GGML_TYPE_F32: -+ { -+ ggml_compute_forward_unpad_f32(params, dst); -+ } break; -+ default: -+ { -+ GGML_ABORT("fatal error"); -+ } -+ } -+} -+ - // ggml_compute_forward_arange - - static void ggml_compute_forward_arange_f32( -diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h -index dc081b9e..a7125555 100644 ---- a/ggml/src/ggml-cpu/ops.h -+++ b/ggml/src/ggml-cpu/ops.h -@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params - void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); -+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cb0d8528..6fe86674 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg - case GGML_OP_PAD: - ggml_cuda_op_pad(ctx, dst); - break; -+ case GGML_OP_UNPAD: -+ ggml_cuda_op_unpad(ctx, dst); -+ break; - case GGML_OP_ARANGE: - ggml_cuda_op_arange(ctx, dst); - break; -@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g - case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; - case GGML_OP_PAD: -+ case GGML_OP_UNPAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_LEAKY_RELU: -diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu -index 77432b04..7d45a7e1 100644 ---- a/ggml/src/ggml-cuda/pad.cu -+++ b/ggml/src/ggml-cuda/pad.cu -@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); - } -+ -+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { -+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 -+ // blockIdx.y: idx of ne1 -+ // blockIDx.x: idx of ne0 / BLOCK_SIZE -+ int nidx = threadIdx.x + blockIdx.x * blockDim.x; -+ if (nidx >= ne0) { -+ return; -+ } -+ -+ // operation -+ int offset_dst = -+ nidx + -+ blockIdx.y * ne0 + -+ blockIdx.z * ne0 * gridDim.y; -+ if (nidx < 
ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { -+ int offset_src = -+ nidx + -+ blockIdx.y * ne00 + -+ blockIdx.z * ne00 * ne01; -+ dst[offset_dst] = x[offset_src]; -+ } -+} -+ -+static void unpad_f32_cuda(const float * x, float * dst, -+ const int ne00, const int ne01, const int ne02, const int ne03, -+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { -+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; -+ dim3 gridDim(num_blocks, ne1, ne2*ne3); -+ unpad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -+} -+ -+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { -+ const ggml_tensor * src0 = dst->src[0]; -+ const float * src0_d = (const float *)src0->data; -+ float * dst_d = (float *)dst->data; -+ cudaStream_t stream = ctx.stream(); -+ -+ GGML_ASSERT(src0->type == GGML_TYPE_F32); -+ GGML_ASSERT(dst->type == GGML_TYPE_F32); -+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors -+ -+ unpad_f32_cuda(src0_d, dst_d, -+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], -+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -+} -\ No newline at end of file -diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh -index 8fd386b0..e2ededc3 100644 ---- a/ggml/src/ggml-cuda/pad.cuh -+++ b/ggml/src/ggml-cuda/pad.cuh -@@ -3,3 +3,4 @@ - #define CUDA_PAD_BLOCK_SIZE 256 - - void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 1b56f858..7641247e 100644 ---- a/ggml/src/ggml-metal/ggml-metal.m -+++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte - GGML_METAL_KERNEL_TYPE_UPSCALE_F32, - GGML_METAL_KERNEL_TYPE_PAD_F32, - GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, -+ GGML_METAL_KERNEL_TYPE_UNPAD_F32, - GGML_METAL_KERNEL_TYPE_ARANGE_F32, - GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, - GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); -+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); -@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex - case GGML_OP_POOL_2D: - case GGML_OP_PAD: - case GGML_OP_PAD_REFLECT_1D: -+ case GGML_OP_UNPAD: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: - case GGML_OP_LEAKY_RELU: -@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node( - - const int nth = MIN(1024, ne0); - -+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; -+ } break; -+ case GGML_OP_UNPAD: -+ { -+ GGML_ASSERT(src0->type == GGML_TYPE_F32); -+ -+ id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline; -+ -+ [encoder setComputePipelineState:pipeline]; -+ [encoder setBuffer:id_src0 offset:offs_src0 
atIndex:0]; -+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; -+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; -+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; -+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; -+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; -+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; -+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; -+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; -+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; -+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; -+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; -+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; -+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; -+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; -+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; -+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; -+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; -+ -+ const int nth = MIN(1024, ne0); -+ - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARANGE: -diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal -index 9cfddf45..080a943b 100644 ---- a/ggml/src/ggml-metal/ggml-metal.metal -+++ b/ggml/src/ggml-metal/ggml-metal.metal -@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32( - } - } - -+kernel void kernel_unpad_f32( -+ device const char * src0, -+ device char * dst, -+ constant int64_t & ne00, -+ constant int64_t & ne01, -+ constant int64_t & ne02, -+ constant int64_t & ne03, -+ constant uint64_t & nb00, -+ constant uint64_t & nb01, -+ constant uint64_t & nb02, -+ constant uint64_t & nb03, -+ constant int64_t & ne0, -+ constant int64_t & ne1, -+ constant int64_t & ne2, -+ constant int64_t & ne3, -+ constant uint64_t & nb0, -+ constant uint64_t & nb1, -+ constant uint64_t & nb2, -+ constant uint64_t & nb3, -+ uint3 tgpig[[threadgroup_position_in_grid]], -+ uint3 tpitg[[thread_position_in_threadgroup]], -+ uint3 ntg[[threads_per_threadgroup]]) { -+ -+ const int64_t i3 = tgpig.z; -+ const int64_t i2 = tgpig.y; -+ const int64_t i1 = tgpig.x; -+ -+ const int64_t i03 = i3; -+ const int64_t i02 = i2; -+ const int64_t i01 = i1; -+ -+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); -+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); -+ -+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) { -+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { -+ if (i0 < ne00) { -+ dst_ptr[i0] = src0_ptr[i0]; -+ } -+ } -+ -+ return; -+ } -+} -+ - kernel void kernel_arange_f32( - device char * dst, - constant ggml_metal_kargs_arange & args, -diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 8a654624..6b034d35 100644 ---- a/ggml/src/ggml.c -+++ b/ggml/src/ggml.c -@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "UPSCALE", - "PAD", - "PAD_REFLECT_1D", -+ "UNPAD", - "ARANGE", - "TIMESTEP_EMBEDDING", - "ARGSORT", -@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "OPT_STEP_ADAMW", - }; - --static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); -+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); - - static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "none", -@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "upscale(x)", - "pad(x)", - "pad_reflect_1d(x)", -+ "unpad(x)", - "arange(start, stop, step)", 
- "timestep_embedding(timesteps, dim, max_period)", - "argsort(x)", -@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "adamw(x)", - }; - --static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); -+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); - - static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); - -@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( - return result; - } - -+// ggml_unpad -+ -+struct ggml_tensor * ggml_unpad( -+ struct ggml_context * ctx, -+ struct ggml_tensor * a, -+ int p0, int p1, int p2, int p3) { -+ -+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, -+ a->ne[0] - p0, -+ a->ne[1] - p1, -+ a->ne[2] - p2, -+ a->ne[3] - p3); -+ -+ result->op = GGML_OP_UNPAD; -+ result->src[0] = a; -+ -+ return result; -+} -+ - // ggml_arange - - struct ggml_tensor * ggml_arange( diff --git a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch similarity index 100% rename from llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch rename to llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch similarity index 94% rename from llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch rename to llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch index c5faeaaae..52116ce3f 100644 --- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch +++ b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch @@ -58,7 +58,7 @@ index c22687e4..c5948e8f 100644 auto * gf = graph_init(); diff --git a/src/llama-context.h b/src/llama-context.h -index c4ab242a..9970dfc6 100644 +index c0ceacb1..0264e937 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -5,6 +5,7 @@ @@ -70,10 +70,10 @@ index c4ab242a..9970dfc6 100644 #include "ggml-cpp.h" #include "ggml-opt.h" diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index a7b0a7eb..1a50c034 100644 +index 3dcad65b..60e67b03 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp -@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() { +@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() { } bool llama_kv_cache_unified::update(llama_context & lctx) { @@ -82,7 +82,7 @@ index a7b0a7eb..1a50c034 100644 auto * sched = lctx.get_sched(); if (has_shift) { -@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { +@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { res->set_inputs(nullptr); lctx.graph_compute(gf, false); @@ -91,7 +91,7 @@ index a7b0a7eb..1a50c034 100644 } { -@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { +@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { if (do_defrag) { LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); @@ -133,7 +133,7 @@ index a7b0a7eb..1a50c034 100644 } void llama_kv_cache_unified::defrag_sched(float thold) { -@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( +@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( const llama_cparams & cparams, ggml_context * ctx, @@ -147,7 +147,7 @@ index a7b0a7eb..1a50c034 100644 #if 0 // CPU defrag // -@@ -791,32 +795,20 @@ llm_graph_result_ptr 
llama_kv_cache_unified::build_graph_defrag( +@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); } #else @@ -185,7 +185,7 @@ index a7b0a7eb..1a50c034 100644 ggml_tensor * view_v_src; ggml_tensor * view_v_dst; -@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( +@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( if (cparams.flash_attn) { // NOTE: the V cache is not transposed when using flash attention view_v_src = ggml_view_2d(ctx, v_l[il], @@ -225,7 +225,7 @@ index a7b0a7eb..1a50c034 100644 } //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); -@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { assert(n_used <= n_kv); @@ -244,7 +244,7 @@ index a7b0a7eb..1a50c034 100644 // determine which KV cells to move where // -@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // // if ids[i] == i || ids[i] == n_kv, then cell i is not moved // @@ -256,7 +256,7 @@ index a7b0a7eb..1a50c034 100644 for (uint32_t i0 = 0; i0 < n_used; ++i0) { const auto & cell0 = cells[i0]; -@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // are we moving a continuous block of memory? bool cont = false; @@ -276,7 +276,7 @@ index a7b0a7eb..1a50c034 100644 cont = false; continue; } -@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { head = n_used; if (!cont) { @@ -288,7 +288,7 @@ index a7b0a7eb..1a50c034 100644 } nf++; -@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { } } diff --git a/llama/patches/0011-sort-devices-by-score.patch b/llama/patches/0009-sort-devices-by-score.patch similarity index 99% rename from llama/patches/0011-sort-devices-by-score.patch rename to llama/patches/0009-sort-devices-by-score.patch index 8c3908cf6..e27d1ae92 100644 --- a/llama/patches/0011-sort-devices-by-score.patch +++ b/llama/patches/0009-sort-devices-by-score.patch @@ -11,7 +11,7 @@ with the fastest acceleration is loaded 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 82ae1b5b..1487f322 100644 +index 405d8e31..4e67d243 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -157,7 +157,7 @@ struct ggml_backend_reg_entry { diff --git a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch similarity index 100% rename from llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch rename to llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch diff --git a/llama/patches/0013-remove-amx.patch b/llama/patches/0011-remove-amx.patch similarity index 100% rename from llama/patches/0013-remove-amx.patch rename to llama/patches/0011-remove-amx.patch diff --git a/llama/patches/0014-fix-string-arr-kv-loading.patch 
b/llama/patches/0012-fix-string-arr-kv-loading.patch similarity index 100% rename from llama/patches/0014-fix-string-arr-kv-loading.patch rename to llama/patches/0012-fix-string-arr-kv-loading.patch diff --git a/llama/patches/0015-ollama-debug-tensor.patch b/llama/patches/0013-ollama-debug-tensor.patch similarity index 91% rename from llama/patches/0015-ollama-debug-tensor.patch rename to llama/patches/0013-ollama-debug-tensor.patch index d8f9fc8a4..53d911277 100644 --- a/llama/patches/0015-ollama-debug-tensor.patch +++ b/llama/patches/0013-ollama-debug-tensor.patch @@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 835e6495..3902894b 100644 +index a30e67f2..2462d2b8 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,8 @@ @@ -20,7 +20,7 @@ index 835e6495..3902894b 100644 #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { +@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); diff --git a/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch similarity index 100% rename from llama/patches/0016-add-ollama-vocab-for-grammar-support.patch rename to llama/patches/0014-add-ollama-vocab-for-grammar-support.patch diff --git a/llm/memory.go b/llm/memory.go index 76082bf7c..b5a8dd5c6 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -111,9 +111,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) for _, projector := range projectors { - weight, graph := projectorMemoryRequirements(projector) + weight := projectorMemoryRequirements(projector) projectorWeights += weight - projectorGraph += graph // multimodal models require at least 2048 context opts.NumCtx = max(opts.NumCtx, 2048) @@ -409,51 +408,21 @@ func (m MemoryEstimate) LogValue() slog.Value { return slog.GroupValue(attrs...) 
} -func projectorMemoryRequirements(filename string) (weights, graphSize uint64) { +func projectorMemoryRequirements(filename string) (weights uint64) { file, err := os.Open(filename) if err != nil { - return 0, 0 + return 0 } defer file.Close() ggml, _, err := ggml.Decode(file, 1024) if err != nil { - return 0, 0 + return 0 } for _, layer := range ggml.Tensors().GroupLayers() { weights += layer.Size() } - switch arch := ggml.KV().Architecture(); arch { - case "mllama": - kv := func(n string) uint64 { - if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok { - return uint64(v) - } - - return 0 - } - - imageSize := kv("image_size") - - maxNumTiles := kv("max_num_tiles") - embeddingLength := kv("embedding_length") - headCount := kv("attention.head_count") - - numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size")) - if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok { - numPatches++ - } - - numPaddedPatches := numPatches + 8 - (numPatches%8)%8 - - graphSize = 4 * (8 + - imageSize*imageSize*kv("num_channels")*maxNumTiles + - embeddingLength*numPatches*maxNumTiles + - 9*embeddingLength*numPaddedPatches*maxNumTiles + - numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) - } - - return weights, graphSize + return weights } diff --git a/llm/server.go b/llm/server.go index a64669c2f..c07315fa3 100644 --- a/llm/server.go +++ b/llm/server.go @@ -679,9 +679,8 @@ ws ::= ([ \t\n] ws)? const maxBufferSize = 512 * format.KiloByte type ImageData struct { - Data []byte `json:"data"` - ID int `json:"id"` - AspectRatioID int `json:"aspect_ratio_id"` + Data []byte `json:"data"` + ID int `json:"id"` } type CompletionRequest struct { diff --git a/ml/backend.go b/ml/backend.go index ba24ecb45..f84a99845 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -161,7 +161,6 @@ type Tensor interface { Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor Pad(ctx Context, shape ...int) Tensor - Unpad(ctx Context, shape ...int) Tensor Stack(ctx Context, dim int, s ...Tensor) Tensor diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index e97795a69..e1aa687c8 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -1017,17 +1017,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor { } } -func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor { - if len(shape) != 4 { - panic("expected 4 dimensions") - } - - return &Tensor{ - b: t.b, - t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), - } -} - func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { switch len(shape) { case 1: diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h index 8dc107ba8..e91dedf14 100644 --- a/ml/backend/ggml/ggml/include/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -489,7 +489,6 @@ extern "C" { GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, - GGML_OP_UNPAD, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -1782,15 +1781,6 @@ extern "C" { int p0, int p1); - // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] - GGML_API struct ggml_tensor * ggml_unpad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, - int p1, - int p2, - int p3); - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] diff --git a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp 
b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp index 1487f322f..4e67d243a 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp @@ -178,9 +178,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_CANN register_backend(ggml_backend_cann_reg()); #endif -// #ifdef GGML_USE_BLAS -// register_backend(ggml_backend_blas_reg()); -// #endif +#ifdef GGML_USE_BLAS + register_backend(ggml_backend_blas_reg()); +#endif #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index 3902894ba..2462d2b85 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -1953,10 +1953,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; - case GGML_OP_UNPAD: - { - ggml_compute_forward_unpad(params, tensor); - } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -2280,7 +2276,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: - case GGML_OP_UNPAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 1868a10cd..955fec59a 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -6690,61 +6690,6 @@ void ggml_compute_forward_pad_reflect_1d( } } -// ggml_compute_forward_unpad - -static void ggml_compute_forward_unpad_f32( - const struct ggml_compute_params *params, - struct ggml_tensor *dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT( dst->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - float * dst_ptr = (float *) dst->data; - - // TODO: optimize - - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - - const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - dst_ptr[dst_idx] = *src_ptr; - } - } - } - } - } -} - -void ggml_compute_forward_unpad( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_unpad_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - // ggml_compute_forward_arange static void ggml_compute_forward_arange_f32( diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h index a7125555e..dc081b9e6 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h @@ -72,7 +72,6 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const 
struct ggml_compute_params * params, struct ggml_tensor * dst); -void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 6fe866742..cb0d8528d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2238,9 +2238,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; - case GGML_OP_UNPAD: - ggml_cuda_op_unpad(ctx, dst); - break; case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; @@ -3215,7 +3212,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_PAD: - case GGML_OP_UNPAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu index 7d45a7e19..77432b046 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu @@ -47,49 +47,3 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); } - -static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { - // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 - // blockIdx.y: idx of ne1 - // blockIDx.x: idx of ne0 / BLOCK_SIZE - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - - // operation - int offset_dst = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { - int offset_src = - nidx + - blockIdx.y * ne00 + - blockIdx.z * ne00 * ne01; - dst[offset_dst] = x[offset_src]; - } -} - -static void unpad_f32_cuda(const float * x, float * dst, - const int ne00, const int ne01, const int ne02, const int ne03, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne1, ne2*ne3); - unpad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -} - -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - - unpad_f32_cuda(src0_d, dst_d, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -} \ No newline at end of file diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh index e2ededc3c..8fd386b00 100644 --- 
a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh @@ -3,4 +3,3 @@ #define CUDA_PAD_BLOCK_SIZE 256 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 56fdb3cda..3656c2383 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -5599,51 +5599,6 @@ kernel void kernel_pad_reflect_1d_f32( } } -kernel void kernel_unpad_f32( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); - - if (i1 < ne01 && i2 < ne02 && i3 < ne03) { - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - if (i0 < ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } - } - - return; - } -} - kernel void kernel_arange_f32( device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 7641247eb..1b56f858c 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -347,7 +347,6 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, - GGML_METAL_KERNEL_TYPE_UNPAD_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -1295,7 +1294,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -1657,7 +1655,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: - case GGML_OP_UNPAD: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: @@ -4187,36 +4184,6 @@ static bool ggml_metal_encode_node( const int nth = MIN(1024, 
ne0); - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_UNPAD: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - - const int nth = MIN(1024, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ARANGE: diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 080a943bc..9cfddf450 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -3121,51 +3121,6 @@ kernel void kernel_pad_reflect_1d_f32( } } -kernel void kernel_unpad_f32( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); - - if (i1 < ne01 && i2 < ne02 && i3 < ne03) { - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - if (i0 < ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } - } - - return; - } -} - kernel void kernel_arange_f32( device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c index 6b034d352..8a6546240 100644 --- a/ml/backend/ggml/ggml/src/ggml.c +++ b/ml/backend/ggml/ggml/src/ggml.c @@ -923,7 +923,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "UPSCALE", "PAD", "PAD_REFLECT_1D", - "UNPAD", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -954,7 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { 
"OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1019,7 +1018,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "upscale(x)", "pad(x)", "pad_reflect_1d(x)", - "unpad(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1050,7 +1048,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4276,25 +4274,6 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } -// ggml_unpad - -struct ggml_tensor * ggml_unpad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, int p1, int p2, int p3) { - - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, - a->ne[0] - p0, - a->ne[1] - p1, - a->ne[2] - p2, - a->ne[3] - p3); - - result->op = GGML_OP_UNPAD; - result->src[0] = a; - - return result; -} - // ggml_arange struct ggml_tensor * ggml_arange( diff --git a/model/models/llama4/model_vision.go b/model/models/llama4/model_vision.go index 3bf9cee75..e6b1afef6 100644 --- a/model/models/llama4/model_vision.go +++ b/model/models/llama4/model_vision.go @@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { } hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps) - hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0) + hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0) hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions) return hiddenStates } diff --git a/model/models/mllama/imageproc.go b/model/models/mllama/imageproc.go deleted file mode 100644 index 13f2fb8b3..000000000 --- a/model/models/mllama/imageproc.go +++ /dev/null @@ -1,201 +0,0 @@ -package mllama - -import ( - "fmt" - "image" - _ "image/jpeg" - _ "image/png" - "io" - "math" - "slices" - - "golang.org/x/image/draw" - - "github.com/ollama/ollama/model/imageproc" -) - -func getSupportedAspectRatios(maxTiles int) []image.Point { - ratios := []image.Point{} - - for w := range maxTiles { - for h := range maxTiles { - if (w+1)*(h+1) <= maxTiles { - ratios = append(ratios, image.Point{w + 1, h + 1}) - } - } - } - - return ratios -} - -func clip(a, a_min, a_max int) int { - if a < a_min { - return a_min - } else if a > a_max { - return a_max - } - - return a -} - -func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := getSupportedAspectRatios(maxImageTiles) - possibleCanvasSizes := []image.Point{} - for _, pta := range possibleTileArrangements { - possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) - } - - scales := []float64{} - - for _, pcs := range possibleCanvasSizes { - scaleHeight := float64(pcs.Y) / float64(imageSize.Y) - scaleWidth := float64(pcs.X) / float64(imageSize.X) - - if scaleWidth > scaleHeight { - scales = append(scales, scaleHeight) - } else { - scales = append(scales, scaleWidth) - } - } - - var minUpscale float64 - var maxDownscale float64 - var upscale bool - - for _, s := range scales { - if s > 1.0 { - upscale = true - if minUpscale == 0 { - minUpscale = s - } else { - minUpscale = math.Min(minUpscale, s) - } - } else { - maxDownscale = math.Max(maxDownscale, s) - } - } - - selectedScale := 
maxDownscale - if upscale { - selectedScale = minUpscale - } - - var selectedCanvas image.Point - for n, pcs := range possibleCanvasSizes { - if scales[n] == selectedScale { - // choose the smallest possible canvas - if selectedCanvas.X == 0 && selectedCanvas.Y == 0 { - selectedCanvas = pcs - } else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y { - selectedCanvas = pcs - } - } - } - return selectedCanvas -} - -func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - - var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } - - return image.Point{w, h} -} - -func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { - if format == "png" { - img = imageproc.Composite(img) - } - - b := img.Bounds() - tileSize := outputSize.Y - - canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) - - return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio -} - -func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { - paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, - } - - dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) - draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) - - return dst -} - -func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { - b := img.Bounds() - width := b.Max.X - b.Min.X - height := b.Max.Y - b.Min.Y - tileHeight := height / numTilesSize.Y - tileWidth := width / numTilesSize.X - - images := []image.Image{} - - for h := range numTilesSize.Y { - for w := range numTilesSize.X { - rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) - images = append(images, img.(interface { - SubImage(image.Rectangle) image.Image - }).SubImage(rect)) - } - } - - return images -} - -func packImages(img image.Image, aspectRatio image.Point) []float32 { - subImages := splitToTiles(img, aspectRatio) - - var pixelVals []float32 - - rescale := true - channelFirst := true - - for _, subImg := range subImages { - vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst) - pixelVals = append(pixelVals, vals...) 
- } - - return pixelVals -} - -func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { - outputSize := image.Point{560, 560} - maxTiles := 4 - - img, format, err := image.Decode(imageData) - if err != nil { - return nil, nil, fmt.Errorf("failed to decode image: %w", err) - } - - newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles) - newImage = padImage(newImage, outputSize, aspectRatio) - - data := packImages(newImage, aspectRatio) - aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1 - - opts := map[string]any{ - "aspectRatioIndex": aspectRatioIndex, - } - - return data, opts, nil -} diff --git a/model/models/mllama/imageproc_test.go b/model/models/mllama/imageproc_test.go deleted file mode 100644 index a14b91bd1..000000000 --- a/model/models/mllama/imageproc_test.go +++ /dev/null @@ -1,420 +0,0 @@ -package mllama - -import ( - "bytes" - "image" - "image/png" - "testing" - - "github.com/google/go-cmp/cmp" -) - -func TestAspectRatios(t *testing.T) { - type aspectCase struct { - MaxTiles int - Expected []image.Point - } - - cases := []aspectCase{ - { - MaxTiles: 1, - Expected: []image.Point{{1, 1}}, - }, - { - MaxTiles: 2, - Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}}, - }, - { - MaxTiles: 3, - Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}}, - }, - { - MaxTiles: 4, - Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}}, - }, - } - - for _, c := range cases { - actual := getSupportedAspectRatios(c.MaxTiles) - - if diff := cmp.Diff(actual, c.Expected); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - } -} - -func TestGetImageSizeFitToCanvas(t *testing.T) { - type imageSizeCase struct { - ImageRect image.Point - CanvasRect image.Point - TileSize int - Expected image.Point - } - - cases := []imageSizeCase{ - { - ImageRect: image.Point{400, 400}, - CanvasRect: image.Point{640, 480}, - TileSize: 200, - Expected: image.Point{400, 400}, - }, - { - ImageRect: image.Point{1024, 768}, - CanvasRect: image.Point{640, 480}, - TileSize: 200, - Expected: image.Point{640, 480}, - }, - { - ImageRect: image.Point{500, 500}, - CanvasRect: image.Point{1000, 1000}, - TileSize: 750, - Expected: image.Point{750, 750}, - }, - { - ImageRect: image.Point{500, 1000}, - CanvasRect: image.Point{2000, 2000}, - TileSize: 2000, - Expected: image.Point{1000, 2000}, - }, - { - ImageRect: image.Point{4000, 3000}, - CanvasRect: image.Point{2000, 1000}, - TileSize: 1000, - Expected: image.Point{1333, 1000}, - }, - { - ImageRect: image.Point{667, 1000}, - CanvasRect: image.Point{1000, 1000}, - TileSize: 560, - Expected: image.Point{667, 1000}, - }, - } - - for _, c := range cases { - actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize) - - if actual != c.Expected { - t.Errorf("incorrect image rect: '%#v'. 
expected: '%#v'", actual, c.Expected) - } - } -} - -func TestGetOptimalTiledCanvas(t *testing.T) { - type tiledCanvasSizeCase struct { - ImageSize image.Point - MaxImageTiles int - TileSize int - Expected image.Point - } - - cases := []tiledCanvasSizeCase{ - { - ImageSize: image.Point{1024, 768}, - MaxImageTiles: 4, - TileSize: 1000, - Expected: image.Point{2000, 1000}, - }, - { - ImageSize: image.Point{1024, 768}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - { - ImageSize: image.Point{800, 600}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - { - ImageSize: image.Point{640, 480}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 560}, - }, - { - ImageSize: image.Point{320, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 560}, - }, - { - ImageSize: image.Point{1320, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1680, 560}, - }, - { - ImageSize: image.Point{2000, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{2240, 560}, - }, - { - ImageSize: image.Point{10000, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{2240, 560}, - }, - { - ImageSize: image.Point{480, 640}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 1120}, - }, - { - ImageSize: image.Point{200, 320}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 560}, - }, - { - ImageSize: image.Point{200, 1320}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 1680}, - }, - { - ImageSize: image.Point{200, 2000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 2240}, - }, - { - ImageSize: image.Point{200, 10000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 2240}, - }, - { - ImageSize: image.Point{10000, 10000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - } - - for _, c := range cases { - actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize) - - if actual != c.Expected { - t.Errorf("incorrect tiled canvas: '%#v'. 
expected: '%#v'", actual, c.Expected) - } - } -} - -func TestSplitToTiles(t *testing.T) { - type splitCase struct { - TestImage image.Image - NumTilesSize image.Point - Expected []image.Image - } - - cases := []splitCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - NumTilesSize: image.Point{1, 1}, - Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)), - NumTilesSize: image.Point{2, 1}, - Expected: []image.Image{ - image.NewRGBA(image.Rect(0, 0, 500, 500)), - image.NewRGBA(image.Rect(500, 0, 1000, 500)), - }, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)), - NumTilesSize: image.Point{2, 2}, - Expected: []image.Image{ - image.NewRGBA(image.Rect(0, 0, 500, 500)), - image.NewRGBA(image.Rect(500, 0, 1000, 500)), - image.NewRGBA(image.Rect(0, 500, 500, 1000)), - image.NewRGBA(image.Rect(500, 500, 1000, 1000)), - }, - }, - } - - for _, c := range cases { - actual := splitToTiles(c.TestImage, c.NumTilesSize) - - if len(actual) != len(c.Expected) { - t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected)) - } - - for i := range actual { - if actual[i].Bounds() != c.Expected[i].Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds()) - } - } - } -} - -func TestResize(t *testing.T) { - type resizeCase struct { - TestImage image.Image - OutputSize image.Point - MaxImageTiles int - ExpectedImage image.Image - ExpectedAspectRatio image.Point - } - - cases := []resizeCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), - OutputSize: image.Point{100, 100}, - MaxImageTiles: 1, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), - OutputSize: image.Point{100, 100}, - MaxImageTiles: 2, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)), - ExpectedAspectRatio: image.Point{2, 2}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - ExpectedAspectRatio: image.Point{2, 2}, - }, - } - - for _, c := range cases { - actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles) - - if actualImage.Bounds() != c.ExpectedImage.Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds()) - } - - if actualAspectRatio != c.ExpectedAspectRatio { - t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio) - } - } -} - -func TestPad(t *testing.T) { - type padCase struct { - TestImage image.Image - OutputSize image.Point - AspectRatio image.Point - Expected image.Image - } - - cases := []padCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)), - OutputSize: image.Point{560, 560}, - AspectRatio: image.Point{2, 2}, - Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), 
- }, - } - - for _, c := range cases { - actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio) - - if actual.Bounds() != c.Expected.Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds()) - } - } -} - -func TestPackImages(t *testing.T) { - type packCase struct { - TestImage image.Image - AspectRatio image.Point - ExpectedVals int - } - - cases := []packCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), - AspectRatio: image.Point{2, 2}, - ExpectedVals: 2 * 2 * 3 * 560 * 560, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - AspectRatio: image.Point{1, 1}, - ExpectedVals: 1 * 1 * 3 * 560 * 560, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)), - AspectRatio: image.Point{1, 2}, - ExpectedVals: 1 * 2 * 3 * 560 * 560, - }, - } - - for _, c := range cases { - actualVals := packImages(c.TestImage, c.AspectRatio) - if len(actualVals) != c.ExpectedVals { - t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals) - } - } -} - -func TestPreprocess(t *testing.T) { - type preprocessCase struct { - TestImage image.Image - ExpectedVals int - ExpectedAspectRatioID int - } - - cases := []preprocessCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), - ExpectedVals: 0, - ExpectedAspectRatioID: 1, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - ExpectedVals: 0, - ExpectedAspectRatioID: 6, - }, - } - - for _, c := range cases { - var buf bytes.Buffer - err := png.Encode(&buf, c.TestImage) - if err != nil { - t.Fatal(err) - } - - imgData, opts, err := Preprocess(&buf) - if err != nil { - t.Fatalf("error processing: %q", err) - } - - if len(imgData) == 0 { - t.Errorf("no image data returned") - } - - ar, ok := opts["aspectRatioIndex"] - if !ok { - t.Fatalf("no aspect ratio found") - } - - aspectRatioID := ar.(int) - - if aspectRatioID != c.ExpectedAspectRatioID { - t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID) - } - } -} diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 3fa26dedb..93b443ef1 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -2,11 +2,7 @@ package mllama import ( "bytes" - "encoding/binary" - "fmt" - "hash/fnv" "image" - "slices" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -34,10 +30,6 @@ const ( ) func New(c fs.Config) (model.Model, error) { - // Verify unified config - if c.Uint("vision.block_count") == 0 { - return nil, fmt.Errorf("non-unified vision model not supported") - } m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), @@ -76,22 +68,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er return nil, err } - f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image) + f32s, ratio, err := m.ImageProcessor.ProcessImage(image) if err != nil { return nil, err } - pixelValues, err := ctx.Input().FromFloatSlice(f32s, - m.ImageProcessor.imageSize, - m.ImageProcessor.imageSize, - m.ImageProcessor.numChannels, - m.ImageProcessor.maxNumTiles, - ) + pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles()) if err != nil { return nil, err } - aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1) + pixelValues = 
pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles()) + + aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1) if err != nil { return nil, err } @@ -102,41 +91,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { - var images []input.Input - fnvHash := fnv.New64a() - for i := range inputs { - if inputs[i].Multimodal == nil { - if len(images) > 0 { - inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)} - inputs[i].MultimodalHash = images[0].MultimodalHash - for j := 1; j < len(images); j++ { - inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor)) - fnvHash.Reset() - binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash) - binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash) - inputs[i].MultimodalHash = fnvHash.Sum64() - } - images = nil - } - } else { - images = append(images, inputs[i]) - inputs[i].Token = -1 + if inputs[i].Multimodal != nil { + inputs[i].Token = 128256 // <|image|> } } - inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 }) - return inputs, nil } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var crossAttentionStates ml.Tensor if len(batch.Multimodal) > 0 { - images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor) - if len(images) > 0 { - crossAttentionStates = images[len(images)-1] - } + crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor) } positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) @@ -150,7 +117,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } // TODO: attention mask, cross attention mask - return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil + return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil } func init() { diff --git a/model/models/mllama/model_text.go b/model/models/mllama/model_text.go index 490eb696c..9bd414afc 100644 --- a/model/models/mllama/model_text.go +++ b/model/models/mllama/model_text.go @@ -18,7 +18,7 @@ type TextSelfAttention struct { RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` } -func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { batchSize := hiddenState.Dim(1) headDim := opts.hiddenSize / opts.numHeads ropeType := uint32(0) @@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct { MLP *TextMLP } -func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { residual := hiddenState hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) - hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts) + hiddenState = d.SelfAttention.Forward(ctx, 
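// Sketch (not part of the patch): in the EncodeMultimodal hunk above, the pixel-value tensor is
// now sized by the image's actual tile count (ratio.numTiles()) and then padded along the tile
// dimension up to maxNumTiles, instead of always allocating maxNumTiles tiles. The same idea on a
// flat []float32, with made-up sizes standing in for the real 560x560x3 tiles:
package main

import "fmt"

func main() {
	const (
		imageSize   = 4 // stand-in for the real 560
		numChannels = 3
		maxNumTiles = 4
	)
	numTiles := 2 // e.g. a 2x1 aspect ratio -> ratio.numTiles() == 2

	// values produced by the image processor: one tile after another
	packed := make([]float32, imageSize*imageSize*numChannels*numTiles)

	// zero-pad the trailing tile dimension so downstream code always sees maxNumTiles tiles,
	// mirroring pixelValues.Pad(ctx, 0, 0, 0, maxNumTiles-ratio.numTiles())
	padded := append(packed, make([]float32, imageSize*imageSize*numChannels*(maxNumTiles-numTiles))...)

	fmt.Println(len(packed), len(padded)) // 96 192
}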
hiddenState, positions, cache, opts) // In the final layer (outputs != nil), optimize by pruning to just the token positions // we need logits for. @@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct { MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"` } -func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { residual := hiddenState hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) @@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, } type TextDecoderLayer interface { - Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor + Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor } type TextDecoder struct { Layers []TextDecoderLayer } -func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { for i, layer := range d.Layers { layerType := selfAttentionLayer if slices.Contains(opts.crossAttentionLayers, int32(i)) { @@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, lastLayerOutputs = outputs } - hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts) + hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts) } } @@ -214,9 +214,9 @@ type TextModel struct { *TextModelOptions } -func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { +func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs) - hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) + hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) return m.Output.Forward(ctx, hiddenState) } diff --git a/model/models/mllama/model_vision.go b/model/models/mllama/model_vision.go index bd3d150a3..77ea53731 100644 --- a/model/models/mllama/model_vision.go +++ b/model/models/mllama/model_vision.go @@ -15,7 +15,7 @@ type VisionSelfAttention struct { Query *nn.Linear `gguf:"attn_q"` Key *nn.Linear `gguf:"attn_k"` Value *nn.Linear `gguf:"attn_v"` - Output *nn.Linear `gguf:"attn_out"` + Output *nn.Linear `gguf:"attn_output"` Gate 
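// Sketch (not part of the patch): the TextDecoder above keeps one flat list of layers and decides
// per index whether a layer runs self-attention or cross-attention, using the
// cross_attention_layers metadata and slices.Contains. A minimal stand-alone illustration -- the
// index list here is invented for the example, not taken from any real model config:
package main

import (
	"fmt"
	"slices"
)

func main() {
	crossAttentionLayers := []int32{3, 8, 13} // hypothetical indices
	numLayers := 16

	for i := 0; i < numLayers; i++ {
		layerType := "self-attention"
		if slices.Contains(crossAttentionLayers, int32(i)) {
			layerType = "cross-attention"
		}
		fmt.Printf("layer %2d: %s\n", i, layerType)
	}
}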
ml.Tensor `gguf:"attn_gate"` } @@ -45,36 +45,29 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) hiddenState = sa.Output.Forward(ctx, attention) - if sa.Gate != nil { - hiddenState = hiddenState.Mul(ctx, sa.Gate) - } - return hiddenState } type VisionMLP struct { - Down *nn.Linear `gguf:"ffn_down"` Up *nn.Linear `gguf:"ffn_up"` - - Gate ml.Tensor `gguf:"ffn_gate"` + Down *nn.Linear `gguf:"ffn_down"` } func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { - hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx) - hiddenState = mlp.Up.Forward(ctx, hiddenState) - if mlp.Gate != nil { - hiddenState = hiddenState.Mul(ctx, mlp.Gate) - } + hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx) + hiddenState = mlp.Down.Forward(ctx, hiddenState) return hiddenState } type VisionEncoderLayer struct { - AttentionNorm *nn.LayerNorm `gguf:"ln1"` + AttentionNorm *nn.LayerNorm `gguf:"attn_norm"` SelfAttention *VisionSelfAttention + AttentionGate ml.Tensor `gguf:"attn_gate"` - MLPNorm *nn.LayerNorm `gguf:"ln2"` + MLPNorm *nn.LayerNorm `gguf:"ffn_norm"` MLP *VisionMLP + MLPGate ml.Tensor `gguf:"ffn_gate"` } func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { @@ -83,13 +76,22 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts // self attention hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts) + + if e.AttentionGate != nil { + hiddenState = hiddenState.Mul(ctx, e.AttentionGate) + } hiddenState = hiddenState.Add(ctx, residual) residual = hiddenState // feed forward hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.MLP.Forward(ctx, hiddenState, opts) - return hiddenState.Add(ctx, residual) + hiddenState = hiddenState.Add(ctx, residual) + if e.MLPGate != nil { + hiddenState = hiddenState.Mul(ctx, e.MLPGate) + } + + return hiddenState } type VisionEncoder struct { @@ -114,9 +116,9 @@ type PrecomputedAspectRatioEmbedding struct { Gate ml.Tensor `gguf:"gate"` } -func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor { +func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor { embeddings := e.Embedding.Forward(ctx, aspectRatioIDs) - embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles) + embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles) if e.Gate != nil { embeddings = embeddings.Mul(ctx, e.Gate) } @@ -132,7 +134,7 @@ type PrecomputedPositionEmbedding struct { TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"` } -func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor { +func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor { positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs) if e.PositionEmbeddingGate != nil { positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate) @@ -141,7 +143,7 @@ func (e *PrecomputedPositionEmbedding) 
Forward(ctx ml.Context, hiddenState, posi hiddenState = hiddenState.Add(ctx, positionEmbedding) tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs) - tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles) + tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles) if e.TilePositionEmbeddingGate != nil { tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate) } @@ -150,9 +152,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi } type VisionModelOptions struct { - hiddenSize, numHeads, numTiles int - imageSize, patchSize int - eps float32 + hiddenSize, numHeads int + imageSize, patchSize int + eps float32 intermediateLayersIndices []int32 } @@ -181,14 +183,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa numPositions++ } + numTiles := pixelValues.Dim(3) + hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1) - hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles) + hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) - hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) - hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1) + hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions) + hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1) - hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions) + hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions) hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps) numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8 @@ -199,18 +203,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize) hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions) hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...) 
- hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0) + hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0) return hiddenState.Concat(ctx, hiddenStates, 0) } @@ -222,7 +226,6 @@ func newVisionModel(c fs.Config) *VisionModel { VisionModelOptions: &VisionModelOptions{ hiddenSize: int(c.Uint("vision.embedding_length")), numHeads: int(c.Uint("vision.attention.head_count")), - numTiles: int(c.Uint("vision.max_num_tiles")), imageSize: int(c.Uint("vision.image_size")), patchSize: int(c.Uint("vision.patch_size")), diff --git a/model/models/mllama/process_image.go b/model/models/mllama/process_image.go index 1b0506d32..8e60508ff 100644 --- a/model/models/mllama/process_image.go +++ b/model/models/mllama/process_image.go @@ -2,17 +2,31 @@ package mllama import ( "image" - "image/color" "math" "slices" "golang.org/x/image/draw" "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/model/imageproc" ) +type supportedAspectRatio struct { + rank, width, height int +} + +func (a supportedAspectRatio) Point() image.Point { + return image.Point{a.width, a.height} +} + +func (a supportedAspectRatio) numTiles() int { + return a.width * a.height +} + type ImageProcessor struct { imageSize, numChannels, maxNumTiles int + + mean, std [3]float32 } func newImageProcessor(c fs.Config) ImageProcessor { @@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor { imageSize: int(c.Uint("vision.image_size")), numChannels: int(c.Uint("vision.num_channels")), maxNumTiles: int(c.Uint("vision.max_num_tiles")), + + mean: imageproc.ClipDefaultMean, + std: imageproc.ClipDefaultSTD, } } -func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point { - ratios := []image.Point{} - - for w := range maxTiles { - for h := range maxTiles { - if (w+1)*(h+1) <= maxTiles { - ratios = append(ratios, image.Point{w + 1, h + 1}) - } +func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) { + for w := 1; w <= p.maxNumTiles; w++ { + for h := 1; h <= p.maxNumTiles/w; h++ { + ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h}) } } - return ratios } -func (p *ImageProcessor) clip(a, a_min, a_max int) int { - if a < a_min { - return a_min - } else if a > a_max { - return a_max - } +func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point { + tw := min(max(imageSize.X, p.imageSize), canvasSize.X) + th := min(max(imageSize.Y, p.imageSize), canvasSize.Y) - return a -} + r := math.Min( + float64(tw)/float64(imageSize.X), + float64(th)/float64(imageSize.Y), + ) -func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - 
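// Sketch (not part of the patch): the rewritten supportedAspectRatios() above enumerates every
// (width, height) tile arrangement whose tile count fits within maxNumTiles and gives it a 1-based
// rank. Running just that loop stand-alone for maxNumTiles = 4 reproduces the table the new
// TestSupportedAspectRatios further down expects:
package main

import "fmt"

type supportedAspectRatio struct {
	rank, width, height int
}

func main() {
	maxNumTiles := 4

	var ratios []supportedAspectRatio
	for w := 1; w <= maxNumTiles; w++ {
		for h := 1; h <= maxNumTiles/w; h++ {
			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
		}
	}

	for _, r := range ratios {
		fmt.Printf("rank %d: %dx%d (%d tiles)\n", r.rank, r.width, r.height, r.width*r.height)
	}
	// rank 1: 1x1, rank 2: 1x2, ..., rank 6: 2x2, ..., rank 8: 4x1
}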
- var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } + w := min(int(math.Floor(float64(imageSize.X)*r)), tw) + h := min(int(math.Floor(float64(imageSize.Y)*r)), th) return image.Point{w, h} } -func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := p.supportedAspectRatios(maxImageTiles) - possibleCanvasSizes := []image.Point{} - for _, pta := range possibleTileArrangements { - possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) +func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point { + possibleTileArrangements := p.supportedAspectRatios() + possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements)) + for i, pta := range possibleTileArrangements { + possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize} } - scales := []float64{} - - for _, pcs := range possibleCanvasSizes { - scaleHeight := float64(pcs.Y) / float64(imageSize.Y) - scaleWidth := float64(pcs.X) / float64(imageSize.X) - - if scaleWidth > scaleHeight { - scales = append(scales, scaleHeight) - } else { - scales = append(scales, scaleWidth) - } + scales := make([]float64, len(possibleCanvasSizes)) + for i, pcs := range possibleCanvasSizes { + scales[i] = min( + float64(pcs.Y)/float64(imageSize.Y), + float64(pcs.X)/float64(imageSize.X), + ) } var minUpscale float64 @@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles return selectedCanvas } -func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { +func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { b := img.Bounds() width := b.Max.X - b.Min.X height := b.Max.Y - b.Min.Y tileHeight := height / numTilesSize.Y tileWidth := width / numTilesSize.X - images := []image.Image{} + images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X) for h := range numTilesSize.Y { for w := range numTilesSize.X { rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) - images = append(images, img.(interface { + if subImg, ok := img.(interface { SubImage(image.Rectangle) image.Image - }).SubImage(rect)) + }); ok { + images = append(images, subImg.SubImage(rect)) + } else { + // Handle the case where img does not implement SubImage + // This is a fallback and may not be efficient + newImg := image.NewRGBA(rect) + draw.Draw(newImg, rect, img, rect.Min, draw.Src) + images = append(images, newImg) + } } } return images } -// remove the "alpha" channel by drawing over a prefilled image -// -//nolint:unused -func (p *ImageProcessor) compositeImage(img image.Image) image.Image { - dst := image.NewRGBA(img.Bounds()) - - white := color.RGBA{255, 255, 255, 255} - draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src) - draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over) - - return dst -} - -func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { +func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) { b := img.Bounds() - tileSize := outputSize.Y - canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := 
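// Sketch (not part of the patch): a worked example of the new fitToCanvas() above. For a 2560x1920
// source with 560-pixel tiles and a 1120x1120 canvas (the 2x2 arrangement the optimal-canvas step
// selects for that image in the resize test further down), the image scales to 1120x840 before
// being padded out to the full canvas:
package main

import (
	"fmt"
	"image"
	"math"
)

// fitToCanvas mirrors the arithmetic of ImageProcessor.fitToCanvas in the hunk above,
// with the tile size passed explicitly instead of read from the processor.
func fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	tw := min(max(imageSize.X, tileSize), canvasSize.X)
	th := min(max(imageSize.Y, tileSize), canvasSize.Y)

	r := math.Min(
		float64(tw)/float64(imageSize.X),
		float64(th)/float64(imageSize.Y),
	)

	w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
	h := min(int(math.Floor(float64(imageSize.Y)*r)), th)

	return image.Point{w, h}
}

func main() {
	fmt.Println(fitToCanvas(image.Point{2560, 1920}, image.Point{1120, 1120}, 560))
	// (1120,840): the image keeps its aspect ratio and fills the canvas width
}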
image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := p.fitToCanvas(b.Max, canvasSize, tileSize) + canvasSize := p.optimalTiledCanvas(b.Max) + aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize} + newSize := p.fitToCanvas(b.Max, canvasSize) dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) @@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag return dst, aspectRatio } -func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image { +func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image { paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, + X: p.imageSize * aspectRatio.X, + Y: p.imageSize * aspectRatio.Y, } dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) @@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin return dst } -func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { +func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 { subImages := p.splitToTiles(img, aspectRatio) var pixelVals []float32 @@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st gVal := float32(g>>8) / 255.0 bVal := float32(b>>8) / 255.0 - rVal = (rVal - mean[0]) / std[0] - gVal = (gVal - mean[1]) / std[1] - bVal = (bVal - mean[2]) / std[2] + rVal = (rVal - p.mean[0]) / p.std[0] + gVal = (gVal - p.mean[1]) / p.std[1] + bVal = (bVal - p.mean[2]) / p.std[2] rVals = append(rVals, rVal) gVals = append(gVals, gVal) @@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st return pixelVals } -func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) { - outputSize := image.Point{p.imageSize, p.imageSize} +func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) { + newImage, newImageRatio := p.resize(img) + newImage = p.pad(newImage, newImageRatio) + pixelValues := p.pack(newImage, newImageRatio) - // clip values - mean := [3]float32{0.48145466, 0.4578275, 0.40821073} - std := [3]float32{0.26862954, 0.26130258, 0.27577711} + supportedAspectRatios := p.supportedAspectRatios() + aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool { + return i.width == newImageRatio.X && i.height == newImageRatio.Y + }) - newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles) - newImage = p.pad(newImage, outputSize, aspectRatio) - - data := p.pack(newImage, aspectRatio, mean, std) - aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1 - return data, aspectRatioIndex, nil + return pixelValues, supportedAspectRatios[aspectRatioID], nil } diff --git a/model/models/mllama/process_image_test.go b/model/models/mllama/process_image_test.go new file mode 100644 index 000000000..a9669b182 --- /dev/null +++ b/model/models/mllama/process_image_test.go @@ -0,0 +1,387 @@ +package mllama + +import ( + "image" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestSupportedAspectRatios(t *testing.T) { + cases := []struct { + p ImageProcessor + want []supportedAspectRatio + }{ + { + p: ImageProcessor{maxNumTiles: 1}, + want: []supportedAspectRatio{ + {1, 1, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 2}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 
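// Sketch (not part of the patch): pack() above stores each tile channel-planar (all R values, then
// G, then B) and normalizes with the CLIP mean/std that now live on the ImageProcessor
// (imageproc.ClipDefaultMean / ClipDefaultSTD). The same normalization applied to one pixel, using
// the constants visible in the removed code above:
package main

import "fmt"

func main() {
	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
	std := [3]float32{0.26862954, 0.26130258, 0.27577711}

	// an 8-bit mid-grey pixel, scaled to [0, 1] the same way pack() does
	pixel := [3]float32{128.0 / 255.0, 128.0 / 255.0, 128.0 / 255.0}

	var normalized [3]float32
	for c := range pixel {
		normalized[c] = (pixel[c] - mean[c]) / std[c]
	}
	fmt.Println(normalized) // roughly [0.08 0.17 0.34]
}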
1, 2}, + {3, 2, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 3}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, + {4, 2, 1}, + {5, 3, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 4}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, + {4, 1, 4}, + {5, 2, 1}, + {6, 2, 2}, + {7, 3, 1}, + {8, 4, 1}, + }, + }, + } + + for _, tt := range cases { + actual := tt.p.supportedAspectRatios() + if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestFitToCanvas(t *testing.T) { + cases := []struct { + p ImageProcessor + image image.Point + canvas image.Point + expect image.Point + }{ + { + p: ImageProcessor{imageSize: 200}, + image: image.Point{400, 400}, + canvas: image.Point{640, 480}, + expect: image.Point{400, 400}, + }, + { + p: ImageProcessor{imageSize: 200}, + image: image.Point{1024, 768}, + canvas: image.Point{640, 480}, + expect: image.Point{640, 480}, + }, + { + p: ImageProcessor{imageSize: 750}, + image: image.Point{500, 500}, + canvas: image.Point{1000, 1000}, + expect: image.Point{750, 750}, + }, + { + p: ImageProcessor{imageSize: 2000}, + image: image.Point{500, 1000}, + canvas: image.Point{2000, 2000}, + expect: image.Point{1000, 2000}, + }, + { + p: ImageProcessor{imageSize: 1000}, + image: image.Point{4000, 3000}, + canvas: image.Point{2000, 1000}, + expect: image.Point{1333, 1000}, + }, + { + p: ImageProcessor{imageSize: 560}, + image: image.Point{667, 1000}, + canvas: image.Point{1000, 1000}, + expect: image.Point{667, 1000}, + }, + } + + for _, tt := range cases { + actual := tt.p.fitToCanvas(tt.image, tt.canvas) + if diff := cmp.Diff(actual, tt.expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestOptimalTiledCanvas(t *testing.T) { + cases := []struct { + p ImageProcessor + image image.Point + expect image.Point + }{ + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 1000}, + image: image.Point{1024, 768}, + expect: image.Point{2000, 1000}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{1024, 768}, + expect: image.Point{1120, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{800, 600}, + expect: image.Point{1120, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{640, 480}, + expect: image.Point{1120, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{320, 200}, + expect: image.Point{560, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{1320, 200}, + expect: image.Point{1680, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{2000, 200}, + expect: image.Point{2240, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{10000, 200}, + expect: image.Point{2240, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{480, 640}, + expect: image.Point{560, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 320}, + expect: image.Point{560, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 1320}, + expect: image.Point{560, 1680}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 2000}, + expect: image.Point{560, 2240}, + }, + { + p: ImageProcessor{maxNumTiles: 4, 
imageSize: 560}, + image: image.Point{200, 10000}, + expect: image.Point{560, 2240}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{10000, 10000}, + expect: image.Point{1120, 1120}, + }, + } + + for _, tt := range cases { + actual := tt.p.optimalTiledCanvas(tt.image) + if diff := cmp.Diff(actual, tt.expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestSplitToTiles(t *testing.T) { + cases := []struct { + imageMax image.Point + numTiles image.Point + expect []image.Image + }{ + { + imageMax: image.Point{1024, 768}, + numTiles: image.Point{1, 1}, + expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))}, + }, + { + imageMax: image.Point{1000, 500}, + numTiles: image.Point{2, 1}, + expect: []image.Image{ + image.NewRGBA(image.Rect(0, 0, 500, 500)), + image.NewRGBA(image.Rect(500, 0, 1000, 500)), + }, + }, + { + imageMax: image.Point{1000, 1000}, + numTiles: image.Point{2, 2}, + expect: []image.Image{ + image.NewRGBA(image.Rect(0, 0, 500, 500)), + image.NewRGBA(image.Rect(500, 0, 1000, 500)), + image.NewRGBA(image.Rect(0, 500, 500, 1000)), + image.NewRGBA(image.Rect(500, 500, 1000, 1000)), + }, + }, + } + + var p ImageProcessor + + for _, tt := range cases { + actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles) + + if len(actual) != len(tt.expect) { + t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect)) + } + + for i := range actual { + if actual[i].Bounds() != tt.expect[i].Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds()) + } + } + } +} + +func TestResize(t *testing.T) { + cases := []struct { + p ImageProcessor + imageMax image.Point + expectImage image.Image + expectAspectRatio image.Point + }{ + { + p: ImageProcessor{maxNumTiles: 1, imageSize: 100}, + imageMax: image.Point{200, 200}, + expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 2, imageSize: 100}, + imageMax: image.Point{200, 200}, + expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{10, 10}, + expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{2560, 1920}, + expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)), + expectAspectRatio: image.Point{2, 2}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{1024, 768}, + expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), + expectAspectRatio: image.Point{2, 2}, + }, + } + + for _, tt := range cases { + actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax}) + + if actualImage.Bounds() != tt.expectImage.Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds()) + } + + if actualAspectRatio != tt.expectAspectRatio { + t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio) + } + } +} + +func TestPad(t *testing.T) { + cases := []struct { + p ImageProcessor + imageMax image.Point + aspectRatio image.Point + expect image.Image + }{ + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{1000, 667}, + aspectRatio: image.Point{2, 2}, + expect: 
image.NewRGBA(image.Rect(0, 0, 1120, 1120)), + }, + } + + for _, tt := range cases { + actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio) + + if actual.Bounds() != tt.expect.Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds()) + } + } +} + +func TestPackImages(t *testing.T) { + cases := []struct { + imageMax image.Point + aspectRatio image.Point + expectVals int + }{ + { + imageMax: image.Point{1120, 1120}, + aspectRatio: image.Point{2, 2}, + expectVals: 2 * 2 * 3 * 560 * 560, + }, + { + imageMax: image.Point{560, 560}, + aspectRatio: image.Point{1, 1}, + expectVals: 1 * 1 * 3 * 560 * 560, + }, + { + imageMax: image.Point{1120, 560}, + aspectRatio: image.Point{1, 2}, + expectVals: 1 * 2 * 3 * 560 * 560, + }, + } + + for _, tt := range cases { + var p ImageProcessor + actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio) + if len(actualVals) != tt.expectVals { + t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals) + } + } +} + +func TestPreprocess(t *testing.T) { + cases := []struct { + imageMax image.Point + expectAspectRatioID int + }{ + { + imageMax: image.Point{10, 10}, + expectAspectRatioID: 1, + }, + { + imageMax: image.Point{1024, 768}, + expectAspectRatioID: 6, + }, + } + + p := ImageProcessor{imageSize: 560, maxNumTiles: 4} + for _, tt := range cases { + img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax})) + if err != nil { + t.Fatalf("error processing: %q", err) + } + + if len(img) == 0 { + t.Errorf("no image data returned") + } + + if aspectRatio.rank != tt.expectAspectRatioID { + t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID) + } + } +} diff --git a/runner/llamarunner/image.go b/runner/llamarunner/image.go index e7e30a4d8..1d0c1a4f5 100644 --- a/runner/llamarunner/image.go +++ b/runner/llamarunner/image.go @@ -5,7 +5,6 @@ import ( "fmt" "hash/maphash" "log/slog" - "slices" "sync" "time" @@ -18,8 +17,7 @@ type ImageContext struct { // mu is required to be held when generating embeddings or accessing the cache mu sync.Mutex - clip *llama.ClipContext - mllama *llama.MllamaContext + clip *llama.ClipContext // cache of images to embeddings images []imageCache @@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte var c ImageContext if arch == "clip" { c.clip, err = llama.NewClipContext(llamaContext, modelPath) - } else if arch == "mllama" { - c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath) } else { return nil, fmt.Errorf("unknown vision model architecture: %s", arch) } @@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) { if c.clip != nil { c.clip.Free() } - if c.mllama != nil { - c.mllama.Free() - } } -func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) { +func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) { if c == nil { return nil, nil } @@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect embed, err := c.findImage(hash) if err != nil { - if c.mllama != nil { - embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId) - if err != nil { - return nil, err - } - } else if c.clip != nil { + if c.clip != nil { embed, err = c.clip.NewEmbed(llamaContext, data) if err != nil { return nil, err @@ -105,33 +93,11 @@ func (c *ImageContext) 
BatchSize(configuredBatchSize int) int { return 0 } - // Mllama maps an image to 1 embedding token (llava creates many tokens) - // and doesn't support more than a single image per request. - // The embeddings are large (100 MB), so allocating a big batch can fail - // on some systems - if c.mllama != nil { - return 1 - } - return configuredBatchSize } func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int { - if c != nil && c.mllama != nil { - return c.mllama.EmbedSize(llamaContext) - } else { - return llamaContext.Model().NEmbd() - } -} - -func (c *ImageContext) NeedCrossAttention(inputs ...input) bool { - if c == nil || c.mllama == nil { - return false - } - - return slices.ContainsFunc(inputs, func(input input) bool { - return input.embed != nil - }) + return llamaContext.Model().NEmbd() } type imageCache struct { diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index 73e50ee0b..7aa9b96a2 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -57,10 +57,6 @@ type Sequence struct { // input cache being used by this sequence cache *InputCacheSlot - // does this sequence require cross-attention layers to be processed? - if we have seen - // an image for certain multi-modal models - crossAttention bool - // channel to send responses over responses chan string @@ -205,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error) return nil, fmt.Errorf("invalid image index: %d", n) } - embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID) + embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data) if err != nil { return nil, err } @@ -368,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) defer s.mu.Unlock() var batch *llama.Batch - crossAttention := false seqIdx := s.nextSeq - 1 for range s.seqs { @@ -416,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) batch = tokenBatch } else { batch = embedBatch - seq.crossAttention = s.image.NeedCrossAttention(input) } - } else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention { + } else if embedding != batch.IsEmbedding() { s.nextSeq = seqIdx break } @@ -427,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) break } - crossAttention = seq.crossAttention batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id) seq.pendingInputs = append(seq.pendingInputs, input) seq.iBatch = batch.NumTokens() - 1 @@ -440,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) return nil } - s.lc.SetCrossAttention(crossAttention) - err := s.lc.Decode(batch) if err != nil { return fmt.Errorf("failed to decode batch: %w", err) } - if crossAttention { - // synchronize state to ensure the cross attention batch is complete. - // needed specifically for multi-GPU systems otherwise an inflight - // task may be incorrectly invalidated causing a crash - s.lc.Synchronize() - } - for i, seq := range s.seqs { if seq == nil { continue @@ -622,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return } - seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...) 
- s.seqs[i] = seq s.cond.Signal() found = true diff --git a/server/prompt.go b/server/prompt.go index 5b5b958f1..147a02b69 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -3,47 +3,32 @@ package server import ( "bytes" "context" - "encoding/binary" "errors" "fmt" "log/slog" + "slices" "strings" "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/template" ) type tokenizeFunc func(context.Context, string) ([]int, error) -var errTooManyImages = errors.New("vision model only supports a single image per message") - // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn. // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the // latest message and 2) system messages func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) { var system []api.Message - isMllama := checkMllamaModelFamily(m) - - var imageNumTokens int // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent - if isMllama { - // Our mllama implementation packs all of the embeddings into a single token - imageNumTokens = 1 - } else { - // Clip images are represented as 768 tokens, each an embedding - imageNumTokens = 768 - } + // Clip images are represented as 768 tokens, each an embedding + imageNumTokens := 768 n := len(msgs) - 1 // in reverse, find all messages that fit into context window for i := n; i >= 0; i-- { - if isMllama && len(msgs[i].Images) > 1 { - return "", nil, errTooManyImages - } - // always include the last message if i == n { continue @@ -84,48 +69,17 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. currMsgIdx := n for cnt, msg := range msgs[currMsgIdx:] { - prefix := "" - imgPrompt := "" + if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 { + return "", nil, errors.New("this model only supports one image while more than one image requested") + } + + var prefix string prompt := msg.Content for _, i := range msg.Images { - var imgData llm.ImageData - - if isMllama { - if len(m.ProjectorPaths) == 0 { - imgData = llm.ImageData{ - ID: len(images), - Data: i, - } - } else { - data, opts, err := mllama.Preprocess(bytes.NewReader(i)) - if err != nil { - return "", nil, err - } - - buf := new(bytes.Buffer) - err = binary.Write(buf, binary.LittleEndian, data) - if err != nil { - return "", nil, err - } - - ar, ok := opts["aspectRatioIndex"].(int) - if !ok { - return "", nil, fmt.Errorf("missing aspect ratio for image") - } - - imgData = llm.ImageData{ - ID: len(images), - Data: buf.Bytes(), - AspectRatioID: ar, - } - } - imgPrompt = "<|image|>" - } else { - imgData = llm.ImageData{ - ID: len(images), - Data: i, - } + imgData := llm.ImageData{ + ID: len(images), + Data: i, } imgTag := fmt.Sprintf("[img-%d]", imgData.ID) @@ -137,7 +91,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. images = append(images, imgData) } - msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt + msgs[currMsgIdx+cnt].Content = prefix + prompt } // truncate any messages that do not fit into the context window @@ -148,12 +102,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. 
return b.String(), images, nil } - -func checkMllamaModelFamily(m *Model) bool { - for _, arch := range m.Config.ModelFamilies { - if arch == "mllama" { - return true - } - } - return false -} diff --git a/server/prompt_test.go b/server/prompt_test.go index b81c01eef..fb6c96c0c 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -2,8 +2,6 @@ package server import ( "bytes" - "image" - "image/png" "testing" "github.com/google/go-cmp/cmp" @@ -14,10 +12,9 @@ import ( func TestChatPrompt(t *testing.T) { type expect struct { - prompt string - images [][]byte - aspectRatioID int - error error + prompt string + images [][]byte + error error } tmpl, err := template.Parse(` @@ -28,28 +25,6 @@ func TestChatPrompt(t *testing.T) { t.Fatal(err) } visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}} - mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}} - - createImg := func(width, height int) ([]byte, error) { - img := image.NewRGBA(image.Rect(0, 0, width, height)) - var buf bytes.Buffer - - if err := png.Encode(&buf, img); err != nil { - return nil, err - } - - return buf.Bytes(), nil - } - - imgBuf, err := createImg(5, 5) - if err != nil { - t.Fatal(err) - } - - imgBuf2, err := createImg(6, 6) - if err != nil { - t.Fatal(err) - } cases := []struct { name string @@ -227,90 +202,6 @@ func TestChatPrompt(t *testing.T) { images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")}, }, }, - { - name: "messages with mllama (no images)", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."}, - }, - expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ", - }, - }, - { - name: "messages with mllama single prompt", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}}, - }, - expect: expect{ - prompt: "[img-0]<|image|>How many hotdogs are in this image? ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "messages with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}}, - }, - expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "multiple messages with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}}, - }, - expect: expect{ - prompt: "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. 
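// Sketch (not part of the patch): with the mllama-specific preprocessing removed, chatPrompt above
// handles every image the same way -- assign it the next sequential ID and prefix the owning
// message with an "[img-N]" tag, which is the format the remaining prompt tests expect. A
// simplified stand-alone version with hypothetical types (the elided parts of the real loop, such
// as handling tags already present in the text, are skipped here):
package main

import "fmt"

type message struct {
	content string
	images  [][]byte
}

func main() {
	msgs := []message{
		{content: "How many hotdogs are in this image?", images: [][]byte{[]byte("png bytes")}},
	}

	var images [][]byte
	for i := range msgs {
		var prefix string
		for _, img := range msgs[i].images {
			prefix += fmt.Sprintf("[img-%d]", len(images))
			images = append(images, img)
		}
		msgs[i].content = prefix + msgs[i].content
	}

	fmt.Println(msgs[0].content) // [img-0]How many hotdogs are in this image?
}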
", - images: [][]byte{imgBuf, imgBuf2}, - aspectRatioID: 1, - }, - }, - { - name: "earlier image with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}}, - {Role: "assistant", Content: "There are four hotdogs."}, - {Role: "user", Content: "Which ones have mustard?"}, - }, - expect: expect{ - prompt: "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "too many images with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}}, - }, - expect: expect{ - error: errTooManyImages, - }, - }, } for _, tt := range cases { @@ -341,10 +232,6 @@ func TestChatPrompt(t *testing.T) { if !bytes.Equal(images[i].Data, tt.images[i]) { t.Errorf("expected %q, got %q", tt.images[i], images[i].Data) } - } else { - if images[i].AspectRatioID != tt.aspectRatioID { - t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID) - } } } }) diff --git a/server/routes.go b/server/routes.go index fd65669a2..d0b8f487e 100644 --- a/server/routes.go +++ b/server/routes.go @@ -4,7 +4,6 @@ import ( "bytes" "cmp" "context" - "encoding/binary" "encoding/json" "errors" "fmt" @@ -35,7 +34,6 @@ import ( "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" - "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" @@ -100,6 +98,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C return nil, nil, nil, err } + if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 { + return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. 
To re-download, run 'ollama pull llama3.2-vision'") + } + if err := model.CheckCapabilities(caps...); err != nil { return nil, nil, nil, fmt.Errorf("%s %w", name, err) } @@ -206,38 +208,14 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } - isMllama := checkMllamaModelFamily(m) - if isMllama && len(req.Images) > 1 { - c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"}) + if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"}) return } images := make([]llm.ImageData, len(req.Images)) for i := range req.Images { - if isMllama && len(m.ProjectorPaths) > 0 { - data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i])) - if err != nil { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - ar, ok := opts["aspectRatioIndex"].(int) - if !ok { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - buf := new(bytes.Buffer) - err = binary.Write(buf, binary.LittleEndian, data) - if err != nil { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar} - } else { - images[i] = llm.ImageData{ID: i, Data: req.Images[i]} - } + images[i] = llm.ImageData{ID: i, Data: req.Images[i]} } prompt := req.Prompt @@ -269,9 +247,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { for _, i := range images { imgPrompt := "" - if isMllama { - imgPrompt = "<|image|>" - } msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)}) } diff --git a/server/sched.go b/server/sched.go index 43da138e2..3fc54e55a 100644 --- a/server/sched.go +++ b/server/sched.go @@ -8,6 +8,7 @@ import ( "os" "reflect" "runtime" + "slices" "sort" "strconv" "strings" @@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) { continue } numParallel := int(envconfig.NumParallel()) - // TODO (jmorganca): mllama doesn't support parallel yet - // see https://github.com/ollama/ollama/issues/4165 - if checkMllamaModelFamily(pending.model) && numParallel != 1 { + // `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1 + // ref: https://github.com/ollama/ollama/issues/4165 + if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 { numParallel = 1 - slog.Warn("mllama doesn't support parallel requests yet") + slog.Warn("mllama does not currently support parallel requests") } for {
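// Sketch (not part of the patch): both routes.go and sched.go above now branch on the "mllama"
// entry in the model's family list via slices.Contains -- once to reject old projector-based
// llama3.2-vision blobs, and once to clamp num_parallel to 1 because of the shared encoder cache.
// A minimal stand-alone illustration with a hypothetical config struct (not the real Model type):
package main

import (
	"fmt"
	"slices"
)

type modelConfig struct {
	ModelFamilies  []string
	ProjectorPaths []string
}

func main() {
	m := modelConfig{ModelFamilies: []string{"mllama"}, ProjectorPaths: nil}
	numParallel := 4

	if slices.Contains(m.ModelFamilies, "mllama") && len(m.ProjectorPaths) > 0 {
		fmt.Println("legacy mllama model: ask the user to re-pull it")
		return
	}

	if slices.Contains(m.ModelFamilies, "mllama") && numParallel != 1 {
		numParallel = 1 // mllama's encoder cache cannot be shared across parallel sequences
	}

	fmt.Println("num_parallel:", numParallel)
}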