chore: update mllama to use ollama engine (#10637)

Author: Michael Yang
Date: 2025-05-13 17:36:02 -07:00 (committed by GitHub)
commit 23125648b8
parent 0478d440f0
GPG Key ID: B5690EEEBB952194

67 changed files with 785 additions and 4354 deletions


@@ -15,11 +15,13 @@ help:
	@echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"

.PHONY: sync
-sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml
+sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal

-.PHONY: llama/build-info.cpp
-llama/build-info.cpp: llama/build-info.cpp.in
-	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
+llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
+
+ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
+	go generate ./$(@D)

.PHONY: llama/llama.cpp
llama/llama.cpp: llama/vendor/
@@ -30,12 +32,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

PATCHES=$(wildcard llama/patches/*.patch)
+PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))

.PHONY: apply-patches
.NOTPARALLEL:
-apply-patches: $(addsuffix ed, $(PATCHES))
+apply-patches: $(PATCHED)

-%.patched: %.patch
+llama/patches/.%.patched: llama/patches/%.patch
	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi

.PHONY: checkout
@@ -57,4 +60,4 @@ format-patches: llama/patches
.PHONE: clean
clean: checkout
-	$(RM) $(addsuffix ed, $(PATCHES))
+	$(RM) llama/patches/.*.patched


@@ -1,6 +1,7 @@
package convert

import (
+	"cmp"
	"encoding/json"
	"errors"
	"fmt"
@@ -14,13 +15,12 @@ import (
)

type ModelParameters struct {
	Architectures []string `json:"architectures"`
	VocabSize uint32 `json:"vocab_size"`
-	TextModel TextParameters `json:"text_config"`
-}

-type TextParameters struct {
-	VocabSize uint32 `json:"vocab_size"`
+	TextModel struct {
+		VocabSize uint32 `json:"vocab_size"`
+	} `json:"text_config"`
}

type AdapterParameters struct {
@@ -173,6 +173,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
	switch p.Architectures[0] {
	case "LlamaForCausalLM":
		conv = &llamaModel{}
+	case "MllamaForConditionalGeneration":
+		conv = &mllamaModel{}
	case "Llama4ForConditionalGeneration":
		conv = &llama4Model{}
	case "Mistral3ForConditionalGeneration":
@@ -212,24 +214,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
		return err
	}

-	vocabSize := int(p.VocabSize)
-	if vocabSize == 0 {
-		tVocabSize := int(p.TextModel.VocabSize)
-		vocabSize = tVocabSize
-	}
+	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))

	switch {
	case vocabSize == 0:
-		slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
	case vocabSize > len(t.Vocabulary.Tokens):
-		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
		for i := range vocabSize - len(t.Vocabulary.Tokens) {
			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
		}
	case vocabSize < len(t.Vocabulary.Tokens):
-		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+		slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
+		p.VocabSize = uint32(len(t.Vocabulary.Tokens))
+		p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
	default:
		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
	}
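
Aside, not part of the commit: a minimal sketch of how cmp.Or resolves the vocabulary size above; the sizes are made-up example values. cmp.Or returns the first of its arguments that is not the zero value, so an explicit top-level vocab_size still wins over text_config.vocab_size.

package main

import (
	"cmp"
	"fmt"
)

func main() {
	// no top-level vocab_size set: fall back to the nested text_config value
	var vocabSize, textVocabSize uint32 = 0, 128256
	fmt.Println(cmp.Or(vocabSize, textVocabSize)) // 128256

	// explicit top-level vocab_size takes precedence
	vocabSize = 32000
	fmt.Println(cmp.Or(vocabSize, textVocabSize)) // 32000
}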

convert/convert_mllama.go (new file, 160 lines)

@@ -0,0 +1,160 @@
package convert
import (
"strings"
"github.com/ollama/ollama/fs/ggml"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
)
type mllamaModel struct {
ModelParameters
TextModel struct {
llamaModel
CrossAttentionLayers []int32 `json:"cross_attention_layers"`
} `json:"text_config"`
VisionModel struct {
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumGlobalLayers uint32 `json:"num_global_layers"`
IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"attention_heads"`
ImageSize uint32 `json:"image_size"`
PatchSize uint32 `json:"patch_size"`
NumChannels uint32 `json:"num_channels"`
MaxNumTiles uint32 `json:"max_num_tiles"`
NormEpsilon float32 `json:"norm_eps"`
RopeTheta float32 `json:"rope.freq_base"`
} `json:"vision_config"`
}
func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "mllama"
for k, v := range m.TextModel.KV(t) {
if strings.HasPrefix(k, "llama.") {
kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
}
}
kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers
kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices
kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize
kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon
kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize
kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels
return kv
}
func (m *mllamaModel) Replacements() []string {
return append(
m.TextModel.Replacements(),
"language_model.", "",
"gate_attn", "attn_gate",
"gate_ffn", "ffn_gate",
"cross_attn.", "cross_attn_",
"vision_model", "v",
"class_embedding", "class_embd",
"patch_embedding", "patch_embd",
"gated_positional_embedding.tile_embedding", "tile_position_embd",
"gated_positional_embedding.embedding", "position_embd.weight",
"gated_positional_embedding", "position_embd",
"embedding.weight", "weight",
"pre_tile_positional_embedding", "pre_tile_position_embd",
"post_tile_positional_embedding", "post_tile_position_embd",
"layernorm_pre", "pre_ln",
"layernorm_post", "post_ln",
"global_transformer.layers", "global.blk",
"transformer.layers", "blk",
"mlp.fc1", "ffn_up",
"mlp.fc2", "ffn_down",
"multi_modal_projector", "mm.0",
)
}
func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
var text []Tensor
for _, t := range ts {
if t.Name() == "v.position_embd.gate" {
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
tt := t.Clone()
tt.SetRepacker(m.repack(name))
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: tt,
})
}
} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
t.SetRepacker(m.repack(t.Name()))
out = append(out, &ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
out = append(out, &ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
} else {
text = append(text, t)
}
}
return append(out, m.TextModel.Tensors(text)...)
}
func (m *mllamaModel) repack(name string) Repacker {
return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
dims := make([]int, len(shape))
for i, dim := range shape {
dims[i] = int(dim)
}
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
t, err = tensor.Tanh(t)
if err != nil {
return nil, err
}
if name == "v.position_embd.gate" {
t, err = tensor.Sub(float32(1), t)
if err != nil {
return nil, err
}
}
t = tensor.Materialize(t)
// flatten tensor so it can be returned as a vector
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
return nil, err
}
return native.VectorF32(t.(*tensor.Dense))
}
}
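
Aside, not part of the commit: a standalone sketch of the element-wise math the repacker above applies to the gate tensors, written without the tensor package. Only v.position_embd.gate is flipped to 1 - tanh(g); the tile gates keep tanh(g).

package main

import (
	"fmt"
	"math"
)

// gate applies tanh to every element of a gate tensor; for
// "v.position_embd.gate" the value is flipped to 1 - tanh(g),
// mirroring the tensor.Sub(float32(1), t) call in the repacker.
func gate(name string, data []float32) []float32 {
	out := make([]float32, len(data))
	for i, v := range data {
		g := float32(math.Tanh(float64(v)))
		if name == "v.position_embd.gate" {
			g = 1 - g
		}
		out[i] = g
	}
	return out
}

func main() {
	fmt.Println(gate("v.position_embd.gate", []float32{0.5}))      // 1 - tanh(0.5)
	fmt.Println(gate("v.tile_position_embd.gate", []float32{0.5})) // tanh(0.5)
}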


@@ -38,7 +38,10 @@ const (
func (t tensorBase) Kind() uint32 {
	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
		t.name == "token_types.weight" ||
-		t.name == "v.positional_embedding_vlm" {
+		t.name == "v.positional_embedding_vlm" ||
+		t.name == "v.tile_position_embd.weight" ||
+		t.name == "v.pre_tile_position_embd.weight" ||
+		t.name == "v.post_tile_position_embd.weight" {
		// these tensors are always F32
		return 0
	}


@@ -125,6 +125,7 @@ func (kv KV) OllamaEngineRequired() bool {
		"gemma3",
		"mistral3",
		"llama4",
+		"mllama",
	}, kv.Architecture())
}


@@ -258,7 +258,6 @@ extern "C" {
        llama_token * token;
        float * embd;
-       int32_t n_embd;
        llama_pos * pos;
        int32_t * n_seq_id;
        llama_seq_id ** seq_id;
@@ -366,7 +365,6 @@ extern "C" {
        bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
        bool no_perf; // whether to measure performance timings
        bool op_offload; // whether to offload host tensor operations to device
-       bool cross_attn; // whether to use cross attention
    };
// model quantization parameters // model quantization parameters
@@ -466,10 +464,6 @@ extern "C" {
                     struct llama_context_params params),
            "use llama_init_from_model instead");

-    // TODO (jmorganca): this should most likely be passed in as part of a batch
-    // and not set on the context for all batches.
-    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
-
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);


@@ -6,7 +6,6 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA, "llama" },
-   { LLM_ARCH_MLLAMA, "mllama" },
    { LLM_ARCH_LLAMA4, "llama4" },
    { LLM_ARCH_DECI, "deci" },
    { LLM_ARCH_FALCON, "falcon" },
@@ -145,7 +144,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
-   { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -275,40 +273,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
{
LLM_ARCH_MLLAMA,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" },
{ LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" },
{ LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" },
{ LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" },
{ LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" },
{ LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" },
{ LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
{ LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" },
},
},
{
LLM_ARCH_DECI,
{
@@ -1737,14 +1701,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    // this tensor is loaded for T5, but never used
    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
    {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-   {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-   {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-   {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-   {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-   {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-   {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-   {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-   {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
    {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},


@@ -11,7 +11,6 @@
enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_LLAMA4,
-   LLM_ARCH_MLLAMA,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
@@ -149,7 +148,6 @@ enum llm_kv {
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
-   LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -351,14 +349,6 @@ enum llm_tensor {
    LLM_TENSOR_CLS,
    LLM_TENSOR_CLS_OUT,
    LLM_TENSOR_BSKCN_TV,
-   LLM_TENSOR_CROSS_ATTN_K_NORM,
-   LLM_TENSOR_CROSS_ATTN_K_PROJ,
-   LLM_TENSOR_CROSS_ATTN_O_PROJ,
-   LLM_TENSOR_CROSS_ATTN_Q_NORM,
-   LLM_TENSOR_CROSS_ATTN_Q_PROJ,
-   LLM_TENSOR_CROSS_ATTN_V_PROJ,
-   LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
-   LLM_TENSOR_CROSS_ATTN_MLP_GATE,
    LLM_TENSOR_CONV1D,
    LLM_TENSOR_CONVNEXT_DW,
    LLM_TENSOR_CONVNEXT_NORM,


@@ -320,7 +320,6 @@ struct llama_batch llama_batch_get_one(
        /*n_tokens =*/ n_tokens,
        /*tokens =*/ tokens,
        /*embd =*/ nullptr,
-       /*n_embd =*/ 0,
        /*pos =*/ nullptr,
        /*n_seq_id =*/ nullptr,
        /*seq_id =*/ nullptr,
@@ -333,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
        /*n_tokens =*/ 0,
        /*tokens =*/ nullptr,
        /*embd =*/ nullptr,
-       /*n_embd =*/ 0,
        /*pos =*/ nullptr,
        /*n_seq_id =*/ nullptr,
        /*seq_id =*/ nullptr,
@@ -342,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
    if (embd) {
        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
-       batch.n_embd = embd;
    } else {
        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
    }


@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
        }
-       return logits + j*model.hparams.n_vocab;
+       return logits + j*model.vocab.n_tokens();
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -632,10 +632,6 @@ void llama_context::set_warmup(bool value) {
    cparams.warmup = value;
}

-void llama_context::set_cross_attn(bool value) {
-    cparams.cross_attn = value;
-}
-
void llama_context::set_adapter_lora(
        llama_adapter_lora * adapter,
        float scale) {
@@ -713,7 +709,7 @@ int llama_context::encode(llama_batch & inp_batch) {
    const int64_t n_embd = hparams.n_embd;

-   llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
+   llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);

    const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -867,9 +863,10 @@ int llama_context::decode(llama_batch & inp_batch) {
    const llama_batch & batch = batch_allocr.batch;

+   const auto & vocab = model.vocab;
    const auto & hparams = model.hparams;

-   const int32_t n_vocab = hparams.n_vocab;
+   const int32_t n_vocab = vocab.n_tokens();

    const int64_t n_tokens_all = batch.n_tokens;
    const int64_t n_embd = hparams.n_embd;
@@ -1093,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
    // make the outputs have the same order they had in the user-provided batch
    // note: this is mostly relevant for recurrent models atm
    if (!sorted_output) {
-       const uint32_t n_vocab = model.hparams.n_vocab;
+       const uint32_t n_vocab = model.vocab.n_tokens();
        const uint32_t n_embd = model.hparams.n_embd;

        GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -1148,11 +1145,12 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
    const auto & hparams = model.hparams;
+   const auto & vocab = model.vocab;

    const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());

    const auto n_batch = cparams.n_batch;
-   const auto n_vocab = hparams.n_vocab;
+   const auto n_vocab = vocab.n_tokens();
    const auto n_embd = hparams.n_embd;

    // TODO: use a per-batch flag for logits presence instead
@@ -1687,7 +1685,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
    {
        LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);

-       const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);
+       const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());

        io.write(&logits_size, sizeof(logits_size));
@@ -2099,7 +2097,6 @@ llama_context_params llama_context_default_params() {
        /*.flash_attn =*/ false,
        /*.no_perf =*/ true,
        /*.op_offload =*/ true,
-       /*.cross_attn =*/ false,
    };

    return result;
@@ -2225,10 +2222,6 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
    ctx->set_warmup(warmup);
}

-void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
-    ctx->set_cross_attn(cross_attention);
-}
-
void llama_synchronize(llama_context * ctx) {
    ctx->synchronize();
}


@@ -72,7 +72,6 @@ struct llama_context {
    void set_embeddings (bool value);
    void set_causal_attn(bool value);
    void set_warmup(bool value);
-   void set_cross_attn(bool value);

    void set_adapter_lora(
            llama_adapter_lora * adapter,


@@ -31,7 +31,6 @@ struct llama_cparams {
    bool no_perf;
    bool warmup;
    bool op_offload;
-   bool cross_attn;

    enum llama_pooling_type pooling_type;


@@ -532,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
    }
}

-void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
-    if (ubatch->embd) {
-        ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
-    }
-}
-
//
// llm_graph_context
//
@@ -1520,25 +1514,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
    return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}

-ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
-    const int64_t n_embd = hparams.n_embd;
-
-    auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
-
-    ggml_tensor * cur = nullptr;
-
-    inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
-    ggml_set_input(inp->cross_attn_state);
-
-    cur = inp->cross_attn_state;
-    cb(cur, "inp_cross_attn_state", -1);
-
-    res->add_input(std::move(inp));
-
-    return cur;
-}
-
ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_cross * inp,
        ggml_cgraph * gf,


@@ -87,7 +87,6 @@ public:
    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
-   ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
};
class llm_graph_input_pos : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i {
@@ -285,16 +284,6 @@ public:
    const llama_cross * cross = nullptr;
};

-class llm_graph_input_cross_attn_state : public llm_graph_input_i {
-public:
-    llm_graph_input_cross_attn_state() = default;
-    virtual ~llm_graph_input_cross_attn_state() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
-};
-
//
// llm_graph_result
//
@@ -506,7 +495,6 @@ struct llm_graph_context {
    ggml_tensor * build_inp_cls() const;
    ggml_tensor * build_inp_s_copy() const;
    ggml_tensor * build_inp_s_mask() const;
-   ggml_tensor * build_inp_cross_attn_state() const;

    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;


@@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const {
    GGML_ABORT("fatal error");
}
-
-bool llama_hparams::cross_attention_layers(uint32_t il) const {
-    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
-}


@@ -2,8 +2,6 @@
#include "llama.h"

-#include <algorithm>
#include <array>

// bump if necessary
@@ -44,7 +42,6 @@ struct llama_hparams {
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;
-   uint32_t n_vocab = 0;

    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
    uint32_t n_embd_head_k_mla = 0;
@@ -59,7 +56,6 @@ struct llama_hparams {
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

    std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
-   std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;

    uint32_t n_layer_dense_lead = 0;
    uint32_t n_lora_q = 0;
@@ -163,9 +159,6 @@ struct llama_hparams {
    // Block skip connection
    bool n_bskcn(uint32_t n, uint32_t il) const;

-   // cross attention layers
-   bool cross_attention_layers(uint32_t il) const;
-
    bool is_swa(uint32_t il) const;
};


@@ -100,16 +100,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

-       ggml_tensor * k, *v;
-
-       // for cross attention layers
-       if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
-           k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
-           v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
-       } else {
-           k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-           v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
-       }
+       ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+       ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);

        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        k_l.push_back(k);
@@ -459,7 +451,7 @@ void llama_kv_cache_unified::set_full() {
llama_sbatch llama_kv_cache_unified::sbatch_init(
        const llama_batch & batch,
        bool logits_all) {
-   return llama_sbatch(batch, batch.n_embd, true, logits_all);
+   return llama_sbatch(batch, hparams.n_embd, true, logits_all);
}

llama_ubatch llama_kv_cache_unified::ubatch_next(


@@ -315,8 +315,6 @@ namespace GGUFMeta {
        return true;
    }

-   template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
-
    template<typename T, size_t N_MAX>
    bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
        const int kid = gguf_find_key(meta.get(), key.c_str());


@@ -433,7 +433,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
-   ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
@@ -445,7 +444,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-   ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -469,11 +467,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
-   std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
-   ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -526,7 +522,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

-   if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+   if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
        if (hparams.n_rot != hparams.n_embd_head_k) {
            throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
        }
@@ -589,16 +585,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.use_kq_norm = false;
            }
        } break;
-       case LLM_ARCH_MLLAMA:
-           {
-               ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-               switch (hparams.n_layer) {
-                   case 40: type = LLM_TYPE_11B; break;
-                   case 100: type = LLM_TYPE_90B; break;
-                   default: type = LLM_TYPE_UNKNOWN;
-               }
-           } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1595,7 +1581,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    const int64_t n_embd_head_v = hparams.n_embd_head_v;
    const int64_t n_ff = hparams.n_ff();
    const int64_t n_embd_gqa = n_embd_v_gqa;
-   const int64_t n_vocab = hparams.n_vocab;
+   const int64_t n_vocab = vocab.n_tokens();
    const int64_t n_token_types = vocab.n_token_types();
    const int64_t n_rot = hparams.n_rot;
    const int64_t n_expert = hparams.n_expert;
@@ -1854,52 +1840,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
case LLM_ARCH_MLLAMA:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
// output
{
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
if (hparams.cross_attention_layers(i)) {
layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
} else {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
}
} break;
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4816,246 +4756,6 @@ struct llm_build_llama : public llm_graph_context {
}
};
struct llm_build_mllama: public llm_graph_context {
llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inpCAS;
inpL = build_inp_embd(model.tok_embd);
inpCAS = build_inp_cross_attn_state();
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
if (hparams.cross_attention_layers(il)) {
if (!ubatch.embd && !cparams.cross_attn) {
continue;
}
// cross attention layer
ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur", il);
Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
cb(Qcur, "Qcur", il);
Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur, * Vcur;
if (ubatch.embd) {
Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
cb(Kcur, "Kcur", il);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
cb(Kcur, "Kcur", il);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
cb(Kcur, "Kcur", il);
Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
cb(Vcur, "Vcur", il);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
cb(Vcur, "Vcur", il);
Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
cb(Vcur, "Vcur", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
} else {
Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
cb(Kcur, "Kcur (view)", il);
Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
cb(Vcur, "Vcur (view)", il);
}
struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
cb(kq, "kq", il);
// TODO: apply causal masks
struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
cb(kq_soft_max, "kq_soft_max", il);
Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
cb(Vcur, "Vcur", il);
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
cb(kqv, "kqv", il);
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il);
cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
cb(cur, "cur", il);
// TODO: do this in place once?
cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
// TODO: do this inplace once?
cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
} else {
// self attention layer
// rope freq factors for llama3; may return nullptr for llama2 and other models
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13428,10 +13128,6 @@ llm_graph_result_ptr llama_model::build_graph(
            {
                llm = std::make_unique<llm_build_llama>(*this, params, gf);
            } break;
-       case LLM_ARCH_MLLAMA:
-           {
-               llm = std::make_unique<llm_build_mllama>(*this, params, gf);
-           } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13793,7 +13489,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLAMA4:
-       case LLM_ARCH_MLLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:


@@ -11,7 +11,6 @@
#include <string>
#include <unordered_map>
#include <vector>
-#include <stdexcept>

struct llama_cparams;
struct llama_ubatch;
@@ -75,7 +74,6 @@ enum llm_type {
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
-   LLM_TYPE_90B,
    LLM_TYPE_236B,
    LLM_TYPE_290B,
    LLM_TYPE_314B,
@@ -320,16 +318,6 @@ struct llama_layer {
    struct ggml_tensor * bskcn_tv = nullptr;

-   // cross attention
-   struct ggml_tensor * cross_attn_k_norm = nullptr;
-   struct ggml_tensor * cross_attn_k_proj = nullptr;
-   struct ggml_tensor * cross_attn_o_proj = nullptr;
-   struct ggml_tensor * cross_attn_q_norm = nullptr;
-   struct ggml_tensor * cross_attn_q_proj = nullptr;
-   struct ggml_tensor * cross_attn_v_proj = nullptr;
-   struct ggml_tensor * cross_attn_attn_gate = nullptr;
-   struct ggml_tensor * cross_attn_mlp_gate = nullptr;
-
    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;


@@ -639,9 +639,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        if (llama_model_has_encoder(&model)) {
            n_attn_layer *= 3;
        }
-       if (qs.n_attention_wv != n_attn_layer) {
-           LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
-       }
+       GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
    }

    size_t total_size_org = 0;


@@ -462,7 +462,7 @@ struct llava_embd_batch {
    std::vector<llama_seq_id *> seq_ids;
    std::vector<int8_t> logits;
    llama_batch batch;
-   llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+   llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
        pos .resize(n_tokens);
        n_seq_id.resize(n_tokens);
        seq_ids .resize(n_tokens + 1);
@@ -474,7 +474,6 @@ struct llava_embd_batch {
            /*n_tokens =*/ n_tokens,
            /*tokens =*/ nullptr,
            /*embd =*/ embd,
-           /*n_embd =*/ n_embd,
            /*pos =*/ pos.data(),
            /*n_seq_id =*/ n_seq_id.data(),
            /*seq_id =*/ seq_ids.data(),
@@ -498,7 +497,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
            n_eval = n_batch;
        }
        float * embd = image_embed->embed+i*n_embd;
-       llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
+       llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
        if (llama_decode(ctx_llama, llava_batch.batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;


@@ -17,7 +17,6 @@ package llama
#include "llava.h"
#include "gguf.h"

-#include "mllama.h"
#include "sampling_ext.h"

extern bool llamaProgressCallback(float progress, void *user_data);
@@ -510,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
	return embed, nil
}
type MllamaContext struct {
c *C.struct_mllama_ctx
}
func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
c := C.mllama_model_load(mp, 1)
if c == nil {
return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
}
projEmbedSize := int(C.mllama_n_embd(c))
modelEmbedSize := llamaContext.Model().NEmbd()
if projEmbedSize != modelEmbedSize {
return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
}
return &MllamaContext{c: c}, nil
}
func (m *MllamaContext) Free() {
C.mllama_free(m.c)
}
func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
img := C.mllama_image_init()
defer C.mllama_image_free(img)
ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
if !ok {
return nil, errors.New("unable to load mllama image data")
}
rows := make([]float32, m.EmbedSize(llamaContext))
ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
if !ok {
return nil, errors.New("unable to make mllama embedding from image")
}
embed := make([][]float32, 1)
embed[0] = rows
return embed, nil
}
func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
numEmbed := llamaContext.Model().NEmbd()
return numTokens * numEmbed
}
func (c *Context) SetCrossAttention(state bool) {
C.llama_set_cross_attention(c.c, C.bool(state))
}
func (c *Context) Synchronize() {
	C.llama_synchronize(c.c)
}

llama/mllama.cpp (vendored file removed, 887 lines)

@@ -1,887 +0,0 @@
// NOTE: This is modified from clip.cpp for Mllama only
#include "mllama.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#include <algorithm>
#include <cmath>
#include <cstdarg>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <vector>
#define REQUIRE(x) \
do { \
if (!(x)) { \
throw std::runtime_error("REQUIRE failed: " #x); \
} \
} while (0)
#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#if __GLIBCXX__
#include <cstdio>
#include <ext/stdio_filebuf.h>
#include <fcntl.h>
#endif
#endif
struct mllama_image {
int width;
int height;
int num_channels = 3;
int num_tiles = 4;
int aspect_ratio_id;
std::vector<float> data;
};
static std::string format(const char *fmt, ...) {
va_list args;
va_start(args, fmt);
std::vector<char> b(128);
int n = vsnprintf(b.data(), b.size(), fmt, args);
REQUIRE(n >= 0 && n < b.size());
va_end(args);
return std::string(b.data(), b.size());
}
//
// utilities to get data from a gguf file
//
static int get_key_index(const gguf_context *ctx, const char *key) {
int key_index = gguf_find_key(ctx, key);
REQUIRE(key_index != -1);
return key_index;
}
static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
const int i = get_key_index(ctx, key.c_str());
const int n = gguf_get_arr_n(ctx, i);
const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i);
std::vector<uint32_t> s(n);
for (size_t j = 0; j < s.size(); j++) {
s[j] = data[j];
}
return s;
}
static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
}
static float get_f32(const gguf_context *ctx, const std::string &key) {
return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
}
static std::string get_ftype(int ftype) {
return ggml_type_name(static_cast<ggml_type>(ftype));
}
//
// mllama layers
//
struct mllama_hparams {
uint32_t image_size;
uint32_t patch_size;
uint32_t hidden_size;
uint32_t n_intermediate;
uint32_t projection_dim;
uint32_t n_head;
uint32_t n_layer;
uint32_t n_global_layer;
uint32_t n_tiles;
float eps;
std::vector<bool> intermediate_layers;
};
struct mllama_layer {
// attention
struct ggml_tensor *k_w;
struct ggml_tensor *k_b;
struct ggml_tensor *q_w;
struct ggml_tensor *q_b;
struct ggml_tensor *v_w;
struct ggml_tensor *v_b;
struct ggml_tensor *o_w;
struct ggml_tensor *o_b;
struct ggml_tensor *attn_gate;
// layernorm 1
struct ggml_tensor *ln_1_w;
struct ggml_tensor *ln_1_b;
// ff
struct ggml_tensor *ff_i_w;
struct ggml_tensor *ff_i_b;
struct ggml_tensor *ff_o_w;
struct ggml_tensor *ff_o_b;
struct ggml_tensor *ff_gate;
// layernorm 2
struct ggml_tensor *ln_2_w;
struct ggml_tensor *ln_2_b;
};
struct mllama_vision_model {
struct mllama_hparams hparams;
// embeddings
struct ggml_tensor *class_embedding;
struct ggml_tensor *patch_embeddings;
struct ggml_tensor *position_embeddings;
struct ggml_tensor *position_embeddings_gate;
struct ggml_tensor *tile_position_embeddings;
struct ggml_tensor *tile_position_embeddings_gate;
struct ggml_tensor *pre_tile_position_embeddings;
struct ggml_tensor *pre_tile_position_embeddings_gate;
struct ggml_tensor *post_tile_position_embeddings;
struct ggml_tensor *post_tile_position_embeddings_gate;
struct ggml_tensor *pre_ln_w;
struct ggml_tensor *pre_ln_b;
std::vector<mllama_layer> layers;
std::vector<mllama_layer> global_layers;
struct ggml_tensor *post_ln_w;
struct ggml_tensor *post_ln_b;
struct ggml_tensor *mm_0_w;
struct ggml_tensor *mm_0_b;
};
struct mllama_ctx {
struct mllama_vision_model vision_model;
uint32_t ftype = 1;
struct gguf_context *ctx_gguf;
struct ggml_context *ctx_data;
std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = nullptr;
ggml_backend_t backend = nullptr;
ggml_gallocr_t compute_alloc = nullptr;
};
static ggml_tensor *mllama_image_build_encoder_layer(
struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
struct ggml_tensor *cur = embeddings;
{
// layernorm1
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
ggml_set_name(cur, format("%d pre layernorm", il).c_str());
}
{
// self-attention
struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
if (layer.q_b != nullptr) {
Q = ggml_add(ctx0, Q, layer.q_b);
}
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
ggml_set_name(Q, format("%d query", il).c_str());
struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
if (layer.k_b != nullptr) {
K = ggml_add(ctx0, K, layer.k_b);
}
K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
ggml_set_name(K, format("%d key", il).c_str());
struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
if (layer.v_b != nullptr) {
V = ggml_add(ctx0, V, layer.v_b);
}
V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
ggml_set_name(V, format("%d value", il).c_str());
struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
KQ = ggml_soft_max_inplace(ctx0, KQ);
ggml_set_name(KQ, format("%d KQ", il).c_str());
struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
ggml_set_name(KQV, format("%d KQV", il).c_str());
cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
if (layer.o_b != nullptr) {
cur = ggml_add(ctx0, cur, layer.o_b);
}
ggml_set_name(cur, format("%d self attention", il).c_str());
if (layer.attn_gate != nullptr) {
cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
ggml_set_name(cur, format("%d self attention gate", il).c_str());
}
}
cur = ggml_add(ctx0, cur, embeddings);
ggml_set_name(cur, format("%d residual", il).c_str());
embeddings = cur;
{
// layernorm2
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
ggml_set_name(cur, format("%d post layernorm", il).c_str());
}
{
// feed forward
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
cur = ggml_gelu_inplace(ctx0, cur);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
ggml_set_name(cur, format("%d feed forward", il).c_str());
if (layer.ff_gate != nullptr) {
cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
ggml_set_name(cur, format("%d feed forward gate", il).c_str());
}
}
// residual 2
cur = ggml_add(ctx0, cur, embeddings);
ggml_set_name(cur, format("%d residual", il).c_str());
embeddings = cur;
return embeddings;
}
static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
const auto &model = ctx->vision_model;
const auto &hparams = model.hparams;
const int image_size = hparams.image_size;
const int image_size_width = image_size;
const int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int batch_size = imgs->size;
REQUIRE(batch_size == 1);
int num_tiles = 4;
int num_channels = 3;
if (imgs->data != nullptr) {
num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
}
struct ggml_init_params params = {
ctx->buf_compute_meta.size(), // mem_size
ctx->buf_compute_meta.data(), // mem_buffer
true, // no_alloc
};
struct ggml_context *ctx0 = ggml_init(params);
struct ggml_cgraph *gf = ggml_new_graph(ctx0);
struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
ggml_set_name(aspect_ratios, "aspect_ratios");
ggml_set_input(aspect_ratios);
if (model.pre_tile_position_embeddings != nullptr) {
struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");
pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
if (model.pre_tile_position_embeddings_gate != nullptr) {
pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
}
inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
}
struct ggml_tensor *embeddings = inp;
if (model.class_embedding != nullptr) {
// concat class_embeddings and patch_embeddings
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
for (int i = 0; i < num_tiles; ++i) {
// repeat class embeddings for each tile
embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
}
embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
if (model.position_embeddings_gate != nullptr) {
position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, position_embd);
if (model.tile_position_embeddings != nullptr) {
struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
ggml_set_name(tile_position_embeddings, "tile_position_embeddings");
tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
if (model.tile_position_embeddings_gate != nullptr) {
tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
}
// pre-layernorm
if (model.pre_ln_w != nullptr) {
embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
if (model.pre_ln_b != nullptr) {
embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
}
ggml_set_name(embeddings, "pre layernorm");
}
const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8;
embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);
std::vector<struct ggml_tensor *> intermediate_embeddings;
// encoder
for (size_t il = 0; il < model.layers.size(); il++) {
if (hparams.intermediate_layers[il]) {
intermediate_embeddings.push_back(embeddings);
}
embeddings = mllama_image_build_encoder_layer(
ctx0, il, model.layers[il], embeddings,
hparams.eps, hidden_size, batch_size, n_head, d_head);
}
// post-layernorm
if (model.post_ln_w != nullptr) {
embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
if (model.post_ln_b != nullptr) {
embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
}
ggml_set_name(embeddings, "post layernorm");
}
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
if (model.post_tile_position_embeddings != nullptr) {
struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");
post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
if (model.post_tile_position_embeddings_gate != nullptr) {
post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
}
embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
}
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);
// global encoder
for (size_t il = 0; il < model.global_layers.size(); il++) {
embeddings = mllama_image_build_encoder_layer(
ctx0, il, model.global_layers[il], embeddings,
hparams.eps, hidden_size, batch_size, n_head, d_head);
}
struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
}
stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);
// mllama projector
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
ggml_set_name(embeddings, "multi modal projector");
// build the graph
ggml_build_forward_expand(gf, embeddings);
ggml_free(ctx0);
return gf;
}
static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
REQUIRE(cur != nullptr || optional);
return cur;
}
static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
std::vector<struct mllama_layer> layers(n);
for (size_t i = 0; i < layers.size(); i++) {
auto &layer = layers[i];
layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false);
layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false);
layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false);
layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false);
layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false);
layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true);
layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false);
layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true);
layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false);
layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true);
layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false);
layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true);
layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false);
layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false);
layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false);
layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false);
layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true);
layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true);
}
return layers;
}
// read and create ggml_context containing the tensors and their data
struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
struct ggml_context *meta = nullptr;
struct gguf_init_params params = {
true, // no_alloc
&meta, // ctx
};
struct gguf_context *ctx = gguf_init_from_file(fname, params);
REQUIRE(ctx != nullptr);
if (verbosity >= 1) {
const int n_tensors = gguf_get_n_tensors(ctx);
const int n_kv = gguf_get_n_kv(ctx);
const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
const int idx_desc = get_key_index(ctx, "general.description");
const std::string description = gguf_get_val_str(ctx, idx_desc);
const int idx_name = gguf_find_key(ctx, "general.name");
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name);
LOG("model name: %s", name.c_str());
}
LOG("description: %s", description.c_str());
LOG("GGUF version: %d", gguf_get_version(ctx));
LOG("alignment: %zu", gguf_get_alignment(ctx));
LOG("n_tensors: %d", n_tensors);
LOG("n_kv: %d", n_kv);
LOG("ftype: %s", ftype.c_str());
LOG("");
}
const int n_tensors = gguf_get_n_tensors(ctx);
mllama_ctx *new_mllama = new mllama_ctx{};
ggml_backend_t backend = ggml_backend_init_best();
if (backend == nullptr) {
LOG("%s: failed to initialize backend\n", __func__);
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
new_mllama->backend = backend;
// load tensors
{
std::vector<uint8_t> read_buf;
struct ggml_init_params params = {
(n_tensors + 1) * ggml_tensor_overhead(), // mem_size
nullptr, // mem_buffer
true, // no_alloc
};
new_mllama->ctx_data = ggml_init(params);
if (!new_mllama->ctx_data) {
LOG("ggml_init() failed");
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
#ifdef _WIN32
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
if (!wlen) {
return NULL;
}
wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
if (!wlen) {
free(wbuf);
return NULL;
}
#if __GLIBCXX__
int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
__gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
std::istream fin(&buffer);
#else // MSVC
// unused in our current build
auto fin = std::ifstream(wbuf, std::ios::binary);
#endif
free(wbuf);
#else
auto fin = std::ifstream(fname, std::ios::binary);
#endif
if (!fin) {
LOG("cannot open model file for loading tensors\n");
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
// add tensors to context
for (int i = 0; i < n_tensors; ++i) {
const char *name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor *t = ggml_get_tensor(meta, name);
struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
ggml_set_name(cur, name);
}
// alloc memory and offload data
new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
for (int i = 0; i < n_tensors; ++i) {
const char *name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
if (!fin) {
LOG("failed to seek for tensor %s\n", name);
mllama_free(new_mllama);
gguf_free(ctx);
return nullptr;
}
int num_bytes = ggml_nbytes(cur);
if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
} else {
// read into a temporary buffer first, then copy to device memory
read_buf.resize(num_bytes);
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
#if defined(_WIN32) && defined(__GLIBCXX__)
close(fd);
#else
fin.close();
#endif
}
// vision model
// load vision model
auto &vision_model = new_mllama->vision_model;
auto &hparams = vision_model.hparams;
hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");
std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
hparams.intermediate_layers.resize(hparams.n_layer);
for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
}
if (verbosity >= 2) {
LOG("");
LOG("vision model hparams");
LOG("image_size %d", hparams.image_size);
LOG("patch_size %d", hparams.patch_size);
LOG("v_hidden_size %d", hparams.hidden_size);
LOG("v_n_intermediate %d", hparams.n_intermediate);
LOG("v_projection_dim %d", hparams.projection_dim);
LOG("v_n_head %d", hparams.n_head);
LOG("v_n_layer %d", hparams.n_layer);
LOG("v_n_global_layer %d", hparams.n_global_layer);
LOG("v_eps %f", hparams.eps);
}
vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);
vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);
vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);
vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);
vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);
vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);
vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);
vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);
ggml_free(meta);
new_mllama->ctx_gguf = ctx;
{
// measure mem requirement and allocate
new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
struct mllama_image_batch batch;
batch.size = 1;
ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
}
return new_mllama;
}
struct mllama_image *mllama_image_init() {
return new mllama_image();
}
void mllama_image_free(struct mllama_image *img) { delete img; }
void mllama_image_batch_free(struct mllama_image_batch *batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
}
}
bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
img->width = width;
img->height = height;
img->num_channels = num_channels;
img->num_tiles = num_tiles;
img->aspect_ratio_id = aspect_ratio_id;
img->data.resize(n);
memcpy(img->data.data(), data, n);
return true;
}
inline int mllama(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
void mllama_free(mllama_ctx *ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
delete ctx;
}
bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
mllama_image_batch imgs{};
imgs.size = 1;
imgs.data = img;
return mllama_image_batch_encode(ctx, n_threads, &imgs, vec);
}
bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
int batch_size = imgs->size;
REQUIRE(batch_size == 1);
// build the inference graph
ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
// set inputs
const auto &model = ctx->vision_model;
const auto &hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
{
struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
}
{
struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
if (embeddings != nullptr) {
void *zeros = malloc(ggml_nbytes(embeddings));
memset(zeros, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
free(zeros);
}
}
{
struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
if (positions != nullptr) {
int *positions_data = (int *)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
}
{
struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
if (aspect_ratios != nullptr) {
int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
free(aspect_ratios_data);
}
}
if (ggml_backend_is_cpu(ctx->backend)) {
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
}
ggml_backend_graph_compute(ctx->backend, gf);
// the last node is the embedding tensor
struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
return true;
}
int32_t mllama_image_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.image_size;
}
int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.patch_size;
}
int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.hidden_size;
}
int mllama_n_patches(const struct mllama_ctx *ctx) {
const auto &hparams = ctx->vision_model.hparams;
return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
}
int mllama_n_positions(const struct mllama_ctx *ctx) {
return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1);
}
int mllama_n_tiles(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.n_tiles;
}
int mllama_n_embd(const struct mllama_ctx *ctx) {
return ctx->vision_model.hparams.projection_dim;
}
size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float);
}
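
For orientation, the accessor helpers above compose into the size of the projector output buffer. The following is a minimal, hedged sketch of that arithmetic only; image_size=560, patch_size=14, max_num_tiles=4 and projection_dim=4096 are illustrative stand-ins, not asserted model parameters (the real values come from the mllama.vision.* GGUF keys read in mllama_model_load).

#include <cstddef>
#include <cstdio>

// Illustrative mirror of mllama_n_patches / mllama_n_positions / mllama_n_embd_bytes.
int main() {
    const int image_size = 560, patch_size = 14;   // hypothetical hparams
    const int n_tiles = 4, projection_dim = 4096;  // hypothetical hparams
    const bool has_class_embd = true;

    const int n_patches   = (image_size / patch_size) * (image_size / patch_size); // 1600
    const int n_positions = n_patches + (has_class_embd ? 1 : 0);                  // 1601
    const size_t out_bytes = (size_t)n_positions * projection_dim * n_tiles * sizeof(float);

    std::printf("positions=%d, output buffer=%zu bytes\n", n_positions, out_bytes);
    return 0;
}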

llama/mllama.h vendored (61 lines deleted)
View File

@ -1,61 +0,0 @@
#ifndef MLLAMA_H
#define MLLAMA_H
#include <stddef.h>
#include <stdint.h>
#ifdef LLAMA_SHARED
#if defined(_WIN32) && !defined(__MINGW32__)
#ifdef LLAMA_BUILD
#define MLLAMA_API __declspec(dllexport)
#else
#define MLLAMA_API __declspec(dllimport)
#endif
#else
#define MLLAMA_API __attribute__((visibility("default")))
#endif
#else
#define MLLAMA_API
#endif
#ifdef __cplusplus
extern "C" {
#endif
struct mllama_ctx;
struct mllama_image_batch {
struct mllama_image *data;
size_t size;
};
MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity);
MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity);
MLLAMA_API void mllama_free(struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx);
MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx);
MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx);
MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx);
MLLAMA_API struct mllama_image *mllama_image_init();
MLLAMA_API void mllama_image_free(struct mllama_image *img);
MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch);
MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img);
MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec);
MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec);
#ifdef __cplusplus
}
#endif
#endif // MLLAMA_H
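
Read end to end, the header above implies a call sequence like the one sketched below. This is only a usage sketch, not code from the repository: it assumes the caller has already preprocessed the image into tiled float32 pixel data (resizing and tiling happen elsewhere), and run_mllama, pixels, and the thread count are hypothetical names and values.

#include "mllama.h"
#include <vector>
#include <cstdio>

// Hedged usage sketch of the (removed) vendored API; error handling kept minimal.
int run_mllama(const char *model_path, const std::vector<float> &pixels,
               int width, int height, int channels, int tiles, int aspect_ratio_id) {
    struct mllama_ctx *ctx = mllama_model_load(model_path, /*verbosity=*/1);
    if (!ctx) return 1;

    struct mllama_image *img = mllama_image_init();
    // n is the byte count of the preprocessed float32 pixel buffer.
    mllama_image_load_from_data(pixels.data(), (int)(pixels.size() * sizeof(float)),
                                width, height, channels, tiles, aspect_ratio_id, img);

    // Output buffer sized as positions * embd * tiles * sizeof(float).
    std::vector<float> embd(mllama_n_embd_bytes(ctx) / sizeof(float));
    bool ok = mllama_image_encode(ctx, /*n_threads=*/4, img, embd.data());

    mllama_image_free(img);
    mllama_free(ctx);
    return ok ? 0 : 1;
}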

View File

@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644
+ // self-attention + // self-attention
+ { + {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models + // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ +
+ // compute Q and K and RoPE them + // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);

File diff suppressed because it is too large

View File

@ -1,419 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 13 Apr 2025 22:10:06 -0400
Subject: [PATCH] add unpad operator
adds the unpad operator to GGML
---
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 5 +++
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ops.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
ggml/src/ggml.c | 25 ++++++++++++-
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index e91dedf1..8dc107ba 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+ GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1781,6 +1782,15 @@ extern "C" {
int p0,
int p1);
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+ GGML_API struct ggml_tensor * ggml_unpad(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1,
+ int p2,
+ int p3);
+
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
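
As the comment and declaration above suggest, ggml_unpad is the shape-level inverse of ggml_pad: each pN trims that many trailing elements from dimension N of the result. A minimal sketch under that assumption (metadata only, no graph is executed; the tensor sizes are illustrative):

#include "ggml.h"

// Shape-only check of pad/unpad round-tripping; no_alloc avoids allocating tensor data.
int main() {
    struct ggml_init_params ip = { /*mem_size=*/1024 * 1024, /*mem_buffer=*/nullptr, /*no_alloc=*/true };
    struct ggml_context *ctx = ggml_init(ip);

    struct ggml_tensor *a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1601, 4096, 4, 1);
    struct ggml_tensor *p = ggml_pad(ctx, a, 7, 0, 0, 0);    // ne[0]: 1601 -> 1608
    struct ggml_tensor *u = ggml_unpad(ctx, p, 7, 0, 0, 0);  // ne[0]: back to 1601

    GGML_ASSERT(u->ne[0] == a->ne[0]);
    ggml_free(ctx);
    return 0;
}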
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a30e67f2..835e6495 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 955fec59..1868a10c 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
+// ggml_compute_forward_unpad
+
+static void ggml_compute_forward_unpad_f32(
+ const struct ggml_compute_params *params,
+ struct ggml_tensor *dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ float * dst_ptr = (float *) dst->data;
+
+ // TODO: optimize
+
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ dst_ptr[dst_idx] = *src_ptr;
+ }
+ }
+ }
+ }
+ }
+}
+
+void ggml_compute_forward_unpad(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_unpad_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+}
+
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index dc081b9e..a7125555 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cb0d8528..6fe86674 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
+ case GGML_OP_UNPAD:
+ ggml_cuda_op_unpad(ctx, dst);
+ break;
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index 77432b04..7d45a7e1 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
}
+
+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+ // blockIdx.y: idx of ne1
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (nidx >= ne0) {
+ return;
+ }
+
+ // operation
+ int offset_dst =
+ nidx +
+ blockIdx.y * ne0 +
+ blockIdx.z * ne0 * gridDim.y;
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+ int offset_src =
+ nidx +
+ blockIdx.y * ne00 +
+ blockIdx.z * ne00 * ne01;
+ dst[offset_dst] = x[offset_src];
+ }
+}
+
+static void unpad_f32_cuda(const float * x, float * dst,
+ const int ne00, const int ne01, const int ne02, const int ne03,
+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+ dim3 gridDim(num_blocks, ne1, ne2*ne3);
+ unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const float * src0_d = (const float *)src0->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+ unpad_f32_cuda(src0_d, dst_d,
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
\ No newline at end of file
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
index 8fd386b0..e2ededc3 100644
--- a/ggml/src/ggml-cuda/pad.cuh
+++ b/ggml/src/ggml-cuda/pad.cuh
@@ -3,3 +3,4 @@
#define CUDA_PAD_BLOCK_SIZE 256
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 1b56f858..7641247e 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+ } break;
+ case GGML_OP_UNPAD:
+ {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
+
+ [encoder setComputePipelineState:pipeline];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
+
+ const int nth = MIN(1024, ne0);
+
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 9cfddf45..080a943b 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
+kernel void kernel_unpad_f32(
+ device const char * src0,
+ device char * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant int64_t & ne03,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant uint64_t & nb03,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant int64_t & ne2,
+ constant int64_t & ne3,
+ constant uint64_t & nb0,
+ constant uint64_t & nb1,
+ constant uint64_t & nb2,
+ constant uint64_t & nb3,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]],
+ uint3 ntg[[threads_per_threadgroup]]) {
+
+ const int64_t i3 = tgpig.z;
+ const int64_t i2 = tgpig.y;
+ const int64_t i1 = tgpig.x;
+
+ const int64_t i03 = i3;
+ const int64_t i02 = i2;
+ const int64_t i01 = i1;
+
+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
+
+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+ if (i0 < ne00) {
+ dst_ptr[i0] = src0_ptr[i0];
+ }
+ }
+
+ return;
+ }
+}
+
kernel void kernel_arange_f32(
device char * dst,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 8a654624..6b034d35 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
+ "UNPAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
+ "unpad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
+// ggml_unpad
+
+struct ggml_tensor * ggml_unpad(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0, int p1, int p2, int p3) {
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+ a->ne[0] - p0,
+ a->ne[1] - p1,
+ a->ne[2] - p2,
+ a->ne[3] - p3);
+
+ result->op = GGML_OP_UNPAD;
+ result->src[0] = a;
+
+ return result;
+}
+
// ggml_arange
struct ggml_tensor * ggml_arange(

View File

@ -58,7 +58,7 @@ index c22687e4..c5948e8f 100644
auto * gf = graph_init(); auto * gf = graph_init();
diff --git a/src/llama-context.h b/src/llama-context.h diff --git a/src/llama-context.h b/src/llama-context.h
index c4ab242a..9970dfc6 100644 index c0ceacb1..0264e937 100644
--- a/src/llama-context.h --- a/src/llama-context.h
+++ b/src/llama-context.h +++ b/src/llama-context.h
@@ -5,6 +5,7 @@ @@ -5,6 +5,7 @@
@ -70,10 +70,10 @@ index c4ab242a..9970dfc6 100644
#include "ggml-cpp.h" #include "ggml-cpp.h"
#include "ggml-opt.h" #include "ggml-opt.h"
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a7b0a7eb..1a50c034 100644 index 3dcad65b..60e67b03 100644
--- a/src/llama-kv-cache.cpp --- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp
@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() { @@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() {
} }
bool llama_kv_cache_unified::update(llama_context & lctx) { bool llama_kv_cache_unified::update(llama_context & lctx) {
@ -82,7 +82,7 @@ index a7b0a7eb..1a50c034 100644
auto * sched = lctx.get_sched(); auto * sched = lctx.get_sched();
if (has_shift) { if (has_shift) {
@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { @@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
res->set_inputs(nullptr); res->set_inputs(nullptr);
lctx.graph_compute(gf, false); lctx.graph_compute(gf, false);
@ -91,7 +91,7 @@ index a7b0a7eb..1a50c034 100644
} }
{ {
@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { @@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
if (do_defrag) { if (do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@ -133,7 +133,7 @@ index a7b0a7eb..1a50c034 100644
} }
void llama_kv_cache_unified::defrag_sched(float thold) { void llama_kv_cache_unified::defrag_sched(float thold) {
@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( @@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams, const llama_cparams & cparams,
ggml_context * ctx, ggml_context * ctx,
@ -147,7 +147,7 @@ index a7b0a7eb..1a50c034 100644
#if 0 #if 0
// CPU defrag // CPU defrag
// //
@@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( @@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
} }
#else #else
@ -185,7 +185,7 @@ index a7b0a7eb..1a50c034 100644
ggml_tensor * view_v_src; ggml_tensor * view_v_src;
ggml_tensor * view_v_dst; ggml_tensor * view_v_dst;
@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( @@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) { if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention // NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx, v_l[il], view_v_src = ggml_view_2d(ctx, v_l[il],
@ -225,7 +225,7 @@ index a7b0a7eb..1a50c034 100644
} }
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv); assert(n_used <= n_kv);
@ -244,7 +244,7 @@ index a7b0a7eb..1a50c034 100644
// determine which KV cells to move where // determine which KV cells to move where
// //
@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// //
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
// //
@ -256,7 +256,7 @@ index a7b0a7eb..1a50c034 100644
for (uint32_t i0 = 0; i0 < n_used; ++i0) { for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0]; const auto & cell0 = cells[i0];
@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory? // are we moving a continuous block of memory?
bool cont = false; bool cont = false;
@ -276,7 +276,7 @@ index a7b0a7eb..1a50c034 100644
cont = false; cont = false;
continue; continue;
} }
@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used; head = n_used;
if (!cont) { if (!cont) {
@ -288,7 +288,7 @@ index a7b0a7eb..1a50c034 100644
} }
nf++; nf++;
@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
} }
} }

View File

@ -11,7 +11,7 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-) 1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 82ae1b5b..1487f322 100644 index 405d8e31..4e67d243 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp
@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry { @@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {

View File

@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+) 1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 835e6495..3902894b 100644 index a30e67f2..2462d2b8 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@ @@ -15,6 +15,8 @@
@ -20,7 +20,7 @@ index 835e6495..3902894b 100644
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { @@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);

View File

@ -111,9 +111,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors { for _, projector := range projectors {
weight, graph := projectorMemoryRequirements(projector) weight := projectorMemoryRequirements(projector)
projectorWeights += weight projectorWeights += weight
projectorGraph += graph
// multimodal models require at least 2048 context // multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048) opts.NumCtx = max(opts.NumCtx, 2048)
@ -409,51 +408,21 @@ func (m MemoryEstimate) LogValue() slog.Value {
return slog.GroupValue(attrs...) return slog.GroupValue(attrs...)
} }
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) { func projectorMemoryRequirements(filename string) (weights uint64) {
file, err := os.Open(filename) file, err := os.Open(filename)
if err != nil { if err != nil {
return 0, 0 return 0
} }
defer file.Close() defer file.Close()
ggml, _, err := ggml.Decode(file, 1024) ggml, _, err := ggml.Decode(file, 1024)
if err != nil { if err != nil {
return 0, 0 return 0
} }
for _, layer := range ggml.Tensors().GroupLayers() { for _, layer := range ggml.Tensors().GroupLayers() {
weights += layer.Size() weights += layer.Size()
} }
switch arch := ggml.KV().Architecture(); arch { return weights
case "mllama":
kv := func(n string) uint64 {
if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
return uint64(v)
}
return 0
}
imageSize := kv("image_size")
maxNumTiles := kv("max_num_tiles")
embeddingLength := kv("embedding_length")
headCount := kv("attention.head_count")
numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
numPatches++
}
numPaddedPatches := numPatches + 8 - (numPatches%8)%8
graphSize = 4 * (8 +
imageSize*imageSize*kv("num_channels")*maxNumTiles +
embeddingLength*numPatches*maxNumTiles +
9*embeddingLength*numPaddedPatches*maxNumTiles +
numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
}
return weights, graphSize
} }

View File

@ -679,9 +679,8 @@ ws ::= ([ \t\n] ws)?
const maxBufferSize = 512 * format.KiloByte const maxBufferSize = 512 * format.KiloByte
type ImageData struct { type ImageData struct {
Data []byte `json:"data"` Data []byte `json:"data"`
ID int `json:"id"` ID int `json:"id"`
AspectRatioID int `json:"aspect_ratio_id"`
} }
type CompletionRequest struct { type CompletionRequest struct {

View File

@ -161,7 +161,6 @@ type Tensor interface {
Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor
Pad(ctx Context, shape ...int) Tensor Pad(ctx Context, shape ...int) Tensor
Unpad(ctx Context, shape ...int) Tensor
Stack(ctx Context, dim int, s ...Tensor) Tensor Stack(ctx Context, dim int, s ...Tensor) Tensor

View File

@ -1017,17 +1017,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor {
} }
} }
func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
}
return &Tensor{
b: t.b,
t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
}
}
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
switch len(shape) { switch len(shape) {
case 1: case 1:

View File

@ -489,7 +489,6 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D, GGML_OP_PAD_REFLECT_1D,
GGML_OP_UNPAD,
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@ -1782,15 +1781,6 @@ extern "C" {
int p0, int p0,
int p1); int p1);
// unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
GGML_API struct ggml_tensor * ggml_unpad(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1,
int p2,
int p3);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]

View File

@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg()); register_backend(ggml_backend_cann_reg());
#endif #endif
// #ifdef GGML_USE_BLAS #ifdef GGML_USE_BLAS
// register_backend(ggml_backend_blas_reg()); register_backend(ggml_backend_blas_reg());
// #endif #endif
#ifdef GGML_USE_RPC #ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif

View File

@ -1953,10 +1953,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad_reflect_1d(params, tensor); ggml_compute_forward_pad_reflect_1d(params, tensor);
} break; } break;
case GGML_OP_UNPAD:
{
ggml_compute_forward_unpad(params, tensor);
} break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
{ {
ggml_compute_forward_arange(params, tensor); ggml_compute_forward_arange(params, tensor);
@ -2280,7 +2276,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_UNPAD:
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:

View File

@ -6690,61 +6690,6 @@ void ggml_compute_forward_pad_reflect_1d(
} }
} }
// ggml_compute_forward_unpad
static void ggml_compute_forward_unpad_f32(
const struct ggml_compute_params *params,
struct ggml_tensor *dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT( dst->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
float * dst_ptr = (float *) dst->data;
// TODO: optimize
for (int64_t i2 = 0; i2 < ne2; ++i2) {
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
for (int64_t i0 = 0; i0 < ne0; ++i0) {
for (int64_t i3 = 0; i3 < ne3; ++i3) {
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
dst_ptr[dst_idx] = *src_ptr;
}
}
}
}
}
}
void ggml_compute_forward_unpad(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_unpad_f32(params, dst);
} break;
default:
{
GGML_ABORT("fatal error");
}
}
}
// ggml_compute_forward_arange // ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32( static void ggml_compute_forward_arange_f32(

View File

@ -72,7 +72,6 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View File

@ -2238,9 +2238,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD: case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst); ggml_cuda_op_pad(ctx, dst);
break; break;
case GGML_OP_UNPAD:
ggml_cuda_op_unpad(ctx, dst);
break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst); ggml_cuda_op_arange(ctx, dst);
break; break;
@ -3215,7 +3212,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_UNPAD:
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:

View File

@ -47,49 +47,3 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
} }
static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
// blockIdx.z: idx of ne2*ne3, aka ne02*ne03
// blockIdx.y: idx of ne1
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
// operation
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
int offset_src =
nidx +
blockIdx.y * ne00 +
blockIdx.z * ne00 * ne01;
dst[offset_dst] = x[offset_src];
}
}
static void unpad_f32_cuda(const float * x, float * dst,
const int ne00, const int ne01, const int ne02, const int ne03,
const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2*ne3);
unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
}
void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
unpad_f32_cuda(src0_d, dst_d,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
}

View File

@ -3,4 +3,3 @@
#define CUDA_PAD_BLOCK_SIZE 256 #define CUDA_PAD_BLOCK_SIZE 256
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -5599,51 +5599,6 @@ kernel void kernel_pad_reflect_1d_f32(
} }
} }
kernel void kernel_unpad_f32(
device const char * src0,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i3 = tgpig.z;
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;
const int64_t i03 = i3;
const int64_t i02 = i2;
const int64_t i01 = i1;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
if (i0 < ne00) {
dst_ptr[i0] = src0_ptr[i0];
}
}
return;
}
}
kernel void kernel_arange_f32( kernel void kernel_arange_f32(
device char * dst, device char * dst,
constant ggml_metal_kargs_arange & args, constant ggml_metal_kargs_arange & args,

View File

@ -347,7 +347,6 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@ -1295,7 +1294,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@ -1657,7 +1655,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_UNPAD:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
@ -4187,36 +4184,6 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0); const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_UNPAD:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:

View File

@ -3121,51 +3121,6 @@ kernel void kernel_pad_reflect_1d_f32(
} }
} }
kernel void kernel_unpad_f32(
device const char * src0,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i3 = tgpig.z;
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;
const int64_t i03 = i3;
const int64_t i02 = i2;
const int64_t i01 = i1;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
if (i0 < ne00) {
dst_ptr[i0] = src0_ptr[i0];
}
}
return;
}
}
kernel void kernel_arange_f32( kernel void kernel_arange_f32(
device char * dst, device char * dst,
constant ggml_metal_kargs_arange & args, constant ggml_metal_kargs_arange & args,

View File

@ -923,7 +923,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE", "UPSCALE",
"PAD", "PAD",
"PAD_REFLECT_1D", "PAD_REFLECT_1D",
"UNPAD",
"ARANGE", "ARANGE",
"TIMESTEP_EMBEDDING", "TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
@ -954,7 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW", "OPT_STEP_ADAMW",
}; };
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@ -1019,7 +1018,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)", "upscale(x)",
"pad(x)", "pad(x)",
"pad_reflect_1d(x)", "pad_reflect_1d(x)",
"unpad(x)",
"arange(start, stop, step)", "arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)", "timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
@ -1050,7 +1048,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)", "adamw(x)",
}; };
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4276,25 +4274,6 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result; return result;
} }
// ggml_unpad
struct ggml_tensor * ggml_unpad(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0, int p1, int p2, int p3) {
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] - p0,
a->ne[1] - p1,
a->ne[2] - p2,
a->ne[3] - p3);
result->op = GGML_OP_UNPAD;
result->src[0] = a;
return result;
}
// ggml_arange // ggml_arange
struct ggml_tensor * ggml_arange( struct ggml_tensor * ggml_arange(

View File

@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
} }
hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps) hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps)
hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0) hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0)
hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions) hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions)
return hiddenStates return hiddenStates
} }
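Throughout this commit the dedicated UNPAD operator is dropped from the CPU, CUDA, and Metal backends, and call sites such as the one above express the same trim as a pad with negative sizes. A minimal sketch of the shape rule this relies on, assuming the engine's Pad removes trailing elements when a size is negative; the padShape helper and the example dimensions are illustrative only:

```go
package main

import "fmt"

// padShape computes the output shape of a pad where a negative size trims
// trailing elements from that dimension, which is the behavior the removed
// UNPAD operator used to provide.
func padShape(dims, pad [4]int) [4]int {
	var out [4]int
	for i := range dims {
		out[i] = dims[i] + pad[i] // pad[i] < 0 shrinks the dimension
	}
	return out
}

func main() {
	// Before: hiddenStates.Unpad(ctx, 0, 1, 0, 0)
	// After:  hiddenStates.Pad(ctx, 0, -1, 0, 0)
	fmt.Println(padShape([4]int{8, 577, 4, 1}, [4]int{0, -1, 0, 0})) // [8 576 4 1]
}
```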

View File

@ -1,201 +0,0 @@
package mllama
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/model/imageproc"
)
func getSupportedAspectRatios(maxTiles int) []image.Point {
ratios := []image.Point{}
for w := range maxTiles {
for h := range maxTiles {
if (w+1)*(h+1) <= maxTiles {
ratios = append(ratios, image.Point{w + 1, h + 1})
}
}
}
return ratios
}
func clip(a, a_min, a_max int) int {
if a < a_min {
return a_min
} else if a > a_max {
return a_max
}
return a
}
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
possibleCanvasSizes := []image.Point{}
for _, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
}
scales := []float64{}
for _, pcs := range possibleCanvasSizes {
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
scaleWidth := float64(pcs.X) / float64(imageSize.X)
if scaleWidth > scaleHeight {
scales = append(scales, scaleHeight)
} else {
scales = append(scales, scaleWidth)
}
}
var minUpscale float64
var maxDownscale float64
var upscale bool
for _, s := range scales {
if s > 1.0 {
upscale = true
if minUpscale == 0 {
minUpscale = s
} else {
minUpscale = math.Min(minUpscale, s)
}
} else {
maxDownscale = math.Max(maxDownscale, s)
}
}
selectedScale := maxDownscale
if upscale {
selectedScale = minUpscale
}
var selectedCanvas image.Point
for n, pcs := range possibleCanvasSizes {
if scales[n] == selectedScale {
// choose the smallest possible canvas
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
selectedCanvas = pcs
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
selectedCanvas = pcs
}
}
}
return selectedCanvas
}
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
scaleWidth := float64(targetWidth) / float64(imageSize.X)
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
var w, h int
if scaleWidth < scaleHeight {
w = targetWidth
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
} else {
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
h = targetHeight
}
return image.Point{w, h}
}
func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
if format == "png" {
img = imageproc.Composite(img)
}
b := img.Bounds()
tileSize := outputSize.Y
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
}
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
paddedSize := image.Point{
X: outputSize.X * aspectRatio.X,
Y: outputSize.Y * aspectRatio.Y,
}
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
return dst
}
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds()
width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y
tileHeight := height / numTilesSize.Y
tileWidth := width / numTilesSize.X
images := []image.Image{}
for h := range numTilesSize.Y {
for w := range numTilesSize.X {
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
images = append(images, img.(interface {
SubImage(image.Rectangle) image.Image
}).SubImage(rect))
}
}
return images
}
func packImages(img image.Image, aspectRatio image.Point) []float32 {
subImages := splitToTiles(img, aspectRatio)
var pixelVals []float32
rescale := true
channelFirst := true
for _, subImg := range subImages {
vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
pixelVals = append(pixelVals, vals...)
}
return pixelVals
}
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
outputSize := image.Point{560, 560}
maxTiles := 4
img, format, err := image.Decode(imageData)
if err != nil {
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
}
newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
newImage = padImage(newImage, outputSize, aspectRatio)
data := packImages(newImage, aspectRatio)
aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1
opts := map[string]any{
"aspectRatioIndex": aspectRatioIndex,
}
return data, opts, nil
}

View File

@ -1,420 +0,0 @@
package mllama
import (
"bytes"
"image"
"image/png"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestAspectRatios(t *testing.T) {
type aspectCase struct {
MaxTiles int
Expected []image.Point
}
cases := []aspectCase{
{
MaxTiles: 1,
Expected: []image.Point{{1, 1}},
},
{
MaxTiles: 2,
Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
},
{
MaxTiles: 3,
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
},
{
MaxTiles: 4,
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
},
}
for _, c := range cases {
actual := getSupportedAspectRatios(c.MaxTiles)
if diff := cmp.Diff(actual, c.Expected); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestGetImageSizeFitToCanvas(t *testing.T) {
type imageSizeCase struct {
ImageRect image.Point
CanvasRect image.Point
TileSize int
Expected image.Point
}
cases := []imageSizeCase{
{
ImageRect: image.Point{400, 400},
CanvasRect: image.Point{640, 480},
TileSize: 200,
Expected: image.Point{400, 400},
},
{
ImageRect: image.Point{1024, 768},
CanvasRect: image.Point{640, 480},
TileSize: 200,
Expected: image.Point{640, 480},
},
{
ImageRect: image.Point{500, 500},
CanvasRect: image.Point{1000, 1000},
TileSize: 750,
Expected: image.Point{750, 750},
},
{
ImageRect: image.Point{500, 1000},
CanvasRect: image.Point{2000, 2000},
TileSize: 2000,
Expected: image.Point{1000, 2000},
},
{
ImageRect: image.Point{4000, 3000},
CanvasRect: image.Point{2000, 1000},
TileSize: 1000,
Expected: image.Point{1333, 1000},
},
{
ImageRect: image.Point{667, 1000},
CanvasRect: image.Point{1000, 1000},
TileSize: 560,
Expected: image.Point{667, 1000},
},
}
for _, c := range cases {
actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)
if actual != c.Expected {
t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
}
}
}
func TestGetOptimalTiledCanvas(t *testing.T) {
type tiledCanvasSizeCase struct {
ImageSize image.Point
MaxImageTiles int
TileSize int
Expected image.Point
}
cases := []tiledCanvasSizeCase{
{
ImageSize: image.Point{1024, 768},
MaxImageTiles: 4,
TileSize: 1000,
Expected: image.Point{2000, 1000},
},
{
ImageSize: image.Point{1024, 768},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
{
ImageSize: image.Point{800, 600},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
{
ImageSize: image.Point{640, 480},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 560},
},
{
ImageSize: image.Point{320, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 560},
},
{
ImageSize: image.Point{1320, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1680, 560},
},
{
ImageSize: image.Point{2000, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{2240, 560},
},
{
ImageSize: image.Point{10000, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{2240, 560},
},
{
ImageSize: image.Point{480, 640},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 1120},
},
{
ImageSize: image.Point{200, 320},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 560},
},
{
ImageSize: image.Point{200, 1320},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 1680},
},
{
ImageSize: image.Point{200, 2000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 2240},
},
{
ImageSize: image.Point{200, 10000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 2240},
},
{
ImageSize: image.Point{10000, 10000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
}
for _, c := range cases {
actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)
if actual != c.Expected {
t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
}
}
}
func TestSplitToTiles(t *testing.T) {
type splitCase struct {
TestImage image.Image
NumTilesSize image.Point
Expected []image.Image
}
cases := []splitCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
NumTilesSize: image.Point{1, 1},
Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)),
NumTilesSize: image.Point{2, 1},
Expected: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
NumTilesSize: image.Point{2, 2},
Expected: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
},
},
}
for _, c := range cases {
actual := splitToTiles(c.TestImage, c.NumTilesSize)
if len(actual) != len(c.Expected) {
t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
}
for i := range actual {
if actual[i].Bounds() != c.Expected[i].Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
}
}
}
}
func TestResize(t *testing.T) {
type resizeCase struct {
TestImage image.Image
OutputSize image.Point
MaxImageTiles int
ExpectedImage image.Image
ExpectedAspectRatio image.Point
}
cases := []resizeCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
OutputSize: image.Point{100, 100},
MaxImageTiles: 1,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
OutputSize: image.Point{100, 100},
MaxImageTiles: 2,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
ExpectedAspectRatio: image.Point{2, 2},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
ExpectedAspectRatio: image.Point{2, 2},
},
}
for _, c := range cases {
actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
if actualImage.Bounds() != c.ExpectedImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
}
if actualAspectRatio != c.ExpectedAspectRatio {
t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
}
}
}
func TestPad(t *testing.T) {
type padCase struct {
TestImage image.Image
OutputSize image.Point
AspectRatio image.Point
Expected image.Image
}
cases := []padCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)),
OutputSize: image.Point{560, 560},
AspectRatio: image.Point{2, 2},
Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
},
}
for _, c := range cases {
actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)
if actual.Bounds() != c.Expected.Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
}
}
}
func TestPackImages(t *testing.T) {
type packCase struct {
TestImage image.Image
AspectRatio image.Point
ExpectedVals int
}
cases := []packCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
AspectRatio: image.Point{2, 2},
ExpectedVals: 2 * 2 * 3 * 560 * 560,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
AspectRatio: image.Point{1, 1},
ExpectedVals: 1 * 1 * 3 * 560 * 560,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)),
AspectRatio: image.Point{1, 2},
ExpectedVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, c := range cases {
actualVals := packImages(c.TestImage, c.AspectRatio)
if len(actualVals) != c.ExpectedVals {
t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
}
}
}
func TestPreprocess(t *testing.T) {
type preprocessCase struct {
TestImage image.Image
ExpectedVals int
ExpectedAspectRatioID int
}
cases := []preprocessCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
ExpectedVals: 0,
ExpectedAspectRatioID: 1,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
ExpectedVals: 0,
ExpectedAspectRatioID: 6,
},
}
for _, c := range cases {
var buf bytes.Buffer
err := png.Encode(&buf, c.TestImage)
if err != nil {
t.Fatal(err)
}
imgData, opts, err := Preprocess(&buf)
if err != nil {
t.Fatalf("error processing: %q", err)
}
if len(imgData) == 0 {
t.Errorf("no image data returned")
}
ar, ok := opts["aspectRatioIndex"]
if !ok {
t.Fatalf("no aspect ratio found")
}
aspectRatioID := ar.(int)
if aspectRatioID != c.ExpectedAspectRatioID {
t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
}
}
}

View File

@ -2,11 +2,7 @@ package mllama
import ( import (
"bytes" "bytes"
"encoding/binary"
"fmt"
"hash/fnv"
"image" "image"
"slices"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
@ -34,10 +30,6 @@ const (
) )
func New(c fs.Config) (model.Model, error) { func New(c fs.Config) (model.Model, error) {
// Verify unified config
if c.Uint("vision.block_count") == 0 {
return nil, fmt.Errorf("non-unified vision model not supported")
}
m := Model{ m := Model{
BytePairEncoding: model.NewBytePairEncoding( BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@ -76,22 +68,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err return nil, err
} }
f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image) f32s, ratio, err := m.ImageProcessor.ProcessImage(image)
if err != nil { if err != nil {
return nil, err return nil, err
} }
pixelValues, err := ctx.Input().FromFloatSlice(f32s, pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels,
m.ImageProcessor.maxNumTiles,
)
if err != nil { if err != nil {
return nil, err return nil, err
} }
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1) pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -102,41 +91,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
} }
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var images []input.Input
fnvHash := fnv.New64a()
for i := range inputs { for i := range inputs {
if inputs[i].Multimodal == nil { if inputs[i].Multimodal != nil {
if len(images) > 0 { inputs[i].Token = 128256 // <|image|>
inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)}
inputs[i].MultimodalHash = images[0].MultimodalHash
for j := 1; j < len(images); j++ {
inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor))
fnvHash.Reset()
binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash)
binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash)
inputs[i].MultimodalHash = fnvHash.Sum64()
}
images = nil
}
} else {
images = append(images, inputs[i])
inputs[i].Token = -1
} }
} }
inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 })
return inputs, nil return inputs, nil
} }
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var crossAttentionStates ml.Tensor var crossAttentionStates ml.Tensor
if len(batch.Multimodal) > 0 { if len(batch.Multimodal) > 0 {
images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor) crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
if len(images) > 0 {
crossAttentionStates = images[len(images)-1]
}
} }
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
@ -150,7 +117,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
} }
// TODO: attention mask, cross attention mask // TODO: attention mask, cross attention mask
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
} }
func init() { func init() {
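EncodeMultimodal above packs only the tiles an image actually produces and then pads the tile dimension out to the model's fixed maximum, while PostTokenize rewrites image inputs to the <|image|> token. A small sketch of the tile arithmetic behind that Pad call; the supportedAspectRatio type mirrors the one defined in imageprocessing.go, and the 2x1 ratio with rank 5 matches the table supportedAspectRatios produces for maxNumTiles 4:

```go
package main

import "fmt"

// Mirrors supportedAspectRatio from imageprocessing.go so the sketch is standalone.
type supportedAspectRatio struct{ rank, width, height int }

func (a supportedAspectRatio) numTiles() int { return a.width * a.height }

func main() {
	maxNumTiles := 4
	ratio := supportedAspectRatio{rank: 5, width: 2, height: 1} // a wide image on a 2x1 canvas

	// pixelValues is created with ratio.numTiles() tiles...
	fmt.Println("tiles packed: ", ratio.numTiles()) // 2

	// ...then padded with empty tiles up to the fixed maximum, i.e.
	// pixelValues = pixelValues.Pad(ctx, 0, 0, 0, maxNumTiles-ratio.numTiles())
	fmt.Println("padding tiles:", maxNumTiles-ratio.numTiles()) // 2
}
```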

View File

@ -18,7 +18,7 @@ type TextSelfAttention struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
} }
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1) batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads headDim := opts.hiddenSize / opts.numHeads
ropeType := uint32(0) ropeType := uint32(0)
@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct {
MLP *TextMLP MLP *TextMLP
} }
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts) hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
// In the final layer (outputs != nil), optimize by pruning to just the token positions // In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for. // we need logits for.
@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct {
MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"` MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
} }
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
} }
type TextDecoderLayer interface { type TextDecoderLayer interface {
Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
} }
type TextDecoder struct { type TextDecoder struct {
Layers []TextDecoderLayer Layers []TextDecoderLayer
} }
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
for i, layer := range d.Layers { for i, layer := range d.Layers {
layerType := selfAttentionLayer layerType := selfAttentionLayer
if slices.Contains(opts.crossAttentionLayers, int32(i)) { if slices.Contains(opts.crossAttentionLayers, int32(i)) {
@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
lastLayerOutputs = outputs lastLayerOutputs = outputs
} }
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts) hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts)
} }
} }
@ -214,9 +214,9 @@ type TextModel struct {
*TextModelOptions *TextModelOptions
} }
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs) hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
return m.Output.Forward(ctx, hiddenState) return m.Output.Forward(ctx, hiddenState)
} }
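With the self-attention mask argument gone, TextDecoderLayer narrows to the five tensor parameters shown above, and both layer implementations still satisfy it. A compile-time assertion along these lines (a sketch, not part of the commit) makes that explicit:

```go
package mllama

// Sketch: both decoder layer types must implement the narrowed
// TextDecoderLayer interface now that the mask parameter is dropped.
var (
	_ TextDecoderLayer = (*TextSelfAttentionDecoderLayer)(nil)
	_ TextDecoderLayer = (*TextCrossAttentionDecoderLayer)(nil)
)
```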

View File

@ -15,7 +15,7 @@ type VisionSelfAttention struct {
Query *nn.Linear `gguf:"attn_q"` Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"` Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"` Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_out"` Output *nn.Linear `gguf:"attn_output"`
Gate ml.Tensor `gguf:"attn_gate"` Gate ml.Tensor `gguf:"attn_gate"`
} }
@ -45,36 +45,29 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
hiddenState = sa.Output.Forward(ctx, attention) hiddenState = sa.Output.Forward(ctx, attention)
if sa.Gate != nil {
hiddenState = hiddenState.Mul(ctx, sa.Gate)
}
return hiddenState return hiddenState
} }
type VisionMLP struct { type VisionMLP struct {
Down *nn.Linear `gguf:"ffn_down"`
Up *nn.Linear `gguf:"ffn_up"` Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
Gate ml.Tensor `gguf:"ffn_gate"`
} }
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx) hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
hiddenState = mlp.Up.Forward(ctx, hiddenState) hiddenState = mlp.Down.Forward(ctx, hiddenState)
if mlp.Gate != nil {
hiddenState = hiddenState.Mul(ctx, mlp.Gate)
}
return hiddenState return hiddenState
} }
type VisionEncoderLayer struct { type VisionEncoderLayer struct {
AttentionNorm *nn.LayerNorm `gguf:"ln1"` AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
SelfAttention *VisionSelfAttention SelfAttention *VisionSelfAttention
AttentionGate ml.Tensor `gguf:"attn_gate"`
MLPNorm *nn.LayerNorm `gguf:"ln2"` MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
MLP *VisionMLP MLP *VisionMLP
MLPGate ml.Tensor `gguf:"ffn_gate"`
} }
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@ -83,13 +76,22 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
// self attention // self attention
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts) hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
if e.AttentionGate != nil {
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
}
hiddenState = hiddenState.Add(ctx, residual) hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState residual = hiddenState
// feed forward // feed forward
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.MLP.Forward(ctx, hiddenState, opts) hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
return hiddenState.Add(ctx, residual) hiddenState = hiddenState.Add(ctx, residual)
if e.MLPGate != nil {
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
}
return hiddenState
} }
type VisionEncoder struct { type VisionEncoder struct {
@ -114,9 +116,9 @@ type PrecomputedAspectRatioEmbedding struct {
Gate ml.Tensor `gguf:"gate"` Gate ml.Tensor `gguf:"gate"`
} }
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor { func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
embeddings := e.Embedding.Forward(ctx, aspectRatioIDs) embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles) embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
if e.Gate != nil { if e.Gate != nil {
embeddings = embeddings.Mul(ctx, e.Gate) embeddings = embeddings.Mul(ctx, e.Gate)
} }
@ -132,7 +134,7 @@ type PrecomputedPositionEmbedding struct {
TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"` TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"`
} }
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor { func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs) positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
if e.PositionEmbeddingGate != nil { if e.PositionEmbeddingGate != nil {
positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate) positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
@ -141,7 +143,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
hiddenState = hiddenState.Add(ctx, positionEmbedding) hiddenState = hiddenState.Add(ctx, positionEmbedding)
tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs) tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles) tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
if e.TilePositionEmbeddingGate != nil { if e.TilePositionEmbeddingGate != nil {
tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate) tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
} }
@ -150,9 +152,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
} }
type VisionModelOptions struct { type VisionModelOptions struct {
hiddenSize, numHeads, numTiles int hiddenSize, numHeads int
imageSize, patchSize int imageSize, patchSize int
eps float32 eps float32
intermediateLayersIndices []int32 intermediateLayersIndices []int32
} }
@ -181,14 +183,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
numPositions++ numPositions++
} }
numTiles := pixelValues.Dim(3)
hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1) hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles) hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1) hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)
hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions) hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps) hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)
numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8 numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
@ -199,18 +203,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps) hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize) hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions) hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)
hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...) hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0) hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0) hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
return hiddenState.Concat(ctx, hiddenStates, 0) return hiddenState.Concat(ctx, hiddenStates, 0)
} }
@ -222,7 +226,6 @@ func newVisionModel(c fs.Config) *VisionModel {
VisionModelOptions: &VisionModelOptions{ VisionModelOptions: &VisionModelOptions{
hiddenSize: int(c.Uint("vision.embedding_length")), hiddenSize: int(c.Uint("vision.embedding_length")),
numHeads: int(c.Uint("vision.attention.head_count")), numHeads: int(c.Uint("vision.attention.head_count")),
numTiles: int(c.Uint("vision.max_num_tiles")),
imageSize: int(c.Uint("vision.image_size")), imageSize: int(c.Uint("vision.image_size")),
patchSize: int(c.Uint("vision.patch_size")), patchSize: int(c.Uint("vision.patch_size")),

View File

@ -2,17 +2,31 @@ package mllama
import ( import (
"image" "image"
"image/color"
"math" "math"
"slices" "slices"
"golang.org/x/image/draw" "golang.org/x/image/draw"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
) )
type supportedAspectRatio struct {
rank, width, height int
}
func (a supportedAspectRatio) Point() image.Point {
return image.Point{a.width, a.height}
}
func (a supportedAspectRatio) numTiles() int {
return a.width * a.height
}
type ImageProcessor struct { type ImageProcessor struct {
imageSize, numChannels, maxNumTiles int imageSize, numChannels, maxNumTiles int
mean, std [3]float32
} }
func newImageProcessor(c fs.Config) ImageProcessor { func newImageProcessor(c fs.Config) ImageProcessor {
@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor {
imageSize: int(c.Uint("vision.image_size")), imageSize: int(c.Uint("vision.image_size")),
numChannels: int(c.Uint("vision.num_channels")), numChannels: int(c.Uint("vision.num_channels")),
maxNumTiles: int(c.Uint("vision.max_num_tiles")), maxNumTiles: int(c.Uint("vision.max_num_tiles")),
mean: imageproc.ClipDefaultMean,
std: imageproc.ClipDefaultSTD,
} }
} }
func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point { func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
ratios := []image.Point{} for w := 1; w <= p.maxNumTiles; w++ {
for h := 1; h <= p.maxNumTiles/w; h++ {
for w := range maxTiles { ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
for h := range maxTiles {
if (w+1)*(h+1) <= maxTiles {
ratios = append(ratios, image.Point{w + 1, h + 1})
}
} }
} }
return ratios return ratios
} }
func (p *ImageProcessor) clip(a, a_min, a_max int) int { func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
if a < a_min { tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
return a_min th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
} else if a > a_max {
return a_max
}
return a r := math.Min(
} float64(tw)/float64(imageSize.X),
float64(th)/float64(imageSize.Y),
)
func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X) h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
scaleWidth := float64(targetWidth) / float64(imageSize.X)
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
var w, h int
if scaleWidth < scaleHeight {
w = targetWidth
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
} else {
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
h = targetHeight
}
return image.Point{w, h} return image.Point{w, h}
} }
func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
possibleTileArrangements := p.supportedAspectRatios(maxImageTiles) possibleTileArrangements := p.supportedAspectRatios()
possibleCanvasSizes := []image.Point{} possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
for _, pta := range possibleTileArrangements { for i, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
} }
scales := []float64{} scales := make([]float64, len(possibleCanvasSizes))
for i, pcs := range possibleCanvasSizes {
for _, pcs := range possibleCanvasSizes { scales[i] = min(
scaleHeight := float64(pcs.Y) / float64(imageSize.Y) float64(pcs.Y)/float64(imageSize.Y),
scaleWidth := float64(pcs.X) / float64(imageSize.X) float64(pcs.X)/float64(imageSize.X),
)
if scaleWidth > scaleHeight {
scales = append(scales, scaleHeight)
} else {
scales = append(scales, scaleWidth)
}
} }
var minUpscale float64 var minUpscale float64
@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles
return selectedCanvas return selectedCanvas
} }
func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds() b := img.Bounds()
width := b.Max.X - b.Min.X width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y height := b.Max.Y - b.Min.Y
tileHeight := height / numTilesSize.Y tileHeight := height / numTilesSize.Y
tileWidth := width / numTilesSize.X tileWidth := width / numTilesSize.X
images := []image.Image{} images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)
for h := range numTilesSize.Y { for h := range numTilesSize.Y {
for w := range numTilesSize.X { for w := range numTilesSize.X {
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
images = append(images, img.(interface { if subImg, ok := img.(interface {
SubImage(image.Rectangle) image.Image SubImage(image.Rectangle) image.Image
}).SubImage(rect)) }); ok {
images = append(images, subImg.SubImage(rect))
} else {
// Handle the case where img does not implement SubImage
// This is a fallback and may not be efficient
newImg := image.NewRGBA(rect)
draw.Draw(newImg, rect, img, rect.Min, draw.Src)
images = append(images, newImg)
}
} }
} }
return images return images
} }
// remove the "alpha" channel by drawing over a prefilled image func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
//
//nolint:unused
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
dst := image.NewRGBA(img.Bounds())
white := color.RGBA{255, 255, 255, 255}
draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
return dst
}
func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
b := img.Bounds() b := img.Bounds()
tileSize := outputSize.Y
canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize) canvasSize := p.optimalTiledCanvas(b.Max)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
newSize := p.fitToCanvas(b.Max, canvasSize, tileSize) newSize := p.fitToCanvas(b.Max, canvasSize)
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag
return dst, aspectRatio return dst, aspectRatio
} }
func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image { func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
paddedSize := image.Point{ paddedSize := image.Point{
X: outputSize.X * aspectRatio.X, X: p.imageSize * aspectRatio.X,
Y: outputSize.Y * aspectRatio.Y, Y: p.imageSize * aspectRatio.Y,
} }
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin
return dst return dst
} }
func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
subImages := p.splitToTiles(img, aspectRatio) subImages := p.splitToTiles(img, aspectRatio)
var pixelVals []float32 var pixelVals []float32
@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
gVal := float32(g>>8) / 255.0 gVal := float32(g>>8) / 255.0
bVal := float32(b>>8) / 255.0 bVal := float32(b>>8) / 255.0
rVal = (rVal - mean[0]) / std[0] rVal = (rVal - p.mean[0]) / p.std[0]
gVal = (gVal - mean[1]) / std[1] gVal = (gVal - p.mean[1]) / p.std[1]
bVal = (bVal - mean[2]) / std[2] bVal = (bVal - p.mean[2]) / p.std[2]
rVals = append(rVals, rVal) rVals = append(rVals, rVal)
gVals = append(gVals, gVal) gVals = append(gVals, gVal)
@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
return pixelVals return pixelVals
} }
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) { func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
outputSize := image.Point{p.imageSize, p.imageSize} newImage, newImageRatio := p.resize(img)
newImage = p.pad(newImage, newImageRatio)
pixelValues := p.pack(newImage, newImageRatio)
// clip values supportedAspectRatios := p.supportedAspectRatios()
mean := [3]float32{0.48145466, 0.4578275, 0.40821073} aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
std := [3]float32{0.26862954, 0.26130258, 0.27577711} return i.width == newImageRatio.X && i.height == newImageRatio.Y
})
newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles) return pixelValues, supportedAspectRatios[aspectRatioID], nil
newImage = p.pad(newImage, outputSize, aspectRatio)
data := p.pack(newImage, aspectRatio, mean, std)
aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
return data, aspectRatioIndex, nil
} }
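ProcessImage now returns the matched supportedAspectRatio alongside the packed pixel values, so callers get both the tile count and the rank that EncodeMultimodal feeds to the model as the aspect-ratio ID. A rough usage sketch in test form, assuming the processor is built with the CLIP mean/std defaults the way newImageProcessor does; this test is illustrative and not part of the commit:

```go
package mllama

import (
	"image"
	"testing"

	"github.com/ollama/ollama/model/imageproc"
)

func TestProcessImageSketch(t *testing.T) {
	p := ImageProcessor{
		imageSize:   560,
		numChannels: 3,
		maxNumTiles: 4,
		mean:        imageproc.ClipDefaultMean,
		std:         imageproc.ClipDefaultSTD,
	}

	f32s, ratio, err := p.ProcessImage(image.NewRGBA(image.Rect(0, 0, 1024, 768)))
	if err != nil {
		t.Fatal(err)
	}

	// Each tile contributes imageSize*imageSize*numChannels floats.
	if want := ratio.numTiles() * 3 * 560 * 560; len(f32s) != want {
		t.Errorf("got %d pixel values, want %d", len(f32s), want)
	}

	// A 1024x768 image with these settings maps to a 2x2 tile canvas,
	// rank 6 in supportedAspectRatios (the value used as the aspect ratio ID).
	if ratio.width != 2 || ratio.height != 2 {
		t.Errorf("got %dx%d tiles, want 2x2", ratio.width, ratio.height)
	}
}
```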

View File

@ -0,0 +1,387 @@
package mllama
import (
"image"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestSupportedAspectRatios(t *testing.T) {
cases := []struct {
p ImageProcessor
want []supportedAspectRatio
}{
{
p: ImageProcessor{maxNumTiles: 1},
want: []supportedAspectRatio{
{1, 1, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 2},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 2, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 3},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 2, 1},
{5, 3, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 4},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 1, 4},
{5, 2, 1},
{6, 2, 2},
{7, 3, 1},
{8, 4, 1},
},
},
}
for _, tt := range cases {
actual := tt.p.supportedAspectRatios()
if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestFitToCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
canvas image.Point
expect image.Point
}{
{
p: ImageProcessor{imageSize: 200},
image: image.Point{400, 400},
canvas: image.Point{640, 480},
expect: image.Point{400, 400},
},
{
p: ImageProcessor{imageSize: 200},
image: image.Point{1024, 768},
canvas: image.Point{640, 480},
expect: image.Point{640, 480},
},
{
p: ImageProcessor{imageSize: 750},
image: image.Point{500, 500},
canvas: image.Point{1000, 1000},
expect: image.Point{750, 750},
},
{
p: ImageProcessor{imageSize: 2000},
image: image.Point{500, 1000},
canvas: image.Point{2000, 2000},
expect: image.Point{1000, 2000},
},
{
p: ImageProcessor{imageSize: 1000},
image: image.Point{4000, 3000},
canvas: image.Point{2000, 1000},
expect: image.Point{1333, 1000},
},
{
p: ImageProcessor{imageSize: 560},
image: image.Point{667, 1000},
canvas: image.Point{1000, 1000},
expect: image.Point{667, 1000},
},
}
for _, tt := range cases {
actual := tt.p.fitToCanvas(tt.image, tt.canvas)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestOptimalTiledCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
expect image.Point
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 1000},
image: image.Point{1024, 768},
expect: image.Point{2000, 1000},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1024, 768},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{800, 600},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{640, 480},
expect: image.Point{1120, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{320, 200},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1320, 200},
expect: image.Point{1680, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{2000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{480, 640},
expect: image.Point{560, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 320},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 1320},
expect: image.Point{560, 1680},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 2000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 10000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 10000},
expect: image.Point{1120, 1120},
},
}
for _, tt := range cases {
actual := tt.p.optimalTiledCanvas(tt.image)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestSplitToTiles(t *testing.T) {
cases := []struct {
imageMax image.Point
numTiles image.Point
expect []image.Image
}{
{
imageMax: image.Point{1024, 768},
numTiles: image.Point{1, 1},
expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
},
{
imageMax: image.Point{1000, 500},
numTiles: image.Point{2, 1},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
},
},
{
imageMax: image.Point{1000, 1000},
numTiles: image.Point{2, 2},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
},
},
}
var p ImageProcessor
for _, tt := range cases {
actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)
if len(actual) != len(tt.expect) {
t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
continue
}
for i := range actual {
if actual[i].Bounds() != tt.expect[i].Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
}
}
}
}
func TestResize(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
expectImage image.Image
expectAspectRatio image.Point
}{
{
p: ImageProcessor{maxNumTiles: 1, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 2, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{10, 10},
expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{2560, 1920},
expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
expectAspectRatio: image.Point{2, 2},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1024, 768},
expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
expectAspectRatio: image.Point{2, 2},
},
}
for _, tt := range cases {
actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})
if actualImage.Bounds() != tt.expectImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
}
if actualAspectRatio != tt.expectAspectRatio {
t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
}
}
}
func TestPad(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
aspectRatio image.Point
expect image.Image
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1000, 667},
aspectRatio: image.Point{2, 2},
expect: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
},
}
for _, tt := range cases {
actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)
if actual.Bounds() != tt.expect.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
}
}
}
func TestPackImages(t *testing.T) {
cases := []struct {
imageMax image.Point
aspectRatio image.Point
expectVals int
}{
{
imageMax: image.Point{1120, 1120},
aspectRatio: image.Point{2, 2},
expectVals: 2 * 2 * 3 * 560 * 560,
},
{
imageMax: image.Point{560, 560},
aspectRatio: image.Point{1, 1},
expectVals: 1 * 1 * 3 * 560 * 560,
},
{
imageMax: image.Point{1120, 560},
aspectRatio: image.Point{1, 2},
expectVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, tt := range cases {
var p ImageProcessor
actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
if len(actualVals) != tt.expectVals {
t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
}
}
}
func TestPreprocess(t *testing.T) {
cases := []struct {
imageMax image.Point
expectAspectRatioID int
}{
{
imageMax: image.Point{10, 10},
expectAspectRatioID: 1,
},
{
imageMax: image.Point{1024, 768},
expectAspectRatioID: 6,
},
}
p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
for _, tt := range cases {
img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
if err != nil {
t.Fatalf("error processing: %q", err)
}
if len(img) == 0 {
t.Errorf("no image data returned")
}
if aspectRatio.rank != tt.expectAspectRatioID {
t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID)
}
}
}
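The cases in TestSupportedAspectRatios pin down the enumeration order: for a given maxNumTiles, every w×h tile grid with w*h <= maxNumTiles appears, ordered by width then height, and its 1-based position becomes the rank used as the aspect-ratio ID. A sketch consistent with that table, using the field names assumed from the tests above (the actual implementation may be written differently):

func (p ImageProcessor) enumerateAspectRatios() []supportedAspectRatio {
	var ratios []supportedAspectRatio
	for w := 1; w <= p.maxNumTiles; w++ {
		for h := 1; h <= p.maxNumTiles; h++ {
			if w*h <= p.maxNumTiles {
				// rank is simply the position in this ordering, starting at 1
				ratios = append(ratios, supportedAspectRatio{rank: len(ratios) + 1, width: w, height: h})
			}
		}
	}
	return ratios
}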

View File

@ -5,7 +5,6 @@ import (
"fmt" "fmt"
"hash/maphash" "hash/maphash"
"log/slog" "log/slog"
"slices"
"sync" "sync"
"time" "time"
@ -18,8 +17,7 @@ type ImageContext struct {
// mu is required to be held when generating embeddings or accessing the cache // mu is required to be held when generating embeddings or accessing the cache
mu sync.Mutex mu sync.Mutex
clip *llama.ClipContext clip *llama.ClipContext
mllama *llama.MllamaContext
// cache of images to embeddings // cache of images to embeddings
images []imageCache images []imageCache
@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
var c ImageContext var c ImageContext
if arch == "clip" { if arch == "clip" {
c.clip, err = llama.NewClipContext(llamaContext, modelPath) c.clip, err = llama.NewClipContext(llamaContext, modelPath)
} else if arch == "mllama" {
c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
} else { } else {
return nil, fmt.Errorf("unknown vision model architecture: %s", arch) return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
} }
@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) {
if c.clip != nil { if c.clip != nil {
c.clip.Free() c.clip.Free()
} }
if c.mllama != nil {
c.mllama.Free()
}
} }
func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) { func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
if c == nil { if c == nil {
return nil, nil return nil, nil
} }
@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
embed, err := c.findImage(hash) embed, err := c.findImage(hash)
if err != nil { if err != nil {
if c.mllama != nil { if c.clip != nil {
embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
if err != nil {
return nil, err
}
} else if c.clip != nil {
embed, err = c.clip.NewEmbed(llamaContext, data) embed, err = c.clip.NewEmbed(llamaContext, data)
if err != nil { if err != nil {
return nil, err return nil, err
@ -105,33 +93,11 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
return 0 return 0
} }
// Mllama maps an image to 1 embedding token (llava creates many tokens)
// and doesn't support more than a single image per request.
// The embeddings are large (100 MB), so allocating a big batch can fail
// on some systems
if c.mllama != nil {
return 1
}
return configuredBatchSize return configuredBatchSize
} }
func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int { func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
if c != nil && c.mllama != nil { return llamaContext.Model().NEmbd()
return c.mllama.EmbedSize(llamaContext)
} else {
return llamaContext.Model().NEmbd()
}
}
func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
if c == nil || c.mllama == nil {
return false
}
return slices.ContainsFunc(inputs, func(input input) bool {
return input.embed != nil
})
} }
type imageCache struct { type imageCache struct {
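With the mllama branch removed, NewEmbed follows a single cache-then-compute path: hash the image bytes, look them up in the per-context cache while holding mu, and only call the clip projector on a miss. A hedged sketch of that flow, assuming the imports already present in this file; hashImage and cacheImage are placeholder names for cache helpers this hunk does not show:

func (c *ImageContext) newEmbedSketch(llamaContext *llama.Context, data []byte) ([][]float32, error) {
	if c == nil {
		return nil, nil
	}
	c.mu.Lock()
	defer c.mu.Unlock()

	hash := hashImage(data) // placeholder: a maphash over the raw image bytes
	if embed, err := c.findImage(hash); err == nil {
		return embed, nil // cache hit: reuse the embedding computed earlier
	}
	if c.clip == nil {
		return nil, errors.New("received image but vision model not loaded")
	}
	embed, err := c.clip.NewEmbed(llamaContext, data)
	if err != nil {
		return nil, err
	}
	cacheImage(hash, embed) // placeholder: store for subsequent requests
	return embed, nil
}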

View File

@ -57,10 +57,6 @@ type Sequence struct {
// input cache being used by this sequence // input cache being used by this sequence
cache *InputCacheSlot cache *InputCacheSlot
// does this sequence require cross-attention layers to be processed? - if we have seen
// an image for certain multi-modal models
crossAttention bool
// channel to send responses over // channel to send responses over
responses chan string responses chan string
@ -205,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
return nil, fmt.Errorf("invalid image index: %d", n) return nil, fmt.Errorf("invalid image index: %d", n)
} }
embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID) embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -368,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
defer s.mu.Unlock() defer s.mu.Unlock()
var batch *llama.Batch var batch *llama.Batch
crossAttention := false
seqIdx := s.nextSeq - 1 seqIdx := s.nextSeq - 1
for range s.seqs { for range s.seqs {
@ -416,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
batch = tokenBatch batch = tokenBatch
} else { } else {
batch = embedBatch batch = embedBatch
seq.crossAttention = s.image.NeedCrossAttention(input)
} }
} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention { } else if embedding != batch.IsEmbedding() {
s.nextSeq = seqIdx s.nextSeq = seqIdx
break break
} }
@ -427,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
break break
} }
crossAttention = seq.crossAttention
batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id) batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
seq.pendingInputs = append(seq.pendingInputs, input) seq.pendingInputs = append(seq.pendingInputs, input)
seq.iBatch = batch.NumTokens() - 1 seq.iBatch = batch.NumTokens() - 1
@ -440,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
return nil return nil
} }
s.lc.SetCrossAttention(crossAttention)
err := s.lc.Decode(batch) err := s.lc.Decode(batch)
if err != nil { if err != nil {
return fmt.Errorf("failed to decode batch: %w", err) return fmt.Errorf("failed to decode batch: %w", err)
} }
if crossAttention {
// synchronize state to ensure the cross attention batch is complete.
// needed specifically for multi-GPU systems otherwise an inflight
// task may be incorrectly invalidated causing a crash
s.lc.Synchronize()
}
for i, seq := range s.seqs { for i, seq := range s.seqs {
if seq == nil { if seq == nil {
continue continue
@ -622,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return return
} }
seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
s.seqs[i] = seq s.seqs[i] = seq
s.cond.Signal() s.cond.Signal()
found = true found = true

View File

@ -3,47 +3,32 @@ package server
import ( import (
"bytes" "bytes"
"context" "context"
"encoding/binary"
"errors" "errors"
"fmt" "fmt"
"log/slog" "log/slog"
"slices"
"strings" "strings"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/model/models/mllama"
"github.com/ollama/ollama/template" "github.com/ollama/ollama/template"
) )
type tokenizeFunc func(context.Context, string) ([]int, error) type tokenizeFunc func(context.Context, string) ([]int, error)
var errTooManyImages = errors.New("vision model only supports a single image per message")
// chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn. // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
// chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
// latest message and 2) system messages // latest message and 2) system messages
func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) { func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
var system []api.Message var system []api.Message
isMllama := checkMllamaModelFamily(m)
var imageNumTokens int
// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
if isMllama { // Clip images are represented as 768 tokens, each an embedding
// Our mllama implementation packs all of the embeddings into a single token imageNumTokens := 768
imageNumTokens = 1
} else {
// Clip images are represented as 768 tokens, each an embedding
imageNumTokens = 768
}
n := len(msgs) - 1 n := len(msgs) - 1
// in reverse, find all messages that fit into context window // in reverse, find all messages that fit into context window
for i := n; i >= 0; i-- { for i := n; i >= 0; i-- {
if isMllama && len(msgs[i].Images) > 1 {
return "", nil, errTooManyImages
}
// always include the last message // always include the last message
if i == n { if i == n {
continue continue
@ -84,48 +69,17 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
currMsgIdx := n currMsgIdx := n
for cnt, msg := range msgs[currMsgIdx:] { for cnt, msg := range msgs[currMsgIdx:] {
prefix := "" if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 {
imgPrompt := "" return "", nil, errors.New("this model only supports one image while more than one image requested")
}
var prefix string
prompt := msg.Content prompt := msg.Content
for _, i := range msg.Images { for _, i := range msg.Images {
var imgData llm.ImageData imgData := llm.ImageData{
ID: len(images),
if isMllama { Data: i,
if len(m.ProjectorPaths) == 0 {
imgData = llm.ImageData{
ID: len(images),
Data: i,
}
} else {
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
if err != nil {
return "", nil, err
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
return "", nil, err
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
return "", nil, fmt.Errorf("missing aspect ratio for image")
}
imgData = llm.ImageData{
ID: len(images),
Data: buf.Bytes(),
AspectRatioID: ar,
}
}
imgPrompt = "<|image|>"
} else {
imgData = llm.ImageData{
ID: len(images),
Data: i,
}
} }
imgTag := fmt.Sprintf("[img-%d]", imgData.ID) imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
@ -137,7 +91,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
images = append(images, imgData) images = append(images, imgData)
} }
msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt msgs[currMsgIdx+cnt].Content = prefix + prompt
} }
// truncate any messages that do not fit into the context window // truncate any messages that do not fit into the context window
@ -148,12 +102,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
return b.String(), images, nil return b.String(), images, nil
} }
func checkMllamaModelFamily(m *Model) bool {
for _, arch := range m.Config.ModelFamilies {
if arch == "mllama" {
return true
}
}
return false
}
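After this change chatPrompt handles every model family the same way: each attached image becomes an llm.ImageData keyed by its position, a "[img-N]" tag is spliced into the message text, and no aspect-ratio metadata is carried along. A hedged caller-side sketch, using only identifiers visible in this hunk (buildTurn is a made-up wrapper name):

func buildTurn(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (string, []llm.ImageData, error) {
	prompt, images, err := chatPrompt(ctx, m, tokenize, opts, msgs, tools)
	if err != nil {
		return "", nil, err
	}
	for _, img := range images {
		// img.ID matches the "[img-N]" tag in prompt; img.Data is the untouched
		// image payload, since preprocessing now happens in the ollama engine runner.
		_ = img
	}
	return prompt, images, nil
}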

View File

@ -2,8 +2,6 @@ package server
import ( import (
"bytes" "bytes"
"image"
"image/png"
"testing" "testing"
"github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp"
@ -14,10 +12,9 @@ import (
func TestChatPrompt(t *testing.T) { func TestChatPrompt(t *testing.T) {
type expect struct { type expect struct {
prompt string prompt string
images [][]byte images [][]byte
aspectRatioID int error error
error error
} }
tmpl, err := template.Parse(` tmpl, err := template.Parse(`
@ -28,28 +25,6 @@ func TestChatPrompt(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}} visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
createImg := func(width, height int) ([]byte, error) {
img := image.NewRGBA(image.Rect(0, 0, width, height))
var buf bytes.Buffer
if err := png.Encode(&buf, img); err != nil {
return nil, err
}
return buf.Bytes(), nil
}
imgBuf, err := createImg(5, 5)
if err != nil {
t.Fatal(err)
}
imgBuf2, err := createImg(6, 6)
if err != nil {
t.Fatal(err)
}
cases := []struct { cases := []struct {
name string name string
@ -227,90 +202,6 @@ func TestChatPrompt(t *testing.T) {
images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")}, images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")},
}, },
}, },
{
name: "messages with mllama (no images)",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
},
},
{
name: "messages with mllama single prompt",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
},
expect: expect{
prompt: "[img-0]<|image|>How many hotdogs are in this image? ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
},
{
name: "messages with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
},
{
name: "multiple messages with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
},
expect: expect{
prompt: "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf, imgBuf2},
aspectRatioID: 1,
},
},
{
name: "earlier image with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
{Role: "assistant", Content: "There are four hotdogs."},
{Role: "user", Content: "Which ones have mustard?"},
},
expect: expect{
prompt: "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
},
{
name: "too many images with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}},
},
expect: expect{
error: errTooManyImages,
},
},
} }
for _, tt := range cases { for _, tt := range cases {
@ -341,10 +232,6 @@ func TestChatPrompt(t *testing.T) {
if !bytes.Equal(images[i].Data, tt.images[i]) { if !bytes.Equal(images[i].Data, tt.images[i]) {
t.Errorf("expected %q, got %q", tt.images[i], images[i].Data) t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
} }
} else {
if images[i].AspectRatioID != tt.aspectRatioID {
t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID)
}
} }
} }
}) })

View File

@ -4,7 +4,6 @@ import (
"bytes" "bytes"
"cmp" "cmp"
"context" "context"
"encoding/binary"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
@ -35,7 +34,6 @@ import (
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil" "github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/model/models/mllama"
"github.com/ollama/ollama/openai" "github.com/ollama/ollama/openai"
"github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/server/internal/registry" "github.com/ollama/ollama/server/internal/registry"
@ -100,6 +98,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
return nil, nil, nil, err return nil, nil, nil, err
} }
if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 {
return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
}
if err := model.CheckCapabilities(caps...); err != nil { if err := model.CheckCapabilities(caps...); err != nil {
return nil, nil, nil, fmt.Errorf("%s %w", name, err) return nil, nil, nil, fmt.Errorf("%s %w", name, err)
} }
@ -206,38 +208,14 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return return
} }
isMllama := checkMllamaModelFamily(m) if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
if isMllama && len(req.Images) > 1 { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"})
return return
} }
images := make([]llm.ImageData, len(req.Images)) images := make([]llm.ImageData, len(req.Images))
for i := range req.Images { for i := range req.Images {
if isMllama && len(m.ProjectorPaths) > 0 { images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
return
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
return
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
return
}
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
} else {
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
}
} }
prompt := req.Prompt prompt := req.Prompt
@ -269,9 +247,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
for _, i := range images { for _, i := range images {
imgPrompt := "" imgPrompt := ""
if isMllama {
imgPrompt = "<|image|>"
}
msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)}) msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
} }
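The generate path mirrors the chat path: the only mllama-specific behavior left in the server is the single-image guard, and image bytes are forwarded untouched to the runner. A condensed sketch of the handler logic shown above, not new behavior:

if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
	c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
	return
}

images := make([]llm.ImageData, len(req.Images))
for i := range req.Images {
	images[i] = llm.ImageData{ID: i, Data: req.Images[i]} // raw bytes; any resizing/tiling happens in the runner
}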

View File

@ -8,6 +8,7 @@ import (
"os" "os"
"reflect" "reflect"
"runtime" "runtime"
"slices"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
continue continue
} }
numParallel := int(envconfig.NumParallel()) numParallel := int(envconfig.NumParallel())
// TODO (jmorganca): mllama doesn't support parallel yet // `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// see https://github.com/ollama/ollama/issues/4165 // ref: https://github.com/ollama/ollama/issues/4165
if checkMllamaModelFamily(pending.model) && numParallel != 1 { if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1 numParallel = 1
slog.Warn("mllama doesn't support parallel requests yet") slog.Warn("mllama does not currently support parallel requests")
} }
for { for {
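The scheduler keeps its guard: mllama's encoder cache is not safe to share across slots, so parallelism is clamped to a single sequence regardless of the configured value. A condensed sketch of the clamp as it reads after this change (the environment variable note is an assumption about how envconfig is typically driven):

numParallel := int(envconfig.NumParallel()) // typically set via OLLAMA_NUM_PARALLEL
if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
	// the encoder cache assumes one sequence at a time, see https://github.com/ollama/ollama/issues/4165
	numParallel = 1
	slog.Warn("mllama does not currently support parallel requests")
}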